In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

# ==========================================================
# 1. LOAD DATA (Google Colab Upload)
# ==========================================================

import io
from google.colab import files

# Name the dataset is expected to have, both in the upload and on disk.
DATA_FILENAME = 'auto-mpg.csv'

# files.upload() returns a dict mapping each uploaded file's name to its bytes.
uploaded = files.upload()

if uploaded:
    # Parse the bytes that were just uploaded instead of re-opening a fixed
    # path. When a file with the same name already exists, Colab stores the
    # new upload under a different name (e.g. 'auto-mpg (1).csv'), so reading
    # 'auto-mpg.csv' from disk would silently load the older copy.
    uploaded_name = (
        DATA_FILENAME if DATA_FILENAME in uploaded else next(iter(uploaded))
    )
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy already
    # present in the working directory, as before.
    df = pd.read_csv(DATA_FILENAME)

# ==========================================================
# 2. DATA PREPROCESSING
# ==========================================================

# Remove the non-numeric 'car name' column first (if it exists). It is not
# used as a feature, so a missing or '?' value in it must not cause an
# otherwise complete row to be discarded by dropna() below.
df = df.drop(columns=['car name'], errors='ignore')

# Replace the '?' placeholder with NaN. Rebinding (rather than inplace=True)
# avoids mutating the loaded frame in place.
df = df.replace('?', np.nan)

# Convert horsepower to numeric; any other non-numeric token still raises,
# so unexpected data is not silently dropped.
df['horsepower'] = pd.to_numeric(df['horsepower'])

# Drop rows that still contain a missing value
df = df.dropna()

# ==========================================================
# 3. DEFINE FEATURES AND TARGET
# ==========================================================

# Target is the 'mpg' column; every other column is a candidate feature
y = df['mpg']
X = df.drop(columns='mpg')

# Hold out 20% of the rows for the final test (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ==========================================================
# 4. FORWARD FEATURE SELECTION (Using Cross-Validation)
# ==========================================================

# Greedy search: start with no features and, each round, add the single
# candidate that gives the highest mean cross-validated R² on the training
# split. Stop as soon as the best candidate fails to beat the current score.
selected_features = []
remaining_features = X.columns.tolist()
best_score = float('-inf')

print("Forward Feature Selection Process:\n")

while len(remaining_features) > 0:

    # Collect (mean CV R², feature) for every candidate still available
    scores = []
    for feature in remaining_features:
        features_to_test = [*selected_features, feature]
        model = LinearRegression()

        # 5-fold cross-validation, training data only
        cv_scores = cross_val_score(
            model, X_train[features_to_test], y_train, scoring='r2', cv=5
        )

        score = np.mean(cv_scores)
        scores.append((score, feature))

    # Highest R² first; the top entry is this round's winner
    scores = sorted(scores, reverse=True)
    current_best_score, best_feature = scores[0]

    # No improvement over the current best: stop searching
    if not (current_best_score > best_score):
        break

    best_score = current_best_score
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

    print(f"Added: {best_feature}, CV R2: {best_score:.4f}")

# ==========================================================
# 5. FINAL MODEL EVALUATION (ON TEST SET)
# ==========================================================

# Refit on the whole training split using only the selected columns
final_model = LinearRegression().fit(X_train[selected_features], y_train)

# Score once on the held-out test split
y_pred = final_model.predict(X_test[selected_features])
final_r2 = r2_score(y_test, y_pred)

print("\nFinal Selected Features:", selected_features, sep="\n")
print(f"\nFinal Test R2 Score: {final_r2:.4f}")


Saving auto-mpg.csv to auto-mpg (1).csv
Forward Feature Selection Process:

Added: weight, CV R2: 0.6962
Added: model year, CV R2: 0.8037
Added: origin, CV R2: 0.8166

Final Selected Features:
['weight', 'model year', 'origin']

Final Test R2 Score: 0.7828
