In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

# ==========================================================
# 1. LOAD DATA (Colab Upload)
# ==========================================================

import io

from google.colab import files

# Colab renames an upload whose name collides with an existing file
# (e.g. "auto-mpg (1).csv"), so reading a hard-coded "auto-mpg.csv" can
# silently load an older copy. Parse the bytes returned by the upload
# dialog instead of relying on the on-disk file name.
uploaded = files.upload()

if uploaded:
    # files.upload() returns a dict of file contents as bytes; when several
    # files are selected, only the first one is used.
    uploaded_bytes = next(iter(uploaded.values()))
    raw_df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded: fall back to a copy already in the working directory.
    raw_df = pd.read_csv("auto-mpg.csv")

# Data cleaning
# '?' is treated as the missing-value marker. Build a new frame rather than
# mutating in place so `raw_df` stays available and this step is re-runnable.
df = raw_df.replace('?', np.nan)
df['horsepower'] = pd.to_numeric(df['horsepower'])
df = df.dropna()

# The car name column is not used as a predictor; errors='ignore' makes this
# a no-op when the column is absent.
df = df.drop(columns=['car name'], errors='ignore')

# ==========================================================
# 2. DEFINE FEATURES AND TARGET
# ==========================================================

# The target is the 'mpg' column; every other remaining column is a predictor.
y = df['mpg']
X = df.drop('mpg', axis=1)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ==========================================================
# 3. BACKWARD ELIMINATION (Using Cross-Validation)
# ==========================================================

def _mean_cv_r2(feature_subset):
    """Return the mean 5-fold cross-validated R2 of a linear regression.

    The model is fitted on the training split only, restricted to the
    columns named in `feature_subset`.
    """
    cv_scores = cross_val_score(
        LinearRegression(),
        X_train[feature_subset],
        y_train,
        cv=5,
        scoring='r2'
    )
    return np.mean(cv_scores)


selected_features = list(X.columns)

# Score the full feature set first. Starting from -inf instead would make the
# first removal unconditional, even when dropping a feature lowers the CV score.
best_score = _mean_cv_r2(selected_features)

print("Backward Feature Elimination Process:\n")
print(f"Baseline (all features), CV R2: {best_score:.4f}")

while len(selected_features) > 1:

    # For each candidate, score the model that results from dropping it.
    scores = [
        (_mean_cv_r2([kept for kept in selected_features if kept != feature]), feature)
        for feature in selected_features
    ]

    # The candidate whose removal yields the highest score is the least useful.
    current_best_score, worst_feature = max(scores)

    # Stop as soon as no single removal strictly improves on the best score.
    if current_best_score <= best_score:
        break

    best_score = current_best_score
    selected_features.remove(worst_feature)
    print(f"Removed: {worst_feature}, CV R2: {best_score:.4f}")

# ==========================================================
# 4. FINAL MODEL EVALUATION (ON TEST SET)
# ==========================================================

# Refit on the full training split using only the surviving features.
train_matrix = X_train[selected_features]
final_model = LinearRegression().fit(train_matrix, y_train)

# Score once on the held-out test split, which the selection loop never used.
test_matrix = X_test[selected_features]
y_pred = final_model.predict(test_matrix)
final_r2 = r2_score(y_test, y_pred)

print("\nFinal Selected Features:", selected_features, sep="\n")

print(f"\nFinal Test R2 Score: {final_r2:.4f}")


Saving auto-mpg.csv to auto-mpg (1).csv
Backward Feature Elimination Process:

Removed: acceleration, CV R2: 0.8129
Removed: horsepower, CV R2: 0.8147
Removed: cylinders, CV R2: 0.8152
Removed: displacement, CV R2: 0.8166

Final Selected Features:
['weight', 'model year', 'origin']

Final Test R2 Score: 0.7828


In [None]:
_