# Model Comparison and Selection
This notebook loads the dataset, trains several regression models, compares their performance on a held-out test set (R2 score and MSE), selects the model with the highest R2, saves it to disk, and plots the comparison.

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load Data
# The raw frame is kept under its own name so the cleaning steps below can be
# re-run without re-reading the CSV.
raw_df = pd.read_csv('auto-mpg.csv')

# Preprocessing, as one chain:
#   1. '?' entries in horsepower become NaN,
#   2. every row that contains a NaN (in any column) is dropped,
#   3. horsepower is cast to float,
#   4. the 'car name' column is removed.
df = (
    raw_df
    .assign(horsepower=raw_df['horsepower'].replace('?', np.nan))
    .dropna()
    .astype({'horsepower': float})
    .drop(columns='car name')
)

# mpg is the target; every remaining column is used as a feature.
y = df['mpg']
X = df.drop(columns='mpg')

# Split Data
# 20% of the rows are held out for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Define Models
# Candidate regressors keyed by the label used in the results table.
# All hyperparameters are library defaults; estimators that accept a seed
# get random_state=42 so repeated runs produce the same scores.
# NOTE(review): features are passed in unscaled, which may disadvantage the
# scale-sensitive estimators (SVR, KNN, the regularised linear models).
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'BayesianRidge': BayesianRidge(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(),
}


def _fit_and_score(estimator):
    """Fit `estimator` on the training split and return (MSE, R2) on the test split."""
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Train and Evaluate
# One record per model is collected in `results`; a line is printed as each model finishes.
results = []
for name, model in models.items():
    mse, r2 = _fit_and_score(model)
    results.append(dict(Model=name, MSE=mse, R2=r2))
    print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}")

In [None]:
# Create DataFrame of Results
# Build the table from the per-model records and order it best-first (highest R2 on top).
results_df = pd.DataFrame(results).sort_values(by='R2', ascending=False)
print("\nModel Ranking:", results_df, sep="\n")

In [None]:
# Select Best Model
# results_df is sorted by R2 in descending order, so its first row is the winner.
best_model_name = results_df.iloc[0]['Model']
print(f"\nBest Model: {best_model_name}")

best_model = models[best_model_name]
# Refit on the full dataset before saving. After this call the estimator has
# seen the test rows too, so the MSE/R2 reported earlier describe the
# train-split fit, not the exact object that is pickled below.
best_model.fit(X, y)

# Save Best Model
model_dir = '../models'
model_path = f'{model_dir}/best_regression_model.pkl'
os.makedirs(model_dir, exist_ok=True)
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() previously used here leaked the handle.
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)
# Report the path that was actually written (relative to the working directory).
print(f"Best model saved to {model_path}")

In [None]:
# Plot R2 Scores
# Drawn with matplotlib directly: the notebook never imports seaborn, so the
# previous sns.barplot call raised NameError on a fresh kernel.
fig, ax = plt.subplots(figsize=(12, 6))

# One viridis colour per bar, sampled evenly across the colormap.
bar_colors = plt.get_cmap('viridis')(np.linspace(0, 1, len(results_df)))
ax.barh(results_df['Model'], results_df['R2'], color=bar_colors)

# barh draws the first row at the bottom; flip the axis so the first row of
# results_df (the highest R2) appears at the top.
ax.invert_yaxis()

ax.set_xlabel('R2 Score')
ax.set_ylabel('Model')
ax.set_title('Regression Model Comparison')
plt.show()