In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sqlalchemy import create_engine
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import numpy as np
import sys, os
# Local checkout of the project. When it exists, make its modules importable
# and resolve relative paths against it. On machines where the checkout is
# absent (e.g. a hosted kernel) skip the step instead of crashing in os.chdir.
this_path = '/home/ibi/Documents/GitHub/ady201m-project'
if os.path.isdir(this_path):
    # Guard against piling up duplicate entries when the cell is re-run.
    if this_path not in sys.path:
        sys.path.append(this_path)
    os.chdir(this_path)

In [None]:
# Alternative source (disabled): read the same table from the project's SQLite file.
# engine = create_engine('sqlite:///src/db/vehicles.db')
# df = pd.read_sql_query('SELECT * FROM vehicles', engine)

# Active source: the cleaned training CSV; preview the first rows as the cell output.
cleaned_csv_path = '/kaggle/input/cars-data/train-data-cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df.head()

In [None]:
# Regression target.
y = df['Price']
# Predictors: every column except the target and the 'Name' column.
X = df.drop(['Name', 'Price'], axis=1)

# Preview both as a (features, target) tuple.
X.head(), y.head()

In [None]:
# Feature groups; they decide which columns get scaled and which get one-hot encoded.
numerical_columns = ['Year', 'Kilometers_Driven', 'Mileage(kmpl)', 'Engine (CC)', 'Power (bhp)', 'Seats']
categorical_columns = ['Automaker', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

# Candidate regressors, each paired with the hyper-parameter grid to search.
# Grid keys carry the `model__` prefix because the estimator is the pipeline
# step named 'model'. An empty grid means "fit with defaults, no search".
# Estimators with internal randomness are seeded (42, the same seed used for the
# train/test splits) so that repeated runs produce the same scores and the same
# "best" model.
models = {
    'LinearRegression': (LinearRegression(), {}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(random_state=42), {'model__max_depth': [3, 5, 7]}),
    'RandomForestRegressor': (RandomForestRegressor(random_state=42), {'model__n_estimators': [50, 100, 200], 'model__max_depth': [3, 6, 9]}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.01, 0.1, 0.2]}),
    'SVR': (SVR(), {'model__C': [0.1, 1, 10], 'model__gamma': ['scale', 'auto']}),
    'KNeighborsRegressor': (KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    'XGBRegressor': (XGBRegressor(random_state=42), {'model__n_estimators': [50, 100, 200], 'model__max_depth': [3, 6, 9], 'model__learning_rate': [0.01, 0.1, 0.2]}),
}

In [None]:
def build_preprocessor(selected_features):
    """Build the column-wise preprocessing step for a subset of features.

    Args:
        selected_features: iterable of column names to feed to the model.

    Returns:
        An unfitted ColumnTransformer that standard-scales the selected names
        found in `numerical_columns` and one-hot encodes the selected names
        found in `categorical_columns` (categories unseen during fit are
        ignored at transform time). Names in neither list are not routed to
        any transformer.
    """
    def _pick(known_columns):
        # Keep the caller's ordering; retain only names belonging to the group.
        return [name for name in selected_features if name in known_columns]

    numeric_subset = _pick(numerical_columns)
    categorical_subset = _pick(categorical_columns)

    scaling_step = ('num', StandardScaler(), numeric_subset)
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_subset)
    return ColumnTransformer(transformers=[scaling_step, encoding_step])

In [None]:
# Evaluating models
# For every prefix of `feature_list` (first 1 feature, first 2, ...), each
# candidate model is trained on one shared training split and scored on the
# matching held-out split. All four metrics for a model are computed from that
# model's own predictions on the same rows, so the numbers are comparable
# across models and feature counts.
feature_list = numerical_columns + categorical_columns
results = {model_name: [] for model_name in models.keys()}
best_estimators = {}
metrics = ['r2_score', 'mean_absolute_error', 'mean_squared_error', 'root_mean_squared_error']
metrics_results = {metric: {model_name: [] for model_name in models.keys()} for metric in metrics}

for i in range(1, len(feature_list) + 1):
    selected_features = feature_list[:i]
    X_selected = X[selected_features]

    print(f"Selected features ({i}): {selected_features}")

    # Split once per feature subset, before any fitting, so that:
    #   * hyper-parameter search never sees the held-out rows, and
    #   * every model is evaluated on exactly the same test rows.
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    preprocessor = build_preprocessor(selected_features)

    for model_name, (model, params) in tqdm(models.items(), desc=f"Evaluating models with {i} features"):
        # k='all' keeps every feature; the selector step is a pass-through.
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('selector', SelectKBest(score_func=f_regression, k='all')),
                                   ('model', model)])

        if params:
            # 5-fold CV on the training split only; with the default refit the
            # best configuration is retrained on the whole training split.
            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        else:
            # Nothing to tune: fit the pipeline as-is. This fits the shared
            # `preprocessor` object in place (the grid search works on copies).
            pipeline.fit(X_train, y_train)
            best_model = pipeline

        # Score this model with its own hold-out predictions. Previously the
        # tuned models reused `y_test`/`y_pred` left over from another model.
        y_pred = best_model.predict(X_test)
        best_score = r2_score(y_test, y_pred)

        # NOTE: overwritten on every feature count, so after the loop this
        # holds each model as trained on the full feature list.
        best_estimators[model_name] = best_model

        results[model_name].append(best_score)
        metrics_results['r2_score'][model_name].append(best_score)
        metrics_results['mean_absolute_error'][model_name].append(mean_absolute_error(y_test, y_pred))
        mse = mean_squared_error(y_test, y_pred)
        metrics_results['mean_squared_error'][model_name].append(mse)
        metrics_results['root_mean_squared_error'][model_name].append(np.sqrt(mse))


In [None]:
# Persist the preprocessor left over from the last evaluated feature subset.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

# The winner is the model whose best R² over all feature counts is highest.
peak_r2_by_model = {name: max(scores) for name, scores in results.items()}
best_model_name = max(peak_r2_by_model, key=peak_r2_by_model.get)

# Persist the stored estimator of the winning model under a name-tagged file.
with open(f'best_model_{best_model_name}.pkl', 'wb') as model_file:
    pickle.dump(best_estimators[best_model_name], model_file)


In [None]:
# Summary table: one row per model, each metric aggregated over all feature counts.
evaluated_models = list(results.keys())


def _mean_over_runs(metric_key):
    """Average the stored values of `metric_key` for each model, in table row order."""
    per_model = metrics_results[metric_key]
    return [pd.Series(per_model[name]).mean() for name in evaluated_models]


summary_columns = {}
summary_columns['Model'] = evaluated_models
summary_columns['Mean R²'] = [pd.Series(results[name]).mean() for name in evaluated_models]
summary_columns['Max R²'] = [pd.Series(results[name]).max() for name in evaluated_models]
summary_columns['Mean Absolute Error'] = _mean_over_runs('mean_absolute_error')
summary_columns['Mean Squared Error'] = _mean_over_runs('mean_squared_error')
summary_columns['Root Mean Squared Error'] = _mean_over_runs('root_mean_squared_error')
results_df = pd.DataFrame(summary_columns)

print("Results dataframe:")
results_df

In [None]:
# One line per model: R² as a function of how many leading features were used.
feature_counts = range(1, len(feature_list) + 1)

fig, ax = plt.subplots(figsize=(14, 8))
for model_name, r2_scores in results.items():
    ax.plot(feature_counts, r2_scores, label=model_name)

ax.set_xlabel('Number of Features')
ax.set_ylabel('R² Score')
ax.set_title('Model Performance with Increasing Number of Features')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Real-vs-predicted scatter for the selected best model.
best_model = best_estimators[best_model_name]
# Same seed and test fraction as the evaluation cell.
# NOTE(review): this is an unbiased hold-out check only if `best_model` was
# fitted without these rows - verify against the training cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_pred = best_model.predict(X_test)

plt.figure(figsize=(14, 8))
plt.scatter(y_test, y_pred)
# Dashed diagonal = perfect prediction (predicted == real).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Real Price')
plt.ylabel('Predicted Price')
# Title follows the model actually selected instead of a hard-coded name.
plt.title(f'Real vs Predicted Price for {best_model_name}')

plt.show()