<div align="center" style="font-family: 'Consolas', monospace;"><h1> Model Training For Car Price Predictor </h1> </div>

<p align = "center" style="font-family: 'Consolas', monospace;"> The purpose of this notebook is to train a suitable regression model. I will regularize the data, apply scaling, build a pipeline, and finally find the best model.</p>

<br><ul> <li style="font-family: 'Consolas', monospace;">Importing Necessary Libraries</li></ul>

In [6]:
import os

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

<ul> <li style="font-family: 'Consolas', monospace;">Loading and Preparing Data</li></ul>

In [7]:
# Load the cleaned dataset that the models are trained on.
# NOTE(review): the file is named "Cleaned_Test_Set" but is used here as the
# training data — confirm this is the intended file.
df = pd.read_csv('data/Cleaned_Test_Set.csv')

# Split the frame into predictors (every column except the target) and the target.
X = df.drop('price', axis=1)
y = df['price']

# Preview the first rows. A bare expression is only rendered by the notebook when
# it is the last statement of the cell, so the preview lives at the end.
df.head()

<ul> <li style="font-family: 'Consolas', monospace;">Build Pipeline</li></ul>

In [8]:
# Defining Variables
# Categorical feature columns that get one-hot encoded. Every name must appear
# exactly once: a repeated name would be handed to the encoder twice and produce
# duplicated (perfectly collinear) one-hot features.
categories = ['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
              'enginetype', 'fuelsystem', 'cylindernumber',
              'Company']
# Every remaining column of X is routed to the scaler.
# NOTE(review): assumes all non-categorical columns are numeric — confirm against the CSV.
numericals = [col for col in X.columns if col not in categories]

# Preprocessing pipeline: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept categories at
# predict time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numericals),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categories)
    ])

# Building ML pipeline. The 'regressor' step is a placeholder: the grid search
# swaps in other estimators under this step name.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
    ])

<ul> <li style="font-family: 'Consolas', monospace;">Training Using GridSearch (Multiple Algorithms)</li></ul>

In [None]:
# Search space for the grid search. Each entry is an independent sub-grid that
# first replaces the pipeline's 'regressor' step with a candidate estimator and
# then lists the hyper-parameter values to try for that estimator
# ('regressor__<name>' addresses a parameter of the 'regressor' step).
param_grid = [
    # ElasticNet (hybrid of Lasso and Ridge penalties)
    dict(
        regressor=[ElasticNet(random_state=42)],
        regressor__alpha=np.logspace(-3, 3, 13),   # regularization strength
        regressor__l1_ratio=[0.1, 0.5, 0.9],       # mix between L1 (Lasso) and L2 (Ridge)
    ),
    # Random Forest Regressor
    dict(
        regressor=[RandomForestRegressor(random_state=42)],
        regressor__n_estimators=[50, 100, 200],    # number of trees
        regressor__max_depth=[None, 10, 20],       # maximum depth of each tree
        regressor__min_samples_split=[2, 5, 10],   # minimum samples needed to split a node
    ),
    # Gradient Boosting Regressor
    dict(
        regressor=[GradientBoostingRegressor(random_state=42)],
        regressor__n_estimators=[50, 100, 200],    # number of boosting stages
        regressor__learning_rate=[0.01, 0.1, 0.2], # shrinkage factor
        regressor__max_depth=[3, 5, 10],           # tree depth
    ),
    # Support Vector Regression (SVR), linear and non-linear kernels
    dict(
        regressor=[SVR()],
        regressor__kernel=["linear", "rbf"],       # linear for simple, RBF for non-linear fits
        regressor__C=np.logspace(-2, 2, 5),        # regularization parameter
        regressor__epsilon=[0.01, 0.1, 0.5],       # tolerance for error
    ),
    # Linear SVR (linear-only variant of SVR)
    dict(
        regressor=[LinearSVR(max_iter=10000)],
        regressor__C=np.logspace(-2, 2, 5),        # regularization parameter
        regressor__epsilon=[0.01, 0.1, 0.5],       # tolerance for error
    ),
    # K-Nearest Neighbors Regressor (KNN)
    dict(
        regressor=[KNeighborsRegressor()],
        regressor__n_neighbors=[3, 5, 10],         # number of neighbors
        regressor__weights=["uniform", "distance"],  # uniform vs. distance-based weighting
        regressor__p=[1, 2],                       # 1 = Manhattan (L1), 2 = Euclidean (L2)
    ),
    # Ridge regression
    dict(
        regressor=[Ridge()],
        regressor__alpha=np.logspace(-5, 5, 13),   # alpha from 10^-5 to 10^5
    ),
    # Lasso regression
    dict(
        regressor=[Lasso()],
        regressor__alpha=np.logspace(-5, 5, 13),   # alpha from 10^-5 to 10^5
    ),
]

# Exhaustive search over every sub-grid with 7-fold cross-validation, using all
# available CPU cores. No `scoring` is given, so the estimator's default score is used.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=7,
    n_jobs=-1,
)

# Fit on the full dataset; the fitted search object is the cell's output.
model.fit(X, y)

<ul> <li style="font-family: 'Consolas', monospace;">Saving the best model</li></ul>

In [None]:
# Save the best model from GridSearchCV
model_path = "models/model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# best_estimator_ is the full pipeline (preprocessing + winning regressor)
# refitted by the grid search.
joblib.dump(model.best_estimator_, model_path)
print("Model saved successfully!")