# ***Model Training Using Advanced Machine Learning Models***

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import json
import joblib

In [2]:
# load preprocessed data set
if os.path.exists('../data/preprocessed_BMW_sales_data.csv'):
    df = pd.read_csv('../data/preprocessed_BMW_sales_data.csv')
else:
    raise FileNotFoundError('File Not Found! Please check file path and try again!')

In [3]:
# target output
y = np.log1p(df['Price_USD'])

#input features
x = df.drop(columns='Price_USD').copy()

🎢 ***`Baseline Models`***

In [4]:
# split data into training and test set
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size= 0.3
)
x_train.to_csv('../data/x_train.csv',index=False)
x_test.to_csv('../data/x_test.csv',index=False)
np.expm1(y_train).to_csv('../data/y_train.csv', index= False)
np.expm1(y_test).to_csv('../data/y_test.csv', index=False)

In [5]:
numeric_cols = x_train.select_dtypes(include='number').columns
categorical_cols = x_train.select_dtypes(include='object').columns

# preprocess data
preprocessor = ColumnTransformer(transformers=[
    ('scaler',StandardScaler(),numeric_cols),
    ('onehot',OneHotEncoder(handle_unknown='ignore'),categorical_cols)
],verbose_feature_names_out=False)

# cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [6]:

# baseline models
names_model = {
    'linear_regression' : LinearRegression(),
    'ridge' : Ridge(random_state=42),
    'lasso' : Lasso(random_state=42),
    'decision_tree' : DecisionTreeRegressor(random_state=42),
    'random_forest' : RandomForestRegressor(random_state=42),
    'xgboost' : XGBRegressor(random_state=42)
}

results = {}
for names, models in names_model.items():
    print(f'Training {names}. This May Take A While ...')
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',models)
    ])

    # model training
    scores = cross_val_score(pipe,x_train,y_train,cv=cv,
                            scoring='neg_mean_squared_error',n_jobs=-1,verbose=2)

    # store results
    rmse = np.sqrt(-scores)
    results[names] = {
        'RMSE scores across fold' : rmse,
        'Mean RMSE score' : np.mean(rmse),
        'Std RMSE score' : np.std(rmse)
    }

    # with open('../models/baseline_results.json','w') as file:
    #     json.dump(results,file,indent=4)

    print('-'*50)
# print out results
for name, result in results.items():
        print(f"Model: {name}")
        print(f"RMSE scores across folds: {result['RMSE scores across fold']}")
        print(f"Mean RMSE: {result['Mean RMSE score']:.2f}")
        print(f"Std RMSE: {result['Std RMSE score']:.2f}")
        print('-'*50)


Training linear_regression. This May Take A While ...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.7s remaining:   14.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training ridge. This May Take A While ...


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training lasso. This May Take A While ...


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training decision_tree. This May Take A While ...


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.0s remaining:   18.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training random_forest. This May Take A While ...


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 18.7min remaining: 28.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 18.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training xgboost. This May Take A While ...
--------------------------------------------------
Model: linear_regression
RMSE scores across folds: [0.3836163  0.38233811 0.38135026 0.38298339 0.38033372]
Mean RMSE: 0.38
Std RMSE: 0.00
--------------------------------------------------
Model: ridge
RMSE scores across folds: [0.38361618 0.38233801 0.3813502  0.38298332 0.38033363]
Mean RMSE: 0.38
Std RMSE: 0.00
--------------------------------------------------
Model: lasso
RMSE scores across folds: [0.38333273 0.38214277 0.38131874 0.38275934 0.38026021]
Mean RMSE: 0.38
Std RMSE: 0.00
--------------------------------------------------
Model: decision_tree
RMSE scores across folds: [0.54720111 0.5399726  0.54257667 0.55567281 0.55082695]
Mean RMSE: 0.55
Std RMSE: 0.01
--------------------------------------------------
Model: random_forest
RMSE scores across folds: [0.38977621 0.38784368 0.38679242 0.38858635 0.38565077]
Mean RMSE: 0.39
St

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished


***`Hyper-parameter Tuning and ensemble methods`***

In [7]:
models_and_params = {
    'Ridge' : {
        'model' : Ridge(random_state=42),
        'params' : {
            'model__max_iter' : [1000,2000],
            'model__alpha' : [0.1,0.5,1]
        }
    },
    'Lasso' : {
        'model' : Lasso(random_state=42),
        'params' : {
            'model__max_iter' : [1000,200],
            'model__alpha' : [0.1,0.5,1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'params' : {
            'model__n_estimators' : [100,120],
            'model__max_depth' : [2,4,None],
            'model__min_samples_split' : [2,4],
            'model__min_samples_leaf' : [3,5]
        }
    },
    'XGBRegressor' : {
        'model' : XGBRegressor(objective='reg:squarederror',random_state = 42),
        'params' : {
            'model__n_estimators' : [100,120],
            'model__max_depth' : [2,4,6],
            'model__learning_rate' : [0.1,0.5],
            'model__reg_alpha' : [0.1,0.5] 
        }
    }
}

model_name = None
model_estimator = None
model_best_score = -float('inf')

results = {}
for name,models in models_and_params.items():
    print('Training Model ...',name)
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',models['model'])
    ])

    gs = GridSearchCV(
        estimator=pipe,
        param_grid=models['params'],
        cv = cv,
        refit=True,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )

    # model training
    gs.fit(x_train,y_train)

    results[name] = {
        'Best Score' : gs.best_score_,
        'Best params' : gs.best_params_
    }

    if gs.best_score_ > model_best_score:
        model_best_score = gs.best_score_
        model_name = name
        model_estimator = gs.best_estimator_

    joblib.dump(model_estimator,f'../models/{name}_best_estimator.pkl')

    with open('model_result.json','w') as file:
        json.dump(results,file,indent=4)

    print('-'*50)

for name, result in results.items():
    print('Model : ',name)
    print('Best Score : ',result['Best Score'])
    print('Best Parameters : ',result['Best params'])

Training Model ... Ridge
--------------------------------------------------
Training Model ... Lasso
--------------------------------------------------
Training Model ... Random Forest
--------------------------------------------------
Training Model ... XGBRegressor
--------------------------------------------------
Model :  Ridge
Best Score :  -0.14602032047103355
Best Parameters :  {'model__alpha': 1, 'model__max_iter': 1000}
Model :  Lasso
Best Score :  -0.14589671903500806
Best Parameters :  {'model__alpha': 0.1, 'model__max_iter': 1000}
Model :  Random Forest
Best Score :  -0.14592855371351746
Best Parameters :  {'model__max_depth': 2, 'model__min_samples_leaf': 3, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Model :  XGBRegressor
Best Score :  -0.14613778508159153
Best Parameters :  {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__n_estimators': 100, 'model__reg_alpha': 0.1}
