# ***Model Training Using Advanced Machine Learning Models***

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error,mean_absolute_error,r2_score

In [2]:
# Read the preprocessed data set, failing early with a clear error
# when the CSV is not found at the expected relative path.
data_path = '../data/preprocessed_BMW_sales_data.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError('File Not Found! Please check file path and try again!')
df = pd.read_csv(data_path)

In [3]:
# Separate the predictors from the prediction target.
# x: an independent copy of every column except `Price_USD`
x = df.drop(columns=['Price_USD']).copy()

# y: the `Price_USD` column, used as the regression target
y = df['Price_USD']

***`Baseline Models`***

In [None]:
# split data into training and test set
# A fixed seed keeps the split (and every score computed from it) identical
# across "Restart & Run All"; the CV splitter and models below are seeded too.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Column groups are derived from the training frame's dtypes.
numeric_cols = x_train.select_dtypes(include='number').columns
categorical_cols = x_train.select_dtypes(include='object').columns

# preprocess data: standardise numeric columns, one-hot encode categoricals.
# handle_unknown='ignore' lets a fold contain categories unseen during fit.
preprocessor = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), numeric_cols),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# cross-validation splitter shared by the baseline and tuning experiments
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# baseline models, all with default hyper-parameters
names_model = {
    'linear_regression': LinearRegression(),
    'ridge': Ridge(random_state=42),
    'lasso': Lasso(random_state=42),
    'decision_tree': DecisionTreeRegressor(random_state=42),
    'random_forest': RandomForestRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

results = {}
for names, models in names_model.items():
    # Preprocessing lives inside the pipeline so it is re-fitted on each
    # training fold only.
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', models)
    ])

    # model training: 5-fold CV on the training split only; the test split
    # is left untouched here.
    scores = cross_val_score(pipe, x_train, y_train, cv=cv,
                             scoring='neg_mean_squared_error', n_jobs=-1)

    # store results: scores are negated MSE, so flip the sign before the root
    rmse = np.sqrt(-scores)
    results[names] = {
        'RMSE scores across fold': rmse,
        'Mean RMSE score': np.mean(rmse),
        'Std RMSE score': np.std(rmse)
    }

# print out results
# The former `root_mean_squared_error()` call was removed: it was invoked
# without y_true / y_pred and raised TypeError on the first iteration.
for name, result in results.items():
    print(f"Model: {name}")
    print(f"RMSE scores across folds: {result['RMSE scores across fold']}")
    print(f"Mean RMSE: {result['Mean RMSE score']:.2f}")
    print(f"Std RMSE: {result['Std RMSE score']:.2f}")
    print('-'*50)


Model: linear_regression
RMSE scores across folds: [25974.20386338 26084.20045834 26356.56067719 25962.64752245
 25840.90575076]
Mean RMSE: 26043.70
Std RMSE: 174.38
--------------------------------------------------
Model: ridge
RMSE scores across folds: [25974.19845611 26084.19248159 26356.55424826 25962.64219839
 25840.89971133]
Mean RMSE: 26043.70
Std RMSE: 174.38
--------------------------------------------------
Model: lasso
RMSE scores across folds: [25974.02387605 26083.79566351 26356.40749646 25962.39878906
 25840.55086065]
Mean RMSE: 26043.44
Std RMSE: 174.43
--------------------------------------------------
Model: decision_tree
RMSE scores across folds: [36952.1943596  37112.27725816 37141.67817411 37656.92436028
 37222.78824451]
Mean RMSE: 37217.17
Std RMSE: 236.78
--------------------------------------------------
Model: random_forest
RMSE scores across folds: [26309.46005886 26383.82782283 26674.39530204 26412.55691328
 26277.71342615]
Mean RMSE: 26411.59
Std RMSE: 140.1

***`Hyperparameter Tuning and Ensemble Methods`***

In [None]:
# Candidate models and the hyper-parameter grids searched for each one.
# Grid keys use the `model__` prefix because every estimator is the `model`
# step of a Pipeline.
models_and_params = {
    'Ridge': {
        'model': Ridge(random_state=42),
        'params': {
            'model__max_iter': [1000, 2000, 3000],
            'model__alpha': [0.1, 0.5, 1, 5]
        }
    },
    'Lasso': {
        'model': Lasso(random_state=42),
        'params': {
            'model__max_iter': [1000, 2000, 3000],
            'model__alpha': [0.1, 0.5, 1, 5]
        }
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'model__n_estimators': [80, 100, 120],
            'model__max_depth': [2, 4, 6, None],
            'model__min_samples_split': [2, 4, 6, 8],
            'model__min_samples_leaf': [1, 3, 5, 7]
        }
    },
    'XGBRegressor': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'params': {
            'model__n_estimators': [80, 100, 120],
            'model__max_depth': [2, 4, 6],
            # learning_rate (eta) is documented with range [0, 1]; the former
            # candidate 5 was outside it and only wasted fits.
            'model__learning_rate': [0.05, 0.1, 0.5, 1],
            'model__reg_alpha': [0.1, 0.5, 1, 5]
        }
    }
}

results = {}
for name, models in models_and_params.items():
    print('Training Model ...', name)
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', models['model'])
    ])

    # Exhaustive search over the grid with the shared CV splitter;
    # refit=True retrains the best configuration on the whole training split.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=models['params'],
        cv=cv,
        refit=True,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )

    # model training
    gs.fit(x_train, y_train)

    results[name] = {
        # mean negated MSE across folds (higher is better)
        'Best Score': gs.best_score_,
        # root of the mean CV MSE, on the same scale as the target
        'Best RMSE': np.sqrt(-gs.best_score_),
        'Best params': gs.best_params_,
        # keep the refitted pipeline; `gs` is overwritten on the next iteration
        'Best estimator': gs.best_estimator_
    }

    print('-'*50)

for name, result in results.items():
    print('Model : ', name)
    print('Best Score : ', result['Best Score'])
    print('Best RMSE : ', result['Best RMSE'])
    print('Best Parameters : ', result['Best params'])