<a href="https://colab.research.google.com/github/JONICK277/ML/blob/main/code/model_evaluation/model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Initialization***



## Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

## Data Download

In [None]:
# Execute this if you are running the notebook in Google Colab
!git clone https://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@github.com/JONICK277/ML.git
train_cleaned = pd.read_pickle("ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("ML/data/cleaned/test/test_cleaned.pkl")

In [None]:
# Load the cleaned data
train_cleaned = pd.read_pickle("../../data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("../../data/cleaned/test/test_cleaned.pkl")
test_cleaned_big = pd.read_pickle("../../data/cleaned/test/test_cleaned_no_corr.pkl")

# ***Preparation***

In [None]:
target = "LAID_UP_TIME"
X = train_cleaned.drop(columns=[target])
y = train_cleaned[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Run this for training with the whole dataset

target = "LAID_UP_TIME"
X_train = train_cleaned.drop(columns=[target])
y_train = train_cleaned[target]
X_val = test_cleaned

In [None]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# ***Models***

## Random Forest Regressor


In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 800, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 70, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': [500, 700, 800, 1000],
    'max_features': ['sqrt'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10],
    'bootstrap': [True, False]
}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30],
 'max_features': ['sqrt'],
 'min_samples_leaf': [2, 5, 10],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [500, 700, 800, 1000]}


In [None]:
# Initialize the model
model_forest = RandomForestRegressor()

In [None]:
rf_random = RandomizedSearchCV(estimator = model_forest, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
print("Best parameters:", rf_random.best_params_)

Best parameters: {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}


In [None]:
best_model_forest = rf_random.best_estimator_

In [None]:
# Save the best model to a file
with open('best_model_forest.pkl', 'wb') as f:
    pickle.dump(best_model_forest, f)

### RF with optimal hyperparameters

In [None]:
with open('best_model_forest.pkl', 'rb') as f:
    best_model_forest = pickle.load(f)

In [None]:
# RUN THIS IF YOU JUST WANT TO TEST THE RMSE
y_pred = best_model_forest.predict(X_val)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- CHASSIS_NUMBER
- LAID_UP_TIME


In [None]:
# RUN THIS FOR PREDICTING THE TEST DATASET

test_cleaned_copy = test_cleaned.copy()

chassis_number = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])

y_pred = best_model_forest.predict(test_cleaned_copy)

result = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number,
    'LAID_UP_TIME': y_pred
})

In [None]:
try:
    with open("../../results/teamB-model2.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("../../results/teamB-model2.xlsx", index=False)

In [None]:
# Run if you are running the notebook in Google Colab
try:
    with open("ML/results/teamB-model1.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("ML/results/teamB-model1.xlsx", index=False)

In [None]:
rmse = root_mean_squared_error(y_val, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 35.62983353012489


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
forest_scores = cross_val_score(best_model_forest, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [42.2193684  41.62191829 39.98643414 41.14465219 39.12568987 40.34796485
 41.78455325 40.85421408 40.05245031 41.01393571]
Mean: 40.81511811025712
Standard deviation: 0.8973894781711803


In [None]:
# identify the most important features
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model_forest.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

                        Feature  Importance
22      SCALED_INVENTURAL_VALUE    0.154952
23           PURCHASE_DATE_year    0.090250
24          PURCHASE_DATE_month    0.082470
25   PURCHASE_BOOKING_DATE_year    0.068587
26  PURCHASE_BOOKING_DATE_month    0.065321
18              COMMISSION_TYPE    0.043661
1                        OFFICE    0.043165
20            AT_LOCATION_SINCE    0.039328
21                  MILAGE_SALE    0.038270
4                       MILEAGE    0.036988
6                         COLOR    0.036941
7                   ENGINE_TYPE    0.036654
5                  MILAGE_SALES    0.034257
19              PURCHASE_MILAGE    0.030678
12            YEAR_CONSTRUCTION    0.027328
0                       COMPANY    0.026386
11                  CURB_WEIGHT    0.026183
8             TRANSMISSION_TYPE    0.024283
3                  MANUFACTURER    0.020943
2             OFFICE_MAIN_BRAND    0.015825
17             VEHICLE_MODEL_ID    0.015050
15                    FUEL_TYPE 

## XGBOOST


In [None]:



model_xboost = XGBRegressor(tree_method='hist', device= "cuda", random_state=42)
random_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [3, 5, 7, 8, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xg_random = RandomizedSearchCV(
    estimator=model_xboost,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
xg_random.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
print("Best parameters:", xg_random.best_params_)

Best parameters: {'subsample': 0.8, 'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [None]:
best_model_xg = xg_random.best_estimator_

In [None]:
# Save the best model to a file
with open('best_model_xgboost.pkl', 'wb') as f:
    pickle.dump(best_model_xg, f)

### XG Boost with optimal params

In [None]:
with open('best_model_xgboost.pkl', 'rb') as f:
    best_model_xg = pickle.load(f)

In [None]:
# RUN THIS IF YOU JUST WANT TO TEST THE RMSE
y_pred = best_model_xg.predict(X_val)

In [None]:
# RUN THIS FOR PREDICTING THE TEST DATASET

test_cleaned_copy = test_cleaned.copy()

chassis_number = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])

y_pred = best_model_xg.predict(test_cleaned_copy)

result = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number,
    'LAID_UP_TIME': y_pred
})

In [None]:
try:
    with open("../../results/teamB-model1.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("../../results/teamB-model1.xlsx", index=False)

In [None]:
# Run if you are running the notebook in Google Colab
try:
    with open("ML/results/teamB-model1.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("ML/results/teamB-model1.xlsx", index=False)

In [None]:
rmse = root_mean_squared_error(y_val, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 35.363599687982216


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
xgboost_scores = cross_val_score(best_model_xg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
xgboost_rmse_scores = np.sqrt(-xgboost_scores)
display_scores(xgboost_rmse_scores)

In [None]:
# identify the most important features
feature_importances_xg = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model_xg.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances_xg)

## Gradient Boost

In [None]:
# Initialize the model
model_grad = XGBRegressor(
    tree_method='hist',  # GPU support
    device="cuda",
    random_state=42
)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700,800,900],
    'max_depth': [3, 5, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 6, 7,8],
}

# Set up RandomizedSearchCV
random_grad_search = RandomizedSearchCV(
    estimator=model_grad,
    param_distributions=param_grid,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the model
random_grad_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
print("Best parameters:", random_grad_search.best_params_)

Best parameters: {'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.6}


In [None]:
best_model_grad = random_grad_search.best_estimator_

In [None]:
# Save the best model to a file
with open('best_model_grad_boost_small_test_set.pkl', 'wb') as f:
    pickle.dump(best_model_grad, f)

### Grad Boost with optimal params

In [None]:
train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99004 entries, 0 to 99003
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   COMPANY                      99004 non-null  float64
 1   MILEAGE                      99004 non-null  float64
 2   MILAGE_SALES                 99004 non-null  float64
 3   NUMBER_DOORS                 99004 non-null  float64
 4   NUMBER_SEATS                 99004 non-null  float64
 5   CURB_WEIGHT                  99004 non-null  float64
 6   YEAR_CONSTRUCTION            74366 non-null  float64
 7   NUMBER_AXLE                  99004 non-null  float64
 8   IS_USED_CAR                  99004 non-null  float64
 9   VEHICLE_MODEL_ID             99004 non-null  float64
 10  PURCHASE_MILAGE              99004 non-null  float64
 11  AT_LOCATION_SINCE            99004 non-null  float64
 12  LAID_UP_TIME                 99004 non-null  float64
 13  MILAGE_SALE          

In [None]:
test_cleaned["YEAR_CONSTRUCTION"]

0        31.0
1        39.0
2        38.0
3        38.0
4        39.0
         ... 
42455    37.0
42456    37.0
42457    45.0
42458    36.0
42459    39.0
Name: YEAR_CONSTRUCTION, Length: 42460, dtype: float64

In [None]:
with open('best_model_grad_boost_small_test_set.pkl', 'rb') as f:
    best_model_grad = pickle.load(f)

In [None]:
y_pred = best_model_grad.predict(X_val)

In [None]:
# RUN THIS FOR PREDICTING THE TEST DATASET

test_cleaned_copy = test_cleaned.copy()

chassis_number = test_cleaned_copy['CHASSIS_NUMBER']
test_cleaned_copy = test_cleaned_copy.drop(columns=['CHASSIS_NUMBER', 'LAID_UP_TIME'])

y_pred = best_model_grad.predict(test_cleaned_copy)

result = pd.DataFrame({
    'CHASSIS_NUMBER': chassis_number,
    'LAID_UP_TIME': y_pred
})

In [None]:
try:
    with open("../../results/teamB-model1.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("../../results/teamB-model1.xlsx", index=False)

In [None]:
# Run if you are running the notebook in Google Colab
try:
    with open("ML/results/teamB-model1.xlsx") as f:
        raise FileExistsError
except FileNotFoundError:
    result.to_excel("ML/results/teamB-model1.xlsx", index=False)

OSError: Cannot save file into a non-existent directory: 'ML\results'

In [None]:
rmse = root_mean_squared_error(y_val, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 35.01006204755529


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
grad_boost_scores = cross_val_score(best_model_grad, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
grad_boost_rmse_scores = np.sqrt(-grad_boost_scores)
display_scores(grad_boost_rmse_scores)

Scores: [40.58595301 39.86335579 37.73124196 39.58642202 37.48961423 37.99204791
 39.88347865 39.36880303 38.31171444 38.21755841]
Mean: 38.903018944595715
Standard deviation: 1.020781529523897


In [None]:
# identify the most important features
feature_importances_grad = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model_grad.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances_grad)

                        Feature  Importance
14           PURCHASE_DATE_year    0.188066
31      COMMISSION_TYPE_encoded    0.121602
22         VEHICLE_TYPE_encoded    0.100857
16   PURCHASE_BOOKING_DATE_year    0.091139
13      SCALED_INVENTURAL_VALUE    0.081404
15          PURCHASE_DATE_month    0.045167
8                   IS_USED_CAR    0.032757
17  PURCHASE_BOOKING_DATE_month    0.027530
11            AT_LOCATION_SINCE    0.027067
19    OFFICE_MAIN_BRAND_encoded    0.024944
12                  MILAGE_SALE    0.019882
7                   NUMBER_AXLE    0.019801
0                       COMPANY    0.016325
25          ENGINE_TYPE_encoded    0.016109
20         MANUFACTURER_encoded    0.015451
6             YEAR_CONSTRUCTION    0.015103
18               OFFICE_encoded    0.013944
10              PURCHASE_MILAGE    0.013143
24           UPHOLSTERY_encoded    0.012477
28  FINANCING_TYPE_NAME_encoded    0.011074
26    TRANSMISSION_TYPE_encoded    0.011057
27    TRANSMISSION_NAME_encoded 