<a href="https://colab.research.google.com/github/JONICK277/ML/blob/jost/code/model_evaluation/model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Initialization***



## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

## Data Download

In [2]:
# Execute this if you are running the notebook in Google Colab
!git clone https://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@github.com/JONICK277/ML.git
train_cleaned = pd.read_pickle("ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("ML/data/cleaned/test/test_cleaned.pkl")

Cloning into 'ML'...
remote: Enumerating objects: 287, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 287 (delta 4), reused 2 (delta 2), pack-reused 270 (from 1)[K
Receiving objects: 100% (287/287), 306.28 MiB | 23.87 MiB/s, done.
Resolving deltas: 100% (80/80), done.
Updating files: 100% (39/39), done.


In [3]:
# Load the cleaned data
train_cleaned = pd.read_pickle("/content/ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("/content/ML/data/cleaned/test/test_cleaned.pkl")

# ***Preparation***

In [4]:
target = "LAID_UP_TIME"
X = train_cleaned.drop(columns=[target])
y = train_cleaned[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# ***Models***

## Random Forest Regressor


In [6]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [7]:
# Initialize the model
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)

In [11]:
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [8]:
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 37.05565766574534


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
forest_scores = cross_val_score(model, X_test, y_test,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [ 55.19188952  51.97113777  83.9596482   53.91507325  85.41403271
  46.80983444  56.34191747 184.8806717   48.72954404  48.42614876]
Mean: 71.56398978626434
Standard deviation: 40.063459037945336


In [None]:
# identify the most important features
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

                           Feature  Importance
46              PURCHASE_DATE_year    0.171865
41         SCALED_INVENTURAL_VALUE    0.170071
47             PURCHASE_DATE_month    0.076165
50      PURCHASE_BOOKING_DATE_year    0.050008
6                     VEHICLE_TYPE    0.049595
36                 COMMISSION_TYPE    0.042716
51     PURCHASE_BOOKING_DATE_month    0.036622
40            SCALED_CURRENT_VALUE    0.027074
18          PERMITTED_TOTAL_WEIGHT    0.025654
38               AT_LOCATION_SINCE    0.025597
3                   CHASSIS_NUMBER    0.017577
39                     MILAGE_SALE    0.016994
22              CONSTRUCTION_MONTH    0.016966
7                          MILEAGE    0.016607
13                     ENGINE_TYPE    0.015598
0                          COMPANY    0.015008
8                     MILAGE_SALES    0.014628
45         SCALED_TOTAL_SALE_PRICE    0.014406
43              SCALED_GUIDE_PRICE    0.013903
1                           OFFICE    0.013208
42           