<a href="https://colab.research.google.com/github/JONICK277/ML/blob/jost/code/model_evaluation/model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Initialization***



## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Data Download

In [None]:
# Execute this if you are running the notebook in Google Colab
!git clone https://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@github.com/JONICK277/ML.git
train_cleaned = pd.read_pickle("ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("ML/data/cleaned/test/test_cleaned.pkl")

In [None]:
# Read the cleaned train and test frames from the data folder, addressed
# relative to the current working directory.
# NOTE(review): read_pickle executes code embedded in the file; only load trusted pickles.
train_cleaned, test_cleaned = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../../data/cleaned/train/train_cleaned.pkl",
        "../../data/cleaned/test/test_cleaned.pkl",
    )
)

# ***Preparation***

In [None]:
# Name of the column the models are trained to predict.
target = "LAID_UP_TIME"

# Separate the label column from the remaining (feature) columns.
y = train_cleaned[target]
X = train_cleaned.drop(target, axis=1)

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : object providing ``mean()`` and ``std()`` methods
        For example a NumPy array of per-fold cross-validation scores.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)

# ***Models***

## Random Forest Regressor


In [None]:
# Hyper-parameters of the forest: 100 trees, fixed seed for repeatable fits.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)

# Fit on the training split (the fitted estimator is shown as the cell output).
model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and report the root mean squared error against their labels.
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 37.07677285364513


In [None]:
# 10-fold cross-validation of the random forest (this can take several minutes).
# The folds are drawn from the training split only: cross-validating on
# X_test / y_test would train on the held-out rows and leave no untouched data
# for the final evaluation, and would use only 20 % of the available rows.
forest_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,  # evaluate the folds in parallel on all available cores
)

# The scorer returns negated MSE values, so flip the sign before taking the
# square root to obtain one RMSE per fold.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [42.57535887 41.47482797 41.89615578 41.79385968 39.62868291 41.61047281
 42.69608343 40.61671081 41.10268999 42.6018996 ]
Mean: 41.59967418430167
Standard deviation: 0.9163983332722933


In [None]:
# Pair every training column with the importance the fitted forest assigned to it,
# then list the features from most to least important.
importance_table = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": model.feature_importances_}
)
feature_importances = importance_table.sort_values("Importance", ascending=False)

print(feature_importances)

                           Feature  Importance
46              PURCHASE_DATE_year    0.171865
41         SCALED_INVENTURAL_VALUE    0.170071
47             PURCHASE_DATE_month    0.076165
50      PURCHASE_BOOKING_DATE_year    0.050008
6                     VEHICLE_TYPE    0.049595
36                 COMMISSION_TYPE    0.042716
51     PURCHASE_BOOKING_DATE_month    0.036622
40            SCALED_CURRENT_VALUE    0.027074
18          PERMITTED_TOTAL_WEIGHT    0.025654
38               AT_LOCATION_SINCE    0.025597
3                   CHASSIS_NUMBER    0.017577
39                     MILAGE_SALE    0.016994
22              CONSTRUCTION_MONTH    0.016966
7                          MILEAGE    0.016607
13                     ENGINE_TYPE    0.015598
0                          COMPANY    0.015008
8                     MILAGE_SALES    0.014628
45         SCALED_TOTAL_SALE_PRICE    0.014406
43              SCALED_GUIDE_PRICE    0.013903
1                           OFFICE    0.013208
42           