<a href="https://colab.research.google.com/github/JONICK277/ML/blob/jost/code/model_evaluation/model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Initialization***



## Imports

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Data Download

In [2]:
!git clone https://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@github.com/JONICK277/ML.git

Cloning into 'ML'...
remote: Enumerating objects: 214, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 214 (delta 8), reused 24 (delta 6), pack-reused 178 (from 2)[K
Receiving objects: 100% (214/214), 341.59 MiB | 25.18 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [4]:
# Load the cleaned data
df_cleaned = pd.read_pickle("ML/data/cleaned/df_cleaned.pkl")

# ***Preparation***

In [5]:
# Separate target and features
target = "LAID_UP_TIME"
X = df_cleaned.drop(columns=[target])
y = df_cleaned[target]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# ***Models***

#Random Forest Regressor


In [7]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [13]:
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 70.49364706770096


In [16]:
#using cross-validation (it takes a while ca. 10 mins)
forest_scores = cross_val_score(model, X_test, y_test,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [107.81388707  52.22378502  50.17637369  51.46348477  78.67588093
  51.90493543  62.99173639  46.94772562 184.87138788  47.53443705]
Mean: 73.46036338503879
Standard deviation: 41.28690342017184


In [10]:
#identify the most important features
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances)


                           Feature  Importance
45              PURCHASE_DATE_year    0.178149
40         SCALED_INVENTURAL_VALUE    0.173141
46             PURCHASE_DATE_month    0.076830
35                 COMMISSION_TYPE    0.043714
49      PURCHASE_BOOKING_DATE_year    0.039147
50     PURCHASE_BOOKING_DATE_month    0.034810
37               AT_LOCATION_SINCE    0.027610
39            SCALED_CURRENT_VALUE    0.026375
38                     MILAGE_SALE    0.025771
1                           OFFICE    0.023961
6                          MILEAGE    0.021896
7                     MILAGE_SALES    0.021769
5                     VEHICLE_TYPE    0.021202
17          PERMITTED_TOTAL_WEIGHT    0.020596
36                 PURCHASE_MILAGE    0.019171
8                            COLOR    0.015428
44         SCALED_TOTAL_SALE_PRICE    0.014630
12                     ENGINE_TYPE    0.014525
42              SCALED_GUIDE_PRICE    0.014203
43  SCALED_TOTAL_SALES_PRICE_BASIS    0.014062
41           

# ***Saving***

In [17]:
# Save the DataFrame as a pickle file
df_cleaned.to_pickle("ML/data/cleaned/df_cleaned.pkl")

