# ***Initialization***



## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

## Data Download

In [5]:
# Execute this if you are running the notebook in Google Colab
!git clone https://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@github.com/JONICK277/ML.git
train_cleaned = pd.read_pickle("ML/data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("ML/data/cleaned/test/test_cleaned.pkl")

fatal: destination path 'ML' already exists and is not an empty directory.


In [4]:
# Load the cleaned data
train_cleaned = pd.read_pickle("../../data/cleaned/train/train_cleaned.pkl")
test_cleaned = pd.read_pickle("../../data/cleaned/test/test_cleaned.pkl")

# ***Preparation***

In [5]:
target = "LAID_UP_TIME"
X = train_cleaned.drop(columns=[target])
y = train_cleaned[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# ***Models***

## Random Forest Regressor


In [11]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [13]:
# Initialize the model
model = RandomForestRegressor()

# Train the model
#model.fit(X_train, y_train)

In [14]:
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 37.05565766574534


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
forest_scores = cross_val_score(model, X_test, y_test,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [ 55.19188952  51.97113777  83.9596482   53.91507325  85.41403271
  46.80983444  56.34191747 184.8806717   48.72954404  48.42614876]
Mean: 71.56398978626434
Standard deviation: 40.063459037945336


In [None]:
# identify the most important features
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

                           Feature  Importance
46              PURCHASE_DATE_year    0.171865
41         SCALED_INVENTURAL_VALUE    0.170071
47             PURCHASE_DATE_month    0.076165
50      PURCHASE_BOOKING_DATE_year    0.050008
6                     VEHICLE_TYPE    0.049595
36                 COMMISSION_TYPE    0.042716
51     PURCHASE_BOOKING_DATE_month    0.036622
40            SCALED_CURRENT_VALUE    0.027074
18          PERMITTED_TOTAL_WEIGHT    0.025654
38               AT_LOCATION_SINCE    0.025597
3                   CHASSIS_NUMBER    0.017577
39                     MILAGE_SALE    0.016994
22              CONSTRUCTION_MONTH    0.016966
7                          MILEAGE    0.016607
13                     ENGINE_TYPE    0.015598
0                          COMPANY    0.015008
8                     MILAGE_SALES    0.014628
45         SCALED_TOTAL_SALE_PRICE    0.014406
43              SCALED_GUIDE_PRICE    0.013903
1                           OFFICE    0.013208
42           

## XGBOOST


In [19]:
from xgboost import XGBRegressor, DMatrix


model = XGBRegressor(tree_method='hist', device= "cuda", random_state=42)
random_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [3, 5, 7, 8, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

rf_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rf_random.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [21]:
print("Best parameters:", rf_random.best_params_)


Best parameters: {'subsample': 0.8, 'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [22]:
best_model = rf_random.best_estimator_


In [23]:
# using cross-validation (it takes a while ca. 10 mins)
xgboost_scores = cross_val_score(best_model, X_test, y_test,
                                scoring="neg_mean_squared_error", cv=10)
xgboost_rmse_scores = np.sqrt(-xgboost_scores)
display_scores(xgboost_rmse_scores)

Scores: [42.05628486 41.16991098 40.66609466 41.23087233 38.82846084 40.02339349
 41.46395893 39.83299895 40.36888663 41.55494252]
Mean: 40.71958041865486
Standard deviation: 0.9206415209383854


In [14]:
grid_search = GridSearchCV(
    estimator=best_model,
    param_grid={
        'n_estimators': [250, 300, 350],
        'max_depth': [5, 6, 7]
    },
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best Grid Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Grid Parameters: {'max_depth': 7, 'n_estimators': 350}


## Gradient Boost

In [6]:
from xgboost import XGBRegressor, DMatrix

In [42]:
x_train = X_train.to_numpy()  # If it's a Pandas DataFrame
Y_train = y_train.to_numpy()  # If it's a Pandas Series

In [44]:
dtrain = DMatrix(x_train, label=Y_train)

In [14]:
# Initialize the model
model = XGBRegressor(
    tree_method='hist',  # GPU support
    device="cuda",
    predictor='gpu_predictor',
    random_state=42
)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700,800,900],
    'max_depth': [3, 5, 7, 8, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],          # Subsample ratio of columns
    'min_child_weight': [1, 3, 5, 6, 7,8],                # Minimum sum of weights for a split
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,                   # Number of combinations to try
    scoring='neg_mean_squared_error',  # Use MSE as the scoring metric
    cv=3,                        # 5-fold cross-validation
    verbose=2,                   # Print progress
    random_state=42,
    n_jobs=-1                    # Use all CPUs
)

# Fit the model
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



In [15]:
print("Best parameters:", random_search.best_params_)


Best parameters: {'subsample': 0.8, 'n_estimators': 600, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.6}


In [None]:
# using cross-validation (it takes a while ca. 10 mins)
xgboost_scores = cross_val_score(random_search, X_test, y_test,
                                scoring="neg_mean_squared_error", cv=10)
xgboost_rmse_scores = np.sqrt(-xgboost_scores)
display_scores(xgboost_rmse_scores)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "predictor" } are not used.



Fitting 3 folds for each of 50 candidates, totalling 150 fits
