In [2]:
import numpy as np
import pandas as pd
import random
import os
import shutil
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
import warnings

warnings.filterwarnings("ignore")

In [10]:
# Functions
def grid_search_mae_mse_r2(model, param_grid, x, y, cv=None):
    grid_search_mae = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_absolute_error')
    grid_search_mse = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error')
    grid_search_r2 = GridSearchCV(model, param_grid, cv=cv, scoring='r2')
    grid_search_mse.fit(x, y)
    grid_search_mae.fit(x, y)
    grid_search_r2.fit(x, y)
    print("The model having parameters ", grid_search_mae.best_params_, " has the best performance in MAE: ", grid_search_mae.best_score_)
    print("The model having parameters ", grid_search_mse.best_params_, " has the best performance in MSE: ", grid_search_mse.best_score_)
    print("The model having parameters ", grid_search_r2.best_params_, " has the best performance in R2: ", grid_search_r2.best_score_)
    return grid_search_mae.best_estimator_, grid_search_mse.best_estimator_, grid_search_r2.best_estimator_

def halving_grid_search_mae_mse_r2(model, param_grid, x, y, cv=None):
    # grid_search_mae = HalvingRandomSearchCV(model, param_grid, cv=cv, scoring='neg_mean_absolute_error')
    # grid_search_mse = HalvingRandomSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error')
    grid_search_r2 = HalvingRandomSearchCV(model, param_grid, cv=cv, scoring='r2')
    # grid_search_mse.fit(x, y)
    # grid_search_mae.fit(x, y)
    grid_search_r2.fit(x, y)
    # print("The model having parameters ", grid_search_mae.best_params_, " has the best performance in MAE: ", grid_search_mae.best_score_)
    # print("The model having parameters ", grid_search_mse.best_params_, " has the best performance in MSE: ", grid_search_mse.best_score_)
    print("The model having parameters ", grid_search_r2.best_params_, " has the best performance in R2: ", grid_search_r2.best_score_)
    # return grid_search_mae.best_estimator_, grid_search_mse.best_estimator_, grid_search_r2.best_estimator_
    return grid_search_r2.best_estimator_

In [4]:
df = pd.DataFrame(pd.read_csv('..\\clean_data.csv',
                  parse_dates=[0]))

In [5]:
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month

In [6]:
df

Unnamed: 0,cluster_nl_encode,che_perc_gdp,cluster_nl,date,insurance_perc_che,price_month,price_unit,public_perc_che,target,month
0,0.5434049417909654,1.665879,BRAND_354E_COUNTRY_88A3,2014-06-01,1.893333,1.006444,1.013784,1.835821,1.000784,6
1,0.27836938509379616,1.689348,BRAND_626D_COUNTRY_8B47,2014-06-01,1.495874,1.120724,1.626677,1.779263,1.000000,6
2,0.4245175907491331,1.665879,BRAND_45D9_COUNTRY_88A3,2014-06-01,1.893333,1.120724,3.144874,1.835821,1.002258,6
3,0.8447761323199037,2.051770,BRAND_D724_COUNTRY_445D,2014-06-01,1.000000,1.120724,1.213446,1.805970,1.068761,6
4,0.004718856190972565,2.059130,BRAND_4887_COUNTRY_D8B0,2014-06-01,2.013333,1.018589,1.008708,1.880597,1.036312,6
...,...,...,...,...,...,...,...,...,...,...
118826,0.684666782722396,2.058055,BRAND_2058_COUNTRY_C8F4,2022-12-01,1.495874,1.054007,1.100336,2.029851,1.203657,12
118827,0.2546771514048032,1.819485,BRAND_4888_COUNTRY_6F78,2022-12-01,1.173333,1.008317,1.029630,1.955224,1.109272,12
118828,0.09646404569367384,1.491552,BRAND_0056_COUNTRY_0C7D,2022-12-01,1.826667,1.017259,1.018310,1.926795,1.343341,12
118829,0.7853799116612096,2.020277,BRAND_6200_COUNTRY_89F9,2022-12-01,1.495874,1.960978,2.490911,1.985847,1.266831,12


In [7]:
type(df['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [7]:
# id_dict = df.drop_duplicates(subset='cluster_nl').set_index('cluster_nl')['cluster_nl_encode'].to_dict()

In [8]:
cutoff_date = pd.to_datetime('2018-01-01')

train_data = df[df['date'] < cutoff_date]
test_data = df[df['date'] >= cutoff_date]

# Prepare the feature matrix and target vector for training and testing
X_train = train_data[['cluster_nl_encode', 'che_perc_gdp','insurance_perc_che', 'price_month', 'price_unit', 'public_perc_che', 'month']]  # Adjust based on actual column names
y_train = train_data['target']  # Adjust based on actual column name

X_test = test_data[['cluster_nl_encode', 'che_perc_gdp','insurance_perc_che', 'price_month', 'price_unit', 'public_perc_che', 'month']] # Adjust based on actual column names
y_test = test_data['target']  # Adjust based on actual column name

In [11]:
# Grid search
model = RandomForestRegressor(random_state=0)

# param_grid = {
#     'n_estimators': [i for i in np.arange(0, 10000, 100)],
#     'max_depth': [i for i in np.arange(0, 20, 1)],
#     'min_samples_split': [i for i in np.arange(0, 20, 1)],
#     'min_samples_leaf': [i for i in np.arange(0, 20, 1)],
#     'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
#     'max_features': ['sqrt', 'log2', 'None'],
#     'max_leaf_nodes': [i for i in np.arange(0, 20, 1)],
#     'max_samples': [i for i in np.arange(0, 20, 1)],
#     'ccp_alpha': [i for i in np.arange(0, 10, 0.1)]
# }


param_grid = {
    'n_estimators': np.arange(600, 1000, 200),
    # 'max_depth': np.arange(10, 15, 1), 
    # 'min_samples_split': np.arange(1, 10, 2),  
    # 'min_samples_leaf': np.arange(1, 5, 1), 
    # 'max_leaf_nodes': np.arange(10, 20, 5), 
    'ccp_alpha': np.arange(0, 0.1, 0.02) 
}


# tscv = TimeSeriesSplit(n_splits=5)
grid_search_mae_model, grid_search_mse_model, grid_search_r2_model = halving_grid_search_mae_mse_r2(model, param_grid, X_train, y_train)
# grid_search_mae_model, grid_search_mse_model, grid_search_r2_model = halving_grid_search_mae_mse_r2(model, param_grid, X_train, y_train, tscv)

The model having parameters  {'n_estimators': np.int64(600), 'ccp_alpha': np.float64(0.04)}  has the best performance in R2:  -0.7179176561831607


AttributeError: 'HalvingRandomSearchCV' object has no attribute 'best_estimator'

In [None]:
# model = RandomForestRegressor(n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_leaf_nodes=15, max_depth=18, ccp_alpha=0.04, random_state=0)
# model.fit(X_train, y_train)
# # Make predictions on the test data
# y_pred = model.predict(X_test)

# # Evaluate the model's performance
# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Display the evaluation metrics
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R-squared (R2): {r2}")