In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [41]:
import sys
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/rossmann/')

In [None]:
! pip install category_encoders

In [38]:
import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler


from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.metrics import mean_squared_error

from sklearn import set_config

# Scikit-Learn estimators will be rendered as interactive diagrams
set_config(display="diagram")

import warnings
warnings.filterwarnings('ignore')

# autoreload automatically reloads the modules before executing the code,
# allowing you to see the changes immediately
%load_ext autoreload
%autoreload 2


In [49]:
! ls gdrive/MyDrive/Colab\ Notebooks/rossmann

functions.py  models.ipynb  model_tests.ipynb  train_reduced.csv


In [51]:
import pandas as pd
df = pd.read_csv('gdrive/MyDrive/Colab Notebooks/rossmann/train_reduced.csv')

In [52]:
df.head()

Unnamed: 0,Sales,Customers,Promo,StoreType,Assortment,CompetitionDistance,Promo2,Competition_Since_X_months,weeks_since_promo2,PromoInterval,StateHoliday
0,6068,620,1,d,c,13530.0,0,118,0,0,0
1,4151,559,1,a,a,50.0,0,0,0,0,0
2,5061,399,1,d,c,8990.0,1,164,671,"Mar,Jun,Sept,Dec",0
3,8030,898,0,a,c,3270.0,0,0,0,0,0
4,6710,903,1,a,c,100.0,1,157,688,"Feb,May,Aug,Nov",0


In [54]:
X = df.drop(columns=['Sales'])
y = df.Sales

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [55]:
# Initialize Encoders
le = ce.OrdinalEncoder(mapping=[{'col': 'Assortment', 'mapping': {'a': 0, 'b': 1, 'c': 2}},
                                {'col': 'StoreType',  'mapping': {'a': 0, 'b': 1, 'c': 2, 'd':3}}])

ohe = ce.OneHotEncoder(cols=['Promo', 'Promo2', 'PromoInterval', 'StateHoliday'], handle_unknown="ignore")


# make pipelines for categorical and numerical features
cat_pipeline = Pipeline([ ("label_encoder", le), ('one_hot_encoder', ohe) ])
num_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())


#set up the column transfomer
cat_features = ['Assortment', 'StoreType', 'Promo', 'Promo2', 'PromoInterval', 'StateHoliday']
num_features = ['Customers','CompetitionDistance', 'Competition_Since_X_months', 'weeks_since_promo2']

preprocessing = make_column_transformer(
                (cat_pipeline, cat_features),
                (num_pipeline, num_features),
                remainder='passthrough')

In [59]:
#estimate errors
def rmspe(preds, actuals):
    #preds = preds.reshape(-1)
    #actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [60]:
from sklearn.linear_model import LinearRegression

#Build pipeline
lin_reg = make_pipeline(preprocessing,
                        LinearRegression(n_jobs=-1))

#fit the pipeline
lin_reg.fit(X_train,y_train)

# make predictions
lr_pred = lin_reg.predict(X_test)

# score
lr_rmse = mean_squared_error(y_test, lr_pred, squared=False)
lr_rmspe = rmspe(lr_pred, y_test)

print(f'Linear Regression performance metrics:')
print(f'Root Mean Square Error RMSE:             {lr_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {lr_rmspe}')

Linear Regression performance metrics:
Root Mean Square Error RMSE:             1520.148515493983
Root Mean Square Percantage Error RMSPE: 22.901245189401795


## Decision Tree Regressor

In [64]:
from sklearn.tree import DecisionTreeRegressor

# initialze tree pipeline
tree_reg = make_pipeline(preprocessing,
                         DecisionTreeRegressor())

# fit-transform tree regressor
tree_reg.fit(X_train, y_train)

#make predictions
pred_tree = tree_reg.predict(X_test)


# Random Forrest

In [None]:
from sklearn.ensemble import RandomForestRegressor

#Build pipeline
pipe_rf = make_pipeline(preprocessing,
                RandomForestRegressor(
                    max_depth=4,
                    random_state=42,
                    n_estimators=300,
                    n_jobs=-1))

#fit the pipeline
pipe_rf.fit(X_train, y_train)

#make predictions
pred_rf = pipe_rf.predict(X_test)


In [62]:
# score
rf_rmse = mean_squared_error(y_test, pred_rf, squared=False)
rf_rmspe = rmspe(pred_rf, y_test)
print(f'Random Forrest Model performance metrics:')
print(f'Root Mean Square Error RMSE:             {rf_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {rf_rmspe}')

Random Forrest Model performance metrics:
Root Mean Square Error RMSE:             1504.3025193834098
Root Mean Square Percantage Error RMSPE: 22.823262692147008


In [66]:
from sklearn.ensemble import AdaBoostRegressor

#Build pipeline
pipe_adaboost = make_pipeline(preprocessing,
                              AdaBoostRegressor(
                                  random_state=42,
                                  n_estimators=300,
                                  loss='square'))

#fit the pipeline
pipe_adaboost.fit(X_train, y_train)

#make predictions
pred_ad = pipe_adaboost.predict(X_test)

# score
ad_rmse = mean_squared_error(y_test, pred_ad, squared=False)
ad_rmspe = rmspe(pred_ad, y_test)
print(f'Ada Boost Regressor performance metrics:')
print(f'Root Mean Square Error RMSE:             {ad_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {ad_rmspe}')

Ada Boost Regressor performance metrics:
Root Mean Square Error RMSE:             2670.4671355910805
Root Mean Square Percantage Error RMSPE: 51.53735738700827


# Grid search - DecsisonTreeRegression

In [77]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

tree_reg = Pipeline(steps=[('preprocessor', preprocessing),
                           ('dt', DecisionTreeRegressor())])

# Define the parameter grid for the grid search
# to pass the hyperparameters to the model inside the pipeline you need to adress the model by it's name and with __ 'dt__hyperparameter'
param_grid = {
    'dt__criterion': ['friedman_mse'],  # Measure for quality of split
    'dt__splitter': ['best'],  # Strategy used to choose the split at each node
    'dt__max_depth': [None],  # Maximum depth of the tree (None for unlimited depth)
    'dt__min_samples_split': [10, 15, 20],  # Minimum number of samples required to split an internal node
    'dt__min_samples_leaf': [2, 3, 4],  # Minimum number of samples required to be at a leaf node
    'dt__max_features': [None]  # Number of features to consider when looking for the best split
}

# Perform the grid search
dt_search = GridSearchCV(tree_reg, param_grid, cv=5, n_jobs=-1)
dt_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = dt_search.best_params_

In [75]:
best_params

{'dt__criterion': 'friedman_mse',
 'dt__max_depth': None,
 'dt__max_features': None,
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 10,
 'dt__splitter': 'best'}

In [79]:
dt_search.best_score_

0.9469497018958115

In [87]:
column_order = ['mean_test_score', 'std_test_score', 'rank_test_score', 'param_dt__criterion', 'param_dt__max_depth', 'param_dt__max_features',
       'param_dt__min_samples_leaf', 'param_dt__min_samples_split','param_dt__splitter', 'params',]

In [88]:
#cv_res = pd.DataFrame(dt_search.cv_results_)
#cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res[column_order].head()

Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_dt__criterion,param_dt__max_depth,param_dt__max_features,param_dt__min_samples_leaf,param_dt__min_samples_split,param_dt__splitter,params
1,0.94695,0.001353,1,friedman_mse,,,2,15,best,"{'dt__criterion': 'friedman_mse', 'dt__max_dep..."
4,0.946526,0.001307,2,friedman_mse,,,3,15,best,"{'dt__criterion': 'friedman_mse', 'dt__max_dep..."
0,0.946185,0.000767,3,friedman_mse,,,2,10,best,"{'dt__criterion': 'friedman_mse', 'dt__max_dep..."
3,0.946163,0.000818,4,friedman_mse,,,3,10,best,"{'dt__criterion': 'friedman_mse', 'dt__max_dep..."
2,0.946145,0.001574,5,friedman_mse,,,2,20,best,"{'dt__criterion': 'friedman_mse', 'dt__max_dep..."


# Decicion Tree Regressor  - Best Params

In [93]:
from sklearn.tree import DecisionTreeRegressor

# initialze tree pipeline
best_tree_reg = make_pipeline(preprocessing,
                         DecisionTreeRegressor(
                            criterion = 'friedman_mse',
                            max_depth = None,
                            max_features = None,
                            min_samples_leaf = 2,
                            min_samples_split = 10,
                            splitter = 'best'
                            ))

# fit-transform tree regressor
best_tree_reg.fit(X_train, y_train)

#make predictions
best_tree_pred = tree_reg.predict(X_test)

In [94]:
# score
best_rmspe = rmspe(best_tree_pred, y_test)
best_rmse = mean_squared_error(y_test, best_tree_pred, squared=False)
print(f'Decision Tree Regressor with best hyper-params performance metrics:')
print(f'Root Mean Square Error RMSE:             {best_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {best_rmspe}')

Decision Tree Regressor with best hyper-params performance metrics:
Root Mean Square Error RMSE:             691.9136117358657
Root Mean Square Percantage Error RMSPE: 9.2368830358228


In [98]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Assuming you have a trained model named 'optimal_model'
# Choose an appropriate evaluation metric (e.g., mean squared error for regression)
scoring_metric = 'neg_mean_squared_error'

# Perform cross-validation using KFold with k=5 folds
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get the mean score and standard deviation
scores = cross_val_score(best_tree_reg, X_train, y_train, cv=cv, scoring=scoring_metric)
mean_score = -scores.mean()
std_score = scores.std()

# Print the results
print(f"Cross-validated Mean {scoring_metric}: {mean_score:.2f}")
print(f"Cross-validated Standard Deviation: {std_score:.2f}")

# Further fine-tune hyperparameters based on cross-validation results
# Continue training or adjusting hyperparameters as needed based on the performance evaluation


Cross-validated Mean neg_mean_squared_error: 506003.68
Cross-validated Standard Deviation: 12471.70
