In [3]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

In [2]:
data_path = 'procDF2.csv'
processed_df = pd.read_csv(data_path)
processed_df

Unnamed: 0,DIS,FLO,LOY,PRI,PIM,FEA,LAG,CHO
0,-0.43600,-0.056,1.0,0.1429,0.03510,-0.175710,1.0,9
1,3.85290,-0.576,1.0,0.8136,-0.48915,-0.588930,1.0,9
2,4.58810,0.260,1.0,0.1502,-0.09050,-0.195670,1.0,9
3,61.55460,-0.139,1.0,0.0817,-0.08200,-0.435940,1.0,9
4,1.70100,0.259,1.0,-0.1534,-0.05960,0.015470,1.0,9
...,...,...,...,...,...,...,...,...
3146269,4.10573,0.135,0.0,-2.1534,0.08141,5.524900,1.0,5
3146270,37.22023,2.173,0.0,-2.5829,0.11651,0.619060,1.0,5
3146271,0.03146,0.348,0.0,-1.2892,-0.19316,0.032053,1.0,5
3146272,5.57263,0.168,0.0,-2.1721,0.12531,1.746300,1.0,5


In [3]:
shops_features = ['DIS','FLO','LOY','PRI','PIM','FEA','LAG']
X = processed_df[shops_features]
y = processed_df.CHO


transformer_num = make_pipeline(
    StandardScaler()
)


X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y,stratify=y, test_size=0.33)



X_train = transformer_num.fit_transform(X_train)
X_valid = transformer_num.transform(X_valid)

In [7]:
default_model = RandomForestClassifier(random_state=1)
print("Default parameters in model: \n", default_model.get_params())

Default parameters in model: 
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


In [8]:
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": (1, 9),
    "criterion": ["gini", "entropy"],
}
param_grid

{'n_estimators': [100, 200, 300, 400],
 'max_depth': (1, 9),
 'criterion': ['gini', 'entropy']}

In [11]:
opt = BayesSearchCV(
    RandomForestClassifier(random_state=1),
    param_grid,
    n_iter=30,
    n_jobs=-1,
    cv = StratifiedKFold(n_splits=3, shuffle=True),
    random_state=1,
    verbose=0
)

opt.fit(X_train, y_train)



BayesSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
              estimator=RandomForestClassifier(random_state=1), n_iter=30,
              n_jobs=-1, random_state=1,
              search_spaces={'criterion': ['gini', 'entropy'],
                             'max_depth': (1, 9),
                             'n_estimators': [100, 200, 300, 400]})

In [12]:
opt.best_params_

OrderedDict([('criterion', 'gini'), ('max_depth', 9), ('n_estimators', 100)])

In [13]:
opt_best = opt.best_estimator_
print("Best parameters in random: \n", opt_best)
opt_accuracy = cross_val_score(opt_best, X_train, y_train, cv=StratifiedKFold(n_splits=3, shuffle=True))
default_accuracy = cross_val_score(default_model, X_train, y_train, cv=StratifiedKFold(n_splits=3, shuffle=True))
print("RandomSearchCV accuracy: ", opt_accuracy)
print("Default model accuracy: ", default_accuracy)

Best parameters in random: 
 RandomForestClassifier(max_depth=9, random_state=1)
RandomSearchCV accuracy:  [0.63477346 0.63334889 0.63642664]
Default model accuracy:  [0.97225005 0.9726158  0.97191984]


In [4]:
depth_test_model = RandomForestClassifier(random_state=1, criterion='gini',n_estimators=100,max_depth=35)
print("Depth_test_model parameters: \n", depth_test_model.get_params())

Depth_test_model parameters: 
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 35, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


In [6]:
depth_test_accuracy = cross_val_score(depth_test_model, X_train, y_train, cv=StratifiedKFold(n_splits=3, shuffle=True))
print("Depth test accuracy: ", depth_test_accuracy)

Depth test accuracy:  [0.97192984 0.97196969 0.9717448 ]


In [10]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [15]:
dt_model.tree_.max_depth

47

In [16]:
depth_test_model2 = RandomForestClassifier(random_state=1, criterion='gini',n_estimators=100,max_depth=40)
print("Depth_test_model2 parameters: \n", depth_test_model2.get_params())
depth_test_accuracy = cross_val_score(depth_test_model, X_train, y_train, cv=StratifiedKFold(n_splits=3, shuffle=True))
print("Depth test accuracy: ", depth_test_accuracy)

Depth_test_model2 parameters: 
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 40, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Depth test accuracy:  [0.97158687 0.97274958 0.97238379]


Final parameters:DecisionTreeClassifier(random_state=1, criterion='gini',n_estimators=100,max_depth=40)