In [None]:
from basic_functions import *
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Load the data set through the project helper get_data_from_csv (presumably
# provided by the star import from basic_functions) and preview the first rows.
# NOTE(review): the CSV location is decided inside the helper — confirm there.
df = get_data_from_csv()
df.head()

In [None]:
# Run the project's feature_engineering helper on the loaded frame and preview
# the result.
# The result is re-bound to `df`, so re-running this cell on its own passes the
# already-engineered data back into the helper; re-run the loading cell first.
df = feature_engineering(df)
df.head()

In [None]:
# Split into train and test data with the project helper tts_custom.
# RSEED = 42 is passed by keyword; presumably the random seed of the split —
# confirm in basic_functions.
X_train, X_test, y_train, y_test = tts_custom(df, RSEED = 42)

In [None]:
# Resample the training split only (the test split is not passed in) with the
# project helper custom_smote; per the original note this is SMOTENC.
# The third positional argument, 42, is presumably the random seed — TODO confirm.
X_train_sm, y_train_sm = custom_smote(X_train, y_train, 42)

In [None]:
# Column groups used by the preprocessing steps: numerical columns are handed
# to the scaling helper below, categorical columns are listed one per line.
num_features = ["Value", "time_of_day"]
cat_features = [
    "ProviderId",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy",
    "weekday",
    "difference",
    "InOut",
]

# Scale the numerical features of the resampled training data and of the test
# data; only the numerical column list is passed to the helper (keyword `nf`).
X_train_sm_sc, X_test_sc = custom_preprocess(
    X_train_sm,
    X_test,
    nf=num_features,
)

In [None]:
# Create dummy variables for training and test set.
# The second return value of each call gets its own name so that `cat_features`
# keeps referring to the raw categorical column list instead of being replaced
# by whatever cust_dummies returns for the test set.
X_train_sm_sc, cat_features_dummies = cust_dummies(X_train_sm_sc, cat_features)
X_test_sc, cat_features_dummies_test = cust_dummies(X_test_sc, cat_features)
# NOTE(review): train and test are encoded by two independent calls. If
# cust_dummies derives its output columns from the categories present in each
# input, the two column sets can differ — confirm they match before modelling.

In [None]:
# Train a logistic regression classifier via the project helper custom_logreg,
# using the resampled + scaled training data and the scaled test data.
# The second return value is compared with y_test below, so it presumably holds
# the test-set predictions (and the first the training-set predictions) —
# confirm in basic_functions.
y_train_sm_lr, y_test_lr = custom_logreg(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_lr, shown as the cell output.
confusion_matrix(y_test, y_test_lr)

In [None]:
# Train a Naive Bayes classifier via the project helper custom_nb, with the
# same inputs as the other model cells.
# y_test_nb is presumably the test-set prediction vector — confirm in
# basic_functions.
y_train_sm_nb, y_test_nb = custom_nb(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_nb, shown as the cell output.
confusion_matrix(y_test, y_test_nb)

In [None]:
# Train a Random Forest classifier via the project helper custom_rf, with the
# same inputs as the other model cells. y_test_rf is reused later as the
# "Base Forest" reference when the tuned forests are scored.
# y_test_rf is presumably the test-set prediction vector — confirm in
# basic_functions.
y_train_sm_rf, y_test_rf = custom_rf(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_rf, shown as the cell output.
confusion_matrix(y_test, y_test_rf)

In [None]:
# Train a KNN classifier via the project helper custom_knn, with the same
# inputs as the other model cells.
# y_test_knn is presumably the test-set prediction vector — confirm in
# basic_functions.
y_train_sm_knn, y_test_knn = custom_knn(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_knn, shown as the cell output.
confusion_matrix(y_test, y_test_knn)

In [None]:
# Train a support vector machines classifier via the project helper custom_svc,
# with the same inputs as the other model cells.
# y_test_svc is presumably the test-set prediction vector — confirm in
# basic_functions.
y_train_sm_svc, y_test_svc = custom_svc(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_svc, shown as the cell output.
confusion_matrix(y_test, y_test_svc)

In [None]:
# train a stacking classifier
# FIXME(review): this calls custom_knn — the same helper with the same
# arguments as the KNN cell — so y_test_stack comes from a KNN model, not from
# a stacking model, and every "Stack" score derived from it describes KNN.
# Replace the call with the stacking helper from basic_functions (its name is
# not visible in this notebook).
y_train_sm_stack, y_test_stack = custom_knn(X_train_sm_sc, X_test_sc, y_train_sm, y_test)
# Confusion matrix of true test labels vs. y_test_stack, shown as the cell output.
confusion_matrix(y_test, y_test_stack)

In [None]:
############################
#       Show scores        #
############################

# F1-score and Matthews correlation coefficient for every trained model.
# Each entry pairs the printed header with the model's y_test_* vector; the
# metrics are computed against y_test, i.e. on the held-out split, even though
# the headers are worded "Cross validation scores".
score_sections = [
    ('Cross validation scores Logistic Regression:', y_test_lr),
    ('Cross validation scores Random Forest:', y_test_rf),
    ('Cross validation scores KNN:', y_test_knn),
    ('Cross validation scores SVM:', y_test_svc),
    ('Cross validation scores Naive Bayes:', y_test_nb),
    ('Cross validation scores Stack:', y_test_stack),
]

for header, y_pred in score_sections:
    print(header)
    print('-------------------------')
    print("F1-score: {:.2f}".format(f1_score(y_test, y_pred)))
    print("MCC: {:.2f}".format(matthews_corrcoef(y_test, y_pred)))

In [None]:
########################################################
#     Random search for Random Forest classifier       #
########################################################
# RandomForestClassifier, RandomizedSearchCV and numpy (np) come from the
# import cell at the top of the notebook.

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is intentionally not offered: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and is rejected by current
# releases, which would turn every candidate using it into a failed fit.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node (integers must be >= 2)
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters.
# The base model is seeded so that repeated runs build the same forests; the
# search's own random_state only fixes which candidates are sampled.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# NOTE: no `scoring` is given, so candidates are ranked by the estimator's
# default score (accuracy), not by the F1/MCC values reported elsewhere.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model on the resampled, scaled training data
rf_random.fit(X_train_sm_sc, y_train_sm)


In [None]:
###################################################
#     Get and show scores for random search       #
###################################################
# Predict on the scaled test features with the best estimator found by the
# random search, then print F1-score and MCC next to the base forest's values.
best_random = rf_random.best_estimator_
y_test_rfrand = best_random.predict(X_test_sc)

# (printed header, prediction vector) for the base and the tuned forest;
# both are scored against y_test.
forest_sections = (
    ('Cross validation scores Base Forest:', y_test_rf),
    ('Cross validation scores Random search Random Forest:', y_test_rfrand),
)

for header, y_pred in forest_sections:
    print(header)
    print('-------------------------')
    print("F1-score: {:.2f}".format(f1_score(y_test, y_pred)))
    print("MCC: {:.2f}".format(matthews_corrcoef(y_test, y_pred)))

In [None]:
# Best hyperparameter combination found by the random search, shown as the
# cell output (best_params_ of the fitted RandomizedSearchCV object).
rf_random.best_params_

In [None]:
###################################################
#     Grid search for Random Forest Classifier    #
###################################################

# Create the parameter grid based on the results of random search.
# min_samples_split starts at 2: scikit-learn requires an integer value of at
# least 2, so a 1 in this list makes every fit that uses it fail and be scored
# as NaN (a quarter of all fits when it is one of four values).
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [15, 20, 25],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [1700, 1800, 1900]
}
# Create a base model; seeded so repeated runs build the same forests
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model: exhaustive search over param_grid with
# 3 fold cross validation on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit on the resampled, scaled training data
grid_search.fit(X_train_sm_sc, y_train_sm)


In [None]:
###################################################
#     Get and show scores for grid search         #
###################################################
# Predict on the scaled test features with the best estimator found by the
# grid search, then print F1-score and MCC for the base forest, the random
# search forest and the grid search forest, followed by both searches' best
# parameter sets.
best_grid = grid_search.best_estimator_
y_test_rfgrid = best_grid.predict(X_test_sc)

# (printed header, prediction vector); every vector is scored against y_test.
forest_sections = (
    ('Cross validation scores Base Forest:', y_test_rf),
    ('Cross validation scores Random search Random Forest:', y_test_rfrand),
    ('Cross validation scores Grid search Random Forest:', y_test_rfgrid),
)

for header, y_pred in forest_sections:
    print(header)
    print('-------------------------')
    print("F1-score: {:.2f}".format(f1_score(y_test, y_pred)))
    print("MCC: {:.2f}".format(matthews_corrcoef(y_test, y_pred)))

# (printed header, best parameter dict) for the two searches
best_param_sections = (
    ('Best parameters random search:', rf_random.best_params_),
    ('Best parameters grid search:', grid_search.best_params_),
)

for header, best_params in best_param_sections:
    print(header)
    print('-------------------------')
    print(best_params)



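A warning message emitted by the grid search:
> 54 fits failed out of a total of 216.
> The score on these train-test partitions for these parameters will be set to nan.
> If these failures are not expected, you can try to debug them by setting error_score='raise'.

The run that produced this warning fitted 72 parameter combinations (2 × 3 × 4 × 3) on 3 folds, i.e. 216 fits. The 54 failures are exactly the 18 combinations that share one of the four `min_samples_split` values, times 3 folds: scikit-learn requires an integer `min_samples_split` to be at least 2, so a value of 1 in the grid makes every fit that uses it fail. Was the warning produced during the random search, too? That search only draws `min_samples_split` from 2, 5 and 10, so this particular failure cannot occur there.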
