Hult International Business School 
Assignment: Individual Classification Assignment 
Student : Mosiuwa Tshabalala 
Subject : Machine Learning 😊

In [1]:
#importing the relevant packages for data science essentials, modeling and tuning
import pandas as pd
import numpy as np    
from sklearn.model_selection import RandomizedSearchCV                   
from sklearn.model_selection import train_test_split                   
from sklearn.model_selection import GridSearchCV             
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler     
from sklearn.metrics import confusion_matrix        
from sklearn.metrics import roc_auc_score          
from sklearn.metrics import make_scorer

# setting pandas print options so wide frames are shown without truncation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

# workbook holding the raw customer data (path is relative to the notebook)
file = 'Apprentice_Chef_Dataset.xlsx'

# The model results workbook is intentionally NOT read here: it is an output
# of this notebook (written once the first model has been scored), so loading
# it up front fails on a clean run, and the loaded value was never used
# before `model_performance` gets rebuilt from scratch.

# loading the raw dataset into a DataFrame
ac_dataset = pd.read_excel(file)

In [2]:
# Candidate feature sets, keyed by a short model label.
# Each value is the ordered list of explanatory column names for that model.
ac_logdict = {

    # full model: every explanatory column used by the classifiers
    'ac_logfull': [
        'REVENUE',
        'TOTAL_MEALS_ORDERED',
        'UNIQUE_MEALS_PURCH',
        'CONTACTS_W_CUSTOMER_SERVICE',
        'PRODUCT_CATEGORIES_VIEWED',
        'AVG_TIME_PER_SITE_VISIT',
        'MOBILE_NUMBER',
        'CANCELLATIONS_BEFORE_NOON',
        'CANCELLATIONS_AFTER_NOON',
        'TASTES_AND_PREFERENCES',
        'PC_LOGINS',
        'MOBILE_LOGINS',
        'WEEKLY_PLAN',
        'EARLY_DELIVERIES',
        'LATE_DELIVERIES',
        'PACKAGE_LOCKER',
        'REFRIGERATED_LOCKER',
        'AVG_PREP_VID_TIME',
        'LARGEST_ORDER_SIZE',
        'MASTER_CLASSES_ATTENDED',
        'MEDIAN_MEAL_RATING',
        'AVG_CLICKS_PER_VISIT',
        'TOTAL_PHOTOS_VIEWED',
    ],
}


In [3]:
# Removing the free-text identity columns, then separating the explanatory
# features (the 'ac_logfull' column list) from the response variable.
ac_datadrop = ac_dataset.drop(columns = ['NAME', 'EMAIL', 'FIRST_NAME', 'FAMILY_NAME'])
ac_data     = ac_datadrop[ac_logdict['ac_logfull']]
ac_target   = ac_dataset['CROSS_SELL_SUCCESS']

# Standardising the features. Note that the scaler is fit on every row of
# ac_data, i.e. before the train/test split below.
scaler   = StandardScaler()
x_scaled = scaler.fit_transform(ac_data)

# 75/25 split, stratified on the response so both partitions keep the same
# class balance; the fixed seed makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    ac_target,
    test_size    = 0.25,
    random_state = 219,
    stratify     = ac_target,
)

In [4]:
# Hyperparameter grid for the decision tree. Every entry holds a single
# value, so the search space contains exactly one candidate combination.
param_grid = {'max_depth'        : [8],
              'min_samples_leaf' : [1],
              'splitter'         : ['random'],
              'criterion'        : ['gini']}


# Base estimator; the grid above supplies the remaining hyperparameters.
ac_dtmodel = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object (3-fold cross-validation).
# n_iter is far larger than the one-combination grid; scikit-learn warns and
# only evaluates the combinations that actually exist.
model_tuned = RandomizedSearchCV( estimator         = ac_dtmodel,
                                param_distributions = param_grid,
                                cv                  = 3,
                                n_iter              = 1000,
                                random_state        = 219,
                                )

# FITTING on the TRAINING partition only.
# Cross-validation happens inside the training rows; the held-out test rows
# must stay unseen by the (refit) model, otherwise the testing accuracy, AUC
# and confusion matrix computed from x_test are optimistically biased.
model_fin = model_tuned.fit(x_train, y_train)

# PREDICTING on the testing set: a row is labelled positive (1) when its
# predicted probability for class 1 is at least 0.59, instead of the default
# 0.50 cut-off used by .predict() / .score().
model_fit_pred = (model_fin.predict_proba(x_test)[:,1]>=0.59).astype(int)



In [5]:
# Scoring the tuned decision tree.
# The builtin round() is used instead of the .round() method so the code works
# whether the metric comes back as a NumPy scalar or as a plain Python float.
# NOTE(review): .score() labels rows with the default 0.50 cut-off, whereas
# model_fit_pred (used for AUC and the confusion matrix) uses 0.59.
tuned_dt_train_acc = round(model_fin.score(x_train, y_train), 4)
tuned_dt_test_acc  = round(model_fin.score(x_test, y_test), 4)

# NOTE(review): AUC is computed from the thresholded 0/1 labels, not from the
# predicted probabilities - confirm this is the intended definition.
tuned_dt_auc       = round(roc_auc_score(y_true  = y_test,
                                         y_score = model_fit_pred), 4)

# Unpacking the 2x2 confusion matrix in row-major order:
# true negatives, false positives, false negatives, true positives.
dt_tn, \
dt_fp, \
dt_fn, \
dt_tp = confusion_matrix(y_true = y_test, y_pred = model_fit_pred).ravel()

# creating a dictionary for model results (one list element per model row)
model_performance = {
    
    'Model Name'    : ['Decision Tree'],
           
    'AUC Score' : [tuned_dt_auc],
    
    'Training Accuracy' : [tuned_dt_train_acc],
           
    'Testing Accuracy'  : [tuned_dt_test_acc],

    'Confusion Matrix'  : [(dt_tn, dt_fp, dt_fn,dt_tp)]}

# converting model_performance into a DataFrame
model_performance_df = pd.DataFrame(model_performance)


# sending model results to Excel (the ./model_results folder must already exist)
model_performance_df.to_excel('./model_results/classification_model_performance.xlsx',
                           index = False)

In [6]:
# Hyperparameter grid for the random forest.
param_grid = {
    'bootstrap': [False],
    'criterion': ['entropy'],
    # A depth of 0 is rejected by scikit-learn, so every fit using it failed.
    # None (no depth limit) is presumably what was meant - TODO confirm.
    'max_depth': [None, 8],
    # 'auto' meant 'sqrt' for classifiers and has been removed from newer
    # scikit-learn releases, so the explicit spelling is used.
    'max_features' : ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12,20],
    'n_estimators': [100, 200, 300, 1000]
    }

# Base estimator; seeded (same seed as the rest of the notebook) so the
# search and the reported scores are repeatable between runs.
ac_rfmodel = RandomForestClassifier(random_state = 219)

# Exhaustive search over the grid with 3-fold cross-validation on all cores.
ac_grid = GridSearchCV(estimator = ac_rfmodel, 
                       param_grid = param_grid, 
                       cv = 3,
                       n_jobs = -1)

# FITTING on the TRAINING partition only.
# Cross-validation happens inside the training rows; fitting on the full
# dataset would let the refit model see the test rows and inflate every
# metric computed from x_test below.
ac_grid_fit = ac_grid.fit(x_train, y_train)


# PREDICTING on the testing set: a row is labelled positive (1) when its
# predicted probability for class 1 is at least 0.59.
model_pred = (ac_grid_fit.predict_proba(x_test)[:,1]>=0.59).astype(int)

# Declaring model performance objects.
# The builtin round() works for both NumPy scalars and plain Python floats.
# NOTE(review): .score() uses the default 0.50 cut-off, while model_pred
# (AUC and confusion matrix) uses 0.59; AUC is computed from 0/1 labels.
tuned_rf_train_acc = round(ac_grid_fit.score(x_train, y_train), 4)
tuned_rf_test_acc  = round(ac_grid_fit.score(x_test, y_test), 4)
tuned_rf_auc       = round(roc_auc_score(y_true  = y_test,
                                         y_score = model_pred), 4)

# Confusion matrix in row-major order: TN, FP, FN, TP.
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = model_pred).ravel()

# One-row frame for the random forest; the tuple is wrapped in a list so it
# is stored as a single cell rather than expanded into four rows.
rf_performance_df = pd.DataFrame(
                          {'Model Name'         : ['Random Forest'],
                           'Training Accuracy'  : [tuned_rf_train_acc],
                           'Testing Accuracy'   : [tuned_rf_test_acc],
                           'AUC Score'          : [tuned_rf_auc],
                           'Confusion Matrix'   : [(rf_tn,
                                                    rf_fp,
                                                    rf_fn,
                                                    rf_tp)]
                          })

# Appending to the decision tree results. pd.concat replaces
# DataFrame.append, which no longer exists in current pandas.
model_performance = pd.concat([model_performance_df, rf_performance_df],
                              ignore_index = True)

# Sending model results to Excel. The combined table is written, so the
# saved workbook contains the Random Forest row as well as the Decision Tree.
model_performance.to_excel('./model_results/classification_model_performance.xlsx',
                           index = False)


In [2]:
# checking the results
# Printing the written justification for the chosen model. The text is
# assembled from adjacent string literals, one per output line, so the line
# breaks (and the trailing spaces before them) are explicit.
print(
    'Based on the model performance I have decided to choose the Random Forest Model as my most \n'
    'optimal model. This model shows the best balance between specifity and recall & the difference between \n'
    'false negatives and true negatives is large showing that our model classifications are not left to chance. \n'
    'Although the Decision Tree classified model has larger training and testing scores the Random forest \n'
    'has the best distribution of the Confusion Matrix and hence a higher AUC Score\n'
    'FINAL MODEL: RANDOM FOREST'
)

# Bare last expression so the notebook renders the comparison table.
# model_performance is created by the earlier modelling cells, so those must
# have been run in this kernel session first.
model_performance

Based on the model performance I have decided to choose the Random Forest Model as my most 
optimal model. This model shows the best balance between specifity and recall & the difference between 
false negatives and true negatives is large showing that our model classifications are not left to chance. 
Although the Decision Tree classified model has larger training and testing scores the Random forest 
has the best distribution of the Confusion Matrix and hence a higher AUC Score
FINAL MODEL: RANDOM FOREST


NameError: name 'model_performance' is not defined