In [1]:
#%%time
""" 
    Apprentice Chef Case
    Regression Model Building for DAT-5304 Machine Learning.
    The course is part of the MS. in Business Analytics at HULT International
    Business School.
    
    Author: Maximilian Paulus
    Submission Date: February 5th, 2020"""

""" 
DocString:
    a) Introduction:
    This Document contains a streamlined version of the exploratory data analysis,
    feature engineering, model building, selection and tuning, that was performed
    within the scope of predicting cross sales success from Apprentice Chef Customers.
    
    b) Known Errors or Bugs:
        -
"""
# Importing Required Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf # linear regression (statsmodels)
import sklearn.linear_model # linear models
from scipy.stats import randint
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LinearRegression # linear regression (scikit-learn)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.linear_model import RidgeClassifier  # ridge classifier
from sklearn.linear_model import Perceptron  # Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier  # Passive Aggressive Classifier
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.preprocessing import StandardScaler     # standard scaler

# CART model packages
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import export_graphviz             # exports graphics
from sklearn.externals.six import StringIO           # saves objects in memory
from IPython.display import Image                    # displays on frontend
import pydotplus                                     # interprets dot objects


# new packages
from sklearn.model_selection import GridSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer              # customizable scorer

########################################
# display_tree
########################################
def display_tree(tree, feature_df, height = 500, width = 800, export = False):
    """
    PARAMETERS
    ----------
    tree       : fitted tree model object
        fitted CART model to visualized
    feature_df : DataFrame
        DataFrame of explanatory features (used to generate labels)
    height     : int, default 500
        height in pixels to which to constrain image in html
    width      : int, default 800
        width in pixels to which to constrain image in html
    export     : bool, defalut False
        whether or not to export the tree as a .png file
    """

    # visualizing the tree
    dot_data = StringIO()

    
    # exporting tree to graphviz
    export_graphviz(decision_tree      = tree,
                    out_file           = dot_data,
                    filled             = True,
                    rounded            = True,
                    special_characters = True,
                    feature_names      = feature_df.columns)


    # declaring a graph object
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


    # creating image
    img = Image(graph.create_png(),
                height = height,
                width  = width,
                unconfined = True)


    return img


########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model.
    
    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """
    
    # declaring the number
    n_features = X_train_tree.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(25,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(pd.np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export == True:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')
        
########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
Creates a visualization of a confusion matrix.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : , default None
    """
    # visualizing the confusion matrix

    # setting labels
    lbls = labels
    

    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()
    
    
########################################
# optimal_neighbors
########################################
def optimal_neighbors(X_data,
                      y_data,
                      standardize = True,
                      pct_test=0.25,
                      seed=802,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Outputs the maximum test score and (by default) a
visualization of the results.
PARAMETERS
----------
X_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the X data, default True
pct_test      : test size for training and validation from (0,1), default 0.25
seed          : random seed to be used in algorithm, default 802
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or surpress k-neigbors visualization, default True
"""    
    
    
    if standardize == True:
        # optionally standardizing X_data
        scaler             = StandardScaler()
        scaler.fit(X_data)
        X_scaled           = scaler.transform(X_data)
        X_scaled_df        = pd.DataFrame(X_scaled)
        X_data             = X_scaled_df



    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
            clf.fit(X_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)
            clf.fit(X_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(X_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(X_test, y_test))


    # optionally displaying visualization
    if show_viz == True:
        # plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
    
    
    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy))+1}")
    return test_accuracy.index(max(test_accuracy))+1



In [2]:
# Importing Apprentice Chef Dataset
filename = 'Apprentice_Chef_Dataset.xlsx'

chef_df = pd.read_excel(filename)




In [3]:
#chef_df.loc[:, :].quantile([0.20,
#                           0.40,
#                           0.60,
#                           0.80,
#                           1.00])

In [4]:
#chef_df['AVG_ORDER_REV'].describe().round(2)

In [5]:
#chef_df.info()

Checking for missing values and imputing missing family names with 'n/a'

In [6]:

# Checking for missing values, creating missing value flag and imputing missing family names
# chef_df.isna().sum()

if chef_df['FAMILY_NAME'].isnull().astype(int).sum() > 0:
    chef_df['MISSING_FAMILY_NAME'] = chef_df['FAMILY_NAME'].isnull().astype(int)
fill = 'n/a'
chef_df['FAMILY_NAME'] = chef_df['FAMILY_NAME'].fillna(fill)

The target variable for the subsequent analysis is 'CROSS_SELL_SUCCESS'. Looking at the value counts checks for class imbalance (imbalance between success and failure in target variable). The classes in this case are 0 and 1, representing cross sell success and not success

In [7]:
chef_df['CROSS_SELL_SUCCESS'].value_counts()

1    1321
0     625
Name: CROSS_SELL_SUCCESS, dtype: int64

Given the value counts of 1321 and 625 there is a slight class imbalance underlying that requires stratification of samples when splitting the dataset

Splitting email adresses into address and domain section to classify domains as personal, professional or spam

In [8]:

# Splitting Email Addresses into address and domain
# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in chef_df.iterrows():
    
    # splitting email domain at '@'
    split_email = chef_df.loc[index, 'EMAIL'].split(sep = '@')
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# converting placeholder_lst into a DataFrame 
email_df = pd.DataFrame(placeholder_lst)
email_df.columns = ['EMAIL_ADDRESS', 'DOMAIN']
chef_df['EMAIL_ADDRESS'] = email_df['EMAIL_ADDRESS']
chef_df['DOMAIN'] = email_df['DOMAIN']

# displaying the results
# chef_df

# Classyfing Domains into personal and non-personal

personal_mail_list = ['gmail.com', 'yahoo.com', 'protonmail.com']
junk_mail_list = ['me.com','aol.com','hotmail.com','live.com','msn.com','passport.com']

technology_domain_list = ['apple.com','ibm.com','microsoft.com','verizon.com',
                          'unitedtech.com','cisco.com','intel.com']
financial_domain_list = ['amex.com','travellers.com','visa.com','jpmorgan.com'
                         ,'goldmansacs.com']

############################################
# The industry specific domain lists are extracted from 
# clearbit enrichment data on the existing professional domains
# https://docs.google.com/spreadsheets/d/1erIdqoy60JwLAnpb91EfoJV5YrXDnbwSaA-aqcBlw48/edit#gid=1561611259
# The industry naming is changed for convenience purposes
############################################


# looping over the domain column to identify personal domains
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'DOMAIN'] in personal_mail_list:
        chef_df.loc[index, 'IS_PERSONAL'] = int(1)
    else:
        chef_df.loc[index, 'IS_PERSONAL'] = int(0)
chef_df['IS_PERSONAL'] = chef_df['IS_PERSONAL'].astype('int64')       

# looping over the domain column to identify spam / junk domains
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'DOMAIN'] in junk_mail_list:
        chef_df.loc[index, 'IS_SPAM'] = int(1)
    else:
        chef_df.loc[index, 'IS_SPAM'] = int(0)
chef_df['IS_SPAM'] = chef_df['IS_SPAM'].astype('int64')  

# looping over the domain column to classify domains that are not junk or personal as professional
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'DOMAIN'] not in junk_mail_list and chef_df.loc[index, 'DOMAIN'] not in personal_mail_list:
        chef_df.loc[index, 'IS_PROFESSIONAL'] = int(1)
    else:
        chef_df.loc[index, 'IS_PROFESSIONAL'] = int(0)
chef_df['IS_PROFESSIONAL'] = chef_df['IS_PROFESSIONAL'].astype('int64')         
  
    
# looping over the domain column to classify domains from financial industry
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'DOMAIN'] in financial_domain_list:
        chef_df.loc[index, 'IS_FINANCIAL'] = int(1)
    else:
        chef_df.loc[index, 'IS_FINANCIAL'] = int(0)
chef_df['IS_FINANCIAL'] = chef_df['IS_FINANCIAL'].astype('int64')         
        
# looping over the domain column to classify domains from tech industry
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'DOMAIN'] in technology_domain_list:
        chef_df.loc[index, 'IS_TECH'] = int(1)
    else:
        chef_df.loc[index, 'IS_TECH'] = int(0)
chef_df['IS_TECH'] = chef_df['IS_TECH'].astype('int64')         
            
    
    
# Checking the profesional domains for wrongly categorized domains
#chef_df['DOMAIN'][chef_df['IS_PROFESSIONAL'] == 1].value_counts()




Visual EDA (From Assignment 1). We take the same outlier thresholds for the explanatory variables that we defined in Assignment 1



In [9]:
##############################################################################
# Visual EDA (Histograms)
########################
# Setting Outlier Thresholds

total_meals_hi = 150
total_meals_lo = 0
unique_meals_hi = 10
customer_service_hi = 8
avg_time_hi = 200
cancellations_before_hi = 5
weekly_plan_hi = 20
prep_vid_time_hi = 250
late_deliveries_hi = 7
early_deliveries_hi = 1
master_classes_hi = 2
avg_clicks_lo = 8
avg_clicks_hi = 19
total_photos_hi = 1


#fig, ax = plt.subplots(figsize = (10, 8))
#plt.subplot(2, 2, 1)
#sns.distplot(chef_df['TOTAL_MEALS_ORDERED'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'orange')
#plt.xlabel('TOTAL_MEALS_ORDERED')
#plt.axvline(x = total_meals_hi)
#plt.axvline(x = total_meals_lo)

########################

#plt.subplot(2, 2, 2)
#sns.distplot(chef_df['UNIQUE_MEALS_PURCH'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'r')
#plt.xlabel('UNIQUE_MEALS_PURCH')
#plt.axvline(x = unique_meals_hi)

########################

#plt.subplot(2, 2, 3)
#sns.distplot(chef_df['CONTACTS_W_CUSTOMER_SERVICE'],
#             bins  = 'fd',
#             color = 'g')
#plt.xlabel('CONTACTS_W_CUSTOMER_SERVICE')
#plt.axvline(x = customer_service_hi)

########################
#plt.subplot(2, 2, 4)
#sns.distplot(chef_df['AVG_TIME_PER_SITE_VISIT'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'orange')
#plt.axvline(x = avg_time_hi)
#plt.xlabel('AVG_TIME_PER_SITE_VISIT')
#plt.tight_layout()
#plt.savefig('Apprentice Chef Final Histograms 1 of 3.png')
#plt.show()

########################
########################

#fig, ax = plt.subplots(figsize = (10, 8))
#plt.subplot(2, 2, 1)
#sns.distplot(chef_df['CANCELLATIONS_BEFORE_NOON'],
#             bins  = 'fd',
#             color = 'y')
#plt.xlabel('CANCELLATIONS_BEFORE_NOON')
#plt.axvline(x = cancellations_before_hi)

########################

#plt.subplot(2, 2, 2)
#sns.distplot(chef_df['WEEKLY_PLAN'],
#             bins  = 'fd',
#             color = 'y')
#plt.xlabel('WEEKLY_PLAN')
#plt.axvline(x = weekly_plan_hi)

########################

#plt.subplot(2, 2, 3)
#sns.distplot(chef_df['AVG_PREP_VID_TIME'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'r')
#plt.xlabel('AVG_PREP_VID_TIME')
#plt.axvline(x = prep_vid_time_hi)

########################

#plt.subplot(2, 2, 4)
#sns.distplot(chef_df['EARLY_DELIVERIES'],
#             bins = 10,
#             kde  = False,
#             rug  = True,
#             color = 'orange')
#plt.xlabel('EARLY_DELIVERIES')
#plt.axvline(x = early_deliveries_hi)
#plt.tight_layout()
#plt.savefig('Apprentice Chef Final Histograms 2 of 3.png')
#plt.show()

########################
########################

#fig, ax = plt.subplots(figsize = (10, 8))
#plt.subplot(2, 2, 1)
#sns.distplot(chef_df['LATE_DELIVERIES'],
#             bins  = 'fd',
#             color = 'g')
#plt.xlabel('LATE_DELIVERIES')
#plt.axvline(x = late_deliveries_hi)

########################

#plt.subplot(2, 2, 2)
#sns.distplot(chef_df['MASTER_CLASSES_ATTENDED'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'r')
#plt.xlabel('MASTER_CLASSES_ATTENDED')
#plt.axvline(x = master_classes_hi)

########################

#plt.subplot(2, 2, 3)
#sns.distplot(chef_df['AVG_CLICKS_PER_VISIT'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'r')
#plt.xlabel('AVG_CLICKS_PER_VISIT')
#plt.axvline(x = avg_clicks_hi)
#plt.axvline(x = avg_clicks_lo)

########################

#plt.subplot(2, 2, 4)
#sns.distplot(chef_df['TOTAL_PHOTOS_VIEWED'],
#             bins  = 'fd',
#             kde   = False,
#             rug   = True,
#             color = 'r')
#plt.xlabel('TOTAL_PHOTOS_VIEWED')
#plt.axvline(x = total_photos_hi)
#plt.tight_layout()
#plt.savefig('Apprentice Chef Final Histograms 3 of 3.png')
#plt.show()


##############################################################################
# developing features (columns) for outliers

# Total Meals
chef_df['OUT_TOTAL_MEALS_ORDERED'] = 0
condition_hi = chef_df.loc[0:,'OUT_TOTAL_MEALS_ORDERED'][chef_df['TOTAL_MEALS_ORDERED'] > total_meals_hi]
condition_lo = chef_df.loc[0:,'OUT_TOTAL_MEALS_ORDERED'][chef_df['TOTAL_MEALS_ORDERED'] <= total_meals_lo]

chef_df['OUT_TOTAL_MEALS_ORDERED'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

chef_df['OUT_TOTAL_MEALS_ORDERED'].replace(to_replace = condition_lo,
                                value      = 1,
                                inplace    = True)
# Unique Meals
chef_df['OUT_UNIQUE_MEALS_PURCH'] = 0
condition_hi = chef_df.loc[0:,'OUT_UNIQUE_MEALS_PURCH'][chef_df['UNIQUE_MEALS_PURCH'] > unique_meals_hi]

chef_df['OUT_UNIQUE_MEALS_PURCH'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Customer Service
chef_df['OUT_CONTACTS_W_CUSTOMER_SERVICE'] = 0
condition_hi = chef_df.loc[0:,'OUT_CONTACTS_W_CUSTOMER_SERVICE'][chef_df['CONTACTS_W_CUSTOMER_SERVICE'] > customer_service_hi]

chef_df['OUT_CONTACTS_W_CUSTOMER_SERVICE'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Cancellations before Noon
chef_df['OUT_CANCELLATIONS_BEFORE_NOON'] = 0
condition_hi = chef_df.loc[0:,'OUT_CANCELLATIONS_BEFORE_NOON'][chef_df['CANCELLATIONS_BEFORE_NOON'] > cancellations_before_hi]

chef_df['OUT_CANCELLATIONS_BEFORE_NOON'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Weekly Plan
chef_df['OUT_WEEKLY_PLAN'] = 0
condition_hi = chef_df.loc[0:,'OUT_WEEKLY_PLAN'][chef_df['WEEKLY_PLAN'] > weekly_plan_hi]

chef_df['OUT_WEEKLY_PLAN'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Prep Vid Time
chef_df['OUT_AVG_PREP_VID_TIME'] = 0
condition_hi = chef_df.loc[0:,'OUT_AVG_PREP_VID_TIME'][chef_df['AVG_PREP_VID_TIME'] > prep_vid_time_hi]

chef_df['OUT_AVG_PREP_VID_TIME'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Late Deliveries
chef_df['OUT_LATE_DELIVERIES'] = 0
condition_hi = chef_df.loc[0:,'OUT_LATE_DELIVERIES'][chef_df['LATE_DELIVERIES'] > late_deliveries_hi]

chef_df['OUT_LATE_DELIVERIES'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)
# Early Deliveries
chef_df['OUT_EARLY_DELIVERIES'] = 0
condition_hi = chef_df.loc[0:,'OUT_EARLY_DELIVERIES'][chef_df['EARLY_DELIVERIES'] > early_deliveries_hi]

chef_df['OUT_EARLY_DELIVERIES'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Avg Time
chef_df['OUT_AVG_TIME_PER_SITE_VISIT'] = 0
condition_hi = chef_df.loc[0:,'OUT_AVG_TIME_PER_SITE_VISIT'][chef_df['AVG_TIME_PER_SITE_VISIT'] > avg_time_hi]

chef_df['OUT_AVG_TIME_PER_SITE_VISIT'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Master Classes
chef_df['OUT_MASTER_CLASSES_ATTENDED'] = 0
condition_hi = chef_df.loc[0:,'OUT_MASTER_CLASSES_ATTENDED'][chef_df['MASTER_CLASSES_ATTENDED'] > master_classes_hi]

chef_df['OUT_MASTER_CLASSES_ATTENDED'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)

# Average Clicks
chef_df['OUT_AVG_CLICKS_PER_VISIT'] = 0
condition_hi = chef_df.loc[0:,'OUT_AVG_CLICKS_PER_VISIT'][chef_df['AVG_CLICKS_PER_VISIT'] > avg_clicks_hi]
condition_lo = chef_df.loc[0:,'OUT_AVG_CLICKS_PER_VISIT'][chef_df['AVG_CLICKS_PER_VISIT'] < avg_clicks_lo]


chef_df['OUT_AVG_CLICKS_PER_VISIT'].replace(to_replace = condition_hi,
                                value      = 1,
                                inplace    = True)
#chef_df['OUT_AVG_CLICKS_PER_VISIT'].replace(to_replace = condition_lo,
#                                value      = 1,
#                                inplace    = True)

# Total Photos
#chef_df['OUT_TOTAL_PHOTOS_VIEWED'] = 0
#condition_hi = chef_df.loc[0:,'OUT_TOTAL_PHOTOS_VIEWED'][chef_df['TOTAL_PHOTOS_VIEWED'] > total_photos_hi]

#chef_df['OUT_TOTAL_PHOTOS_VIEWED'].replace(to_replace = condition_hi,
#                                value      = 1,
#                                inplace    = True)

Calculating fields based on existing features. The following new features could be interesting to look at:
    
    1. Percentage of early deliveries (early deliveries / total deliveries)
    2. Percentage of late deliveries (late deliveries / total deliveries)
    3. Did the customer ever give a rating? (median_rating > 0)
    4. Percentage of meals from weekly plan (assuming basic weekly plan = 3 meals)
    5. Did customer attend cooking class? (0 and 1 encoding)
    6. Average meals per month
    7. Average revenue per meal
    8. Flag Follower customers (people that follow 35 % or more of their meal recommendations
    9. Likely ordered drinks
    
    

In [10]:
# Feature 1: percentage of early deliveries
for index, col in chef_df.iterrows():
    chef_df.loc[index,'PCT_EARLY_DELIVERIES'] = chef_df.loc[index,'EARLY_DELIVERIES'] / chef_df.loc[index,'TOTAL_MEALS_ORDERED']

    
# Feature 2:
for index, col in chef_df.iterrows():
    chef_df.loc[index,'PCT_LATE_DELIVERIES'] = chef_df.loc[index,'LATE_DELIVERIES'] / chef_df.loc[index,'TOTAL_MEALS_ORDERED']

    
# Feature 3: did the customer ever give a rating?
#for index, col in chef_df.iterrows():
#    if chef_df.loc[index, 'MEDIAN_MEAL_RATING'] > 0:
#        chef_df.loc[index, 'EVER_RATED'] = int(1)
#    else:
#        chef_df.loc[index, 'EVER_RATED'] = int(0)


# Feature 4: percentage of meals from weekly plan
for index, col in chef_df.iterrows():
    chef_df.loc[index,'PCT_WEEKLY_PLAN'] = chef_df.loc[index,'WEEKLY_PLAN']*3 / chef_df.loc[index,'TOTAL_MEALS_ORDERED']


# Feature 5: did the customer attend cooking class?
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'MASTER_CLASSES_ATTENDED'] > 0:
        chef_df.loc[index, 'ATTENDED_CLASS'] = int(1)
    else:
        chef_df.loc[index, 'ATTENDED_CLASS'] = int(0)
    
chef_df['ATTENDED_CLASS'] = chef_df['ATTENDED_CLASS'].astype('int64')  


# Feature 6: average meals per month
for index, col in chef_df.iterrows():
    chef_df.loc[index,'AVG_MEALS_MONTH'] = chef_df.loc[index,'TOTAL_MEALS_ORDERED'] / 12
    
# Feature 7: avg price per order
for index, col in chef_df.iterrows():
    chef_df.loc[index,'AVG_ORDER_REV'] = chef_df.loc[index,'REVENUE'] / chef_df.loc[index,'TOTAL_MEALS_ORDERED']
    
        
# Feature 8: is_follower
for index, col in chef_df.iterrows():
    if chef_df.loc[index, 'FOLLOWED_RECOMMENDATIONS_PCT'] > 35:
        chef_df.loc[index, 'IS_FOLLOWER'] = int(1)
    else:
        chef_df.loc[index, 'IS_FOLLOWER'] = int(0)

In [11]:
# chef_df_corr_cross = chef_df.corr()
# chef_df_corr_cross['CROSS_SELL_SUCCESS'].sort_values(ascending = False)

In [12]:
#for val in chef_data:
#    print(f"{val} +")

The following logistic regression model is the result of multiple iterations. The remaining variables are considered significant are used in the modeling process 

In [13]:
# creating explanatory and target variable for modelling
chef_data_full =  chef_df.loc[:, ['REVENUE', 'TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH',
       'CONTACTS_W_CUSTOMER_SERVICE', 'PRODUCT_CATEGORIES_VIEWED',
       'AVG_TIME_PER_SITE_VISIT', 'MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON',
       'CANCELLATIONS_AFTER_NOON', 'TASTES_AND_PREFERENCES', 'MOBILE_LOGINS',
       'PC_LOGINS', 'WEEKLY_PLAN', 'EARLY_DELIVERIES', 'LATE_DELIVERIES',
       'PACKAGE_LOCKER', 'REFRIGERATED_LOCKER', 'FOLLOWED_RECOMMENDATIONS_PCT',
       'AVG_PREP_VID_TIME', 'LARGEST_ORDER_SIZE', 'MASTER_CLASSES_ATTENDED',
       'MEDIAN_MEAL_RATING', 'AVG_CLICKS_PER_VISIT', 'TOTAL_PHOTOS_VIEWED',
       'MISSING_FAMILY_NAME','IS_SPAM', 'IS_PROFESSIONAL', 'PCT_EARLY_DELIVERIES',
       'PCT_LATE_DELIVERIES', 'PCT_WEEKLY_PLAN','IS_FINANCIAL','IS_TECH',
       'ATTENDED_CLASS', 'AVG_MEALS_MONTH', 'AVG_ORDER_REV',
       'IS_FOLLOWER']]
                              
chef_target_full = chef_df.loc[:,'CROSS_SELL_SUCCESS']
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(chef_data_full, chef_target_full, 
                                                    test_size = 0.25, random_state = 822, 
                                                    stratify = chef_target_full)

In [14]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """ CROSS_SELL_SUCCESS ~
                                    MOBILE_NUMBER +
                                    CANCELLATIONS_BEFORE_NOON +
                                    CANCELLATIONS_AFTER_NOON +
                                    TASTES_AND_PREFERENCES +
                                    PC_LOGINS +
                                    FOLLOWED_RECOMMENDATIONS_PCT +
                                    MISSING_FAMILY_NAME +
                                    IS_SPAM +
                                    IS_PROFESSIONAL + 
                                    IS_TECH
                                    """,
                                     data    = pd.concat([X_train_full, y_train_full], axis = 1))


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary()

Optimization terminated successfully.
         Current function value: 0.443816
         Iterations 7


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1448.0
Method:,MLE,Df Model:,10.0
Date:,"Wed, 05 Feb 2020",Pseudo R-squ.:,0.2932
Time:,23:35:13,Log-Likelihood:,-647.53
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,4.605e-109

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.3404,0.319,-4.201,0.000,-1.966,-0.715
MOBILE_NUMBER,0.7582,0.199,3.817,0.000,0.369,1.148
CANCELLATIONS_BEFORE_NOON,0.2438,0.049,4.960,0.000,0.147,0.340
CANCELLATIONS_AFTER_NOON,-0.2844,0.152,-1.868,0.062,-0.583,0.014
TASTES_AND_PREFERENCES,0.5127,0.148,3.466,0.001,0.223,0.803
PC_LOGINS,-0.4559,0.131,-3.468,0.001,-0.714,-0.198
FOLLOWED_RECOMMENDATIONS_PCT,0.0565,0.004,14.486,0.000,0.049,0.064
MISSING_FAMILY_NAME,1.4252,0.477,2.986,0.003,0.490,2.361
IS_SPAM,-1.2013,0.176,-6.824,0.000,-1.546,-0.856


The results from the logistic regressions are used to classify features as significant.

During iteration, we have seen that the variable "IS_FOLLOWER" has a very high coefficient. IS_FOLLOWER and IS_TECH in combination is specified as variable set "short"

In [15]:
# explanatory sets from last session

# creating a dictionary to store candidate models

feature_dict = {

 # full model
 'logit_full'   : ['REVENUE', 'TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH',
       'CONTACTS_W_CUSTOMER_SERVICE', 'PRODUCT_CATEGORIES_VIEWED',
       'AVG_TIME_PER_SITE_VISIT', 'MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON',
       'CANCELLATIONS_AFTER_NOON', 'TASTES_AND_PREFERENCES', 'MOBILE_LOGINS',
       'PC_LOGINS', 'WEEKLY_PLAN', 'EARLY_DELIVERIES', 'LATE_DELIVERIES',
       'PACKAGE_LOCKER', 'REFRIGERATED_LOCKER', 'FOLLOWED_RECOMMENDATIONS_PCT',
       'AVG_PREP_VID_TIME', 'LARGEST_ORDER_SIZE', 'MASTER_CLASSES_ATTENDED',
       'MEDIAN_MEAL_RATING', 'AVG_CLICKS_PER_VISIT', 'TOTAL_PHOTOS_VIEWED',
       'MISSING_FAMILY_NAME','IS_SPAM', 'IS_PROFESSIONAL', 'PCT_EARLY_DELIVERIES',
       'PCT_LATE_DELIVERIES', 'PCT_WEEKLY_PLAN','IS_FINANCIAL','IS_TECH',
       'ATTENDED_CLASS', 'AVG_MEALS_MONTH', 'AVG_ORDER_REV',
       'IS_FOLLOWER'],
 
 # significant variables only
 'logit_sig'    : ['CANCELLATIONS_BEFORE_NOON', 'CANCELLATIONS_AFTER_NOON',
                   'TASTES_AND_PREFERENCES',
                  'MISSING_FAMILY_NAME','IS_SPAM','IS_PROFESSIONAL','IS_FOLLOWER','IS_TECH'],
    
  # variables selected through feature importance graph (tuned tree)
 'logit_sig_tree' : ['IS_SPAM','IS_FOLLOWER','IS_PROFESSIONAL','CANCELLATIONS_BEFORE_NOON',
                     'AVG_PREP_VID_TIME','MOBILE_NUMBER','PCT_WEEKLY_PLAN','AVG_TIME_PER_SITE_VISIT',
                     'AVG_CLICKS_PER_VISIT','AVG_ORDER_REV']

}

Besides looking at the correlation and significance of individual explanatory variables with our target variable ("CROSS_SALE_SUCCESS") we can gain additional insights from building a classification tree. The splits that are chosen by the tree model can give valuable insights into trends that can be featured out in separate variables.


Using gridsearch, we tune the Hyperparameter of a DecisionTreeClassifier to keep the tree at a reasonable and interpretable size.

In [16]:
# declaring a hyperparameter space

# max_depth_space  = pd.np.arange(1, 10, 1)
# splitter_space = ['best','random']
# min_samples_leaf_space = pd.np.arange(1, 100, 1)
# criterion_space = ['gini','entropy']

# creating a hyperparameter grid
# param_grid = {'max_depth'          : max_depth_space,
#               'min_samples_leaf'   : min_samples_leaf_space,
#               'criterion'          : criterion_space,
#               'splitter'           : splitter_space}



# INSTANTIATING the model object without hyperparameters
# class_tree = DecisionTreeClassifier()



# GridSearchCV object
# class_tree_cv = GridSearchCV(estimator  = class_tree,
#                            param_grid = param_grid,
#                            cv         = 3,
#                            scoring    = make_scorer(roc_auc_score,
#                                                    needs_threshold = False))

# creating explanatory and target variable
# chef_data_tree = chef_df.drop(labels = ['CROSS_SELL_SUCCESS','NAME','FAMILY_NAME','FIRST_NAME','EMAIL','DOMAIN','EMAIL_ADDRESS','IS_FOLLOWER'], axis = 1)
# chef_target_tree = chef_df.loc[:,'CROSS_SELL_SUCCESS']
# X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(chef_data_tree, chef_target_tree, 
#                                                     test_size = 0.25, random_state = 822, 
#                                                     stratify = chef_target_tree)

# FITTING to the FULL DATASET (due to cross-validation)
# class_tree_cv.fit(chef_data_tree, chef_target_tree)



# printing the optimal parameters and best score
# print("Tuned Parameters  :", class_tree_cv.best_params_)
# print("Tuned CV AUC      :", class_tree_cv.best_score_.round(4))





The hyperparameter tuning results in the following optimal parameters: 
'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 37, 'splitter': 'best'

In [17]:
# creating explanatory and target variable
chef_data_tree = chef_df.drop(labels = ['CROSS_SELL_SUCCESS','NAME','FAMILY_NAME','FIRST_NAME','EMAIL','DOMAIN','EMAIL_ADDRESS','IS_FOLLOWER'], axis = 1)
chef_target_tree = chef_df.loc[:,'CROSS_SELL_SUCCESS']
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(chef_data_tree, chef_target_tree, 
                                                    test_size = 0.25, random_state = 822, 
                                                    stratify = chef_target_tree)


# INSTANTIATING a classification tree model with tuned values
class_tree_tuned = DecisionTreeClassifier(criterion = 'entropy',
                                          max_depth = 7,
                                         min_samples_leaf = 37,
                                         splitter = 'best')

class_tree_tuned_fit = class_tree_tuned.fit(X_train_tree, y_train_tree)


# PREDICTING based on the testing set
class_tree_tuned_pred = class_tree_tuned_fit.predict(X_test_tree)



# SCORING the results
print('Training ACCURACY:', class_tree_tuned.score(X_train_tree, y_train_tree).round(4))
print('Testing  ACCURACY:', class_tree_tuned.score(X_test_tree, y_test_tree).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_tree, y_score = class_tree_tuned_pred).round(4))

# displaying the tree
#display_tree(tree       = class_tree_tuned,
#             feature_df = X_train_tree)

Training ACCURACY: 0.8053
Testing  ACCURACY: 0.7988
AUC Score        : 0.7655


On top of the accuracy measurements, we also look into the confusion matrix. 
In our case we need to look at two cases:
    1. We predict a customer to buy "halfway there" but the customer doesn't (false positive)
    2. We predict a customer not to buy "halfway there" but the customer buys "halfway there" (false negative)
    
When launching a new product, we don't want to miss out on sales opportunities and rather offer the product to a couple non interested customers. Hence, it is better to accept a higher number of false positives than false negatives. 


The following model selection will always consider the number of false negatives (number of missed opportunities)



In [18]:
# calling the visual_cm function
#visual_cm(true_y = y_test_tree,
#          pred_y = class_tree_tuned_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

Writing the tree performance into a dataframe that will store all results of subsequent modelling attempts.

In [19]:
performance_df = [['Model', 'Training Accuracy','Testing Accuracy', 'AUC Value',
                      'False Positives',' False Negatives']]
# saving the results
performance_df.append(['Classification Tree Tuned',
                          class_tree_tuned.score(X_train_tree, y_train_tree).round(4),
                          class_tree_tuned.score(X_test_tree, y_test_tree).round(4),
                          roc_auc_score(y_true  = y_test_tree, y_score = class_tree_tuned_pred).round(4),
                         47,
                         51])

performance_df = pd.DataFrame(performance_df)
performance_df.columns = ['Model', 'Training Accuracy','Testing Accuracy', 'AUC Value',
                      'False Positives','False Negatives']
performance_df.drop(index = 0, axis = 0,inplace = True)


# saving the DataFrame to Excel
performance_df.to_excel('Classification Model Performance.xlsx',
                              index = False)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
1,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51


Feature importances can be helpful to select features during the further analysis

In [20]:
feature_importances = pd.concat([pd.DataFrame(X_train_tree.columns),
                                 pd.DataFrame(class_tree_tuned.feature_importances_)],axis = 1)
feature_importances.columns = ['Feature','Importance']
feature_importances[feature_importances['Importance'] > 0].sort_values(by = 'Importance',ascending = False)

Unnamed: 0,Feature,Importance
17,FOLLOWED_RECOMMENDATIONS_PCT,0.816821
26,IS_SPAM,0.067952
27,IS_PROFESSIONAL,0.019042
5,AVG_TIME_PER_SITE_VISIT,0.018515
7,CANCELLATIONS_BEFORE_NOON,0.016505
18,AVG_PREP_VID_TIME,0.012248
43,PCT_WEEKLY_PLAN,0.011104
11,PC_LOGINS,0.009949
14,LATE_DELIVERIES,0.008445
2,UNIQUE_MEALS_PURCH,0.007519


Building actual tree model using the optimal parameters and only significant x variables

In [21]:

chef_data_sig   =  chef_df.loc[ : , feature_dict['logit_sig']]
#chef_data_sig   =  chef_df.loc[ : , feature_dict['logit_sig_tree']]
#chef_data_short   =  chef_df.loc[ : , ['IS_FOLLOWER','IS_TECH']]
chef_target_sig =  chef_df.loc[ : , 'CROSS_SELL_SUCCESS']


# This is the exact code we were using before
X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(
            chef_data_sig,
            chef_target_sig,
            test_size    = 0.25,
            random_state = 802,
            stratify     = chef_target_sig)



# INSTANTIATING a classification tree model with tuned values
class_tree_pruned = DecisionTreeClassifier(criterion = 'entropy',
                                          max_depth = 7,
                                         min_samples_leaf = 37,
                                         splitter = 'best')

class_tree_pruned_fit = class_tree_pruned.fit(X_train_sig,y_train_sig)
# PREDICTING based on the testing set
class_tree_pruned_pred = class_tree_pruned_fit.predict(X_test_sig)



# SCORING the results
print('Training ACCURACY:', class_tree_pruned.score(X_train_sig, y_train_sig).round(4))
print('Testing  ACCURACY:', class_tree_pruned.score(X_test_sig, y_test_sig).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_sig, y_score = class_tree_pruned_pred).round(4))

# displaying the tree
#display_tree(tree       = class_tree_pruned,
#             feature_df = X_train_sig)

Training ACCURACY: 0.7951
Testing  ACCURACY: 0.7885
AUC Score        : 0.7766


In [22]:
# calling the visual_cm function
#visual_cm(true_y = y_test_sig,
#          pred_y = class_tree_pruned_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

In [23]:
training_acc_tree_pruned = class_tree_pruned.score(X_train_sig, y_train_sig).round(4)
testing_acc_tree_pruned = class_tree_pruned.score(X_test_sig, y_test_sig).round(4)
auc_score_tree_pruned = roc_auc_score(y_true  = y_test_sig, y_score = class_tree_pruned_pred).round(4)
false_positives_tree_pruned = 80
false_negatives_tree_pruned = 27

performance_df = performance_df.append(
                          {'Model'             : 'Classification Tree Pruned',
                          'Training Accuracy'  : training_acc_tree_pruned,
                          'Testing Accuracy'   : testing_acc_tree_pruned,
                          'AUC Value'          : auc_score_tree_pruned,
                          'False Positives'    : false_positives_tree_pruned,
                          'False Negatives'    : false_negatives_tree_pruned},
                          ignore_index = True)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
0,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51
1,Classification Tree Pruned,0.7951,0.7885,0.7766,80,27


The list of significant variables is added used to build a logistic regression model with scikit learn

In [24]:

chef_data_sig   =  chef_df.loc[ : , feature_dict['logit_sig']]
#chef_data_sig   =  chef_df.loc[ : , feature_dict['logit_sig_tree']]
#chef_data_short   =  chef_df.loc[ : , ['IS_FOLLOWER','IS_TECH']]
chef_target_sig =  chef_df.loc[ : , 'CROSS_SELL_SUCCESS']


# This is the exact code we were using before
X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(
            chef_data_sig,
            chef_target_sig,
            test_size    = 0.25,
            random_state = 802,
            stratify     = chef_target_sig)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            max_iter = 1000,
                            C = 1,
                            random_state = 802)


# FITTING the training data
logreg_fit = logreg.fit(X_train_sig, y_train_sig)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(X_test_sig)


# SCORING the results
print('Training ACCURACY:', logreg_fit.score(X_train_sig, y_train_sig).round(4))
print('Testing  ACCURACY:', logreg_fit.score(X_test_sig, y_test_sig).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_sig,
                                          y_score = logreg_pred).round(4))

Training ACCURACY: 0.7978
Testing  ACCURACY: 0.7864
AUC Score        : 0.7463


In [25]:
# calling the visual_cm function
#visual_cm(true_y = y_test_sig,
#          pred_y = logreg_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

The confusion matrix shows the following results:

- 53 customers are predicted to buy although they actually didn't. (False Positives)
- 51 Customers are predicted to not buy, although they actually did (False Negatives)


In [26]:
training_acc_logreg_sig = logreg_fit.score(X_train_sig, y_train_sig).round(4)
testing_acc_logreg_sig = logreg_fit.score(X_test_sig, y_test_sig).round(4)
auc_score_logreg_sig = roc_auc_score(y_true  = y_test_sig,
                                          y_score = logreg_pred).round(4)
false_positives_logreg_sig = 53
false_negatives_logreg_sig = 51

performance_df = performance_df.append(
                          {'Model'             : 'Logistic Regression Significant',
                          'Training Accuracy'  : training_acc_logreg_sig,
                          'Testing Accuracy'   : testing_acc_logreg_sig,
                          'AUC Value'          : auc_score_logreg_sig,
                          'False Positives'    : false_positives_logreg_sig,
                          'False Negatives'    : false_negatives_logreg_sig},
                          ignore_index = True)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
0,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51
1,Classification Tree Pruned,0.7951,0.7885,0.7766,80,27
2,Logistic Regression Significant,0.7978,0.7864,0.7463,53,51


The siginificant variables give an acceptable result, not a perfect one however. The false negatives increased which in our case means lost potential coss sales opportunity. Based on our findings in the correlation matrix, combined with the fact that the first split in a tree was IS_FOLLOWER and a value of > 35 % created a homogeneous group of converters, we build the following logistic regression with only 2 variables.
- IS_FOLLOWER
- IS_TECH

In [27]:
# train/test split with the 2 selected variables

chef_data_short   =  chef_df.loc[ : , ['IS_FOLLOWER','IS_TECH']]
chef_target_short =  chef_df.loc[ : , 'CROSS_SELL_SUCCESS']


# This is the exact code we were using before
X_train_short, X_test_short, y_train_short, y_test_short = train_test_split(
            chef_data_short,
            chef_target_short,
            test_size    = 0.25,
            random_state = 802,
            stratify     = chef_target_short)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            max_iter = 1000,
                            C = 1,
                            random_state = 802)


# FITTING the training data
logreg_fit = logreg.fit(X_train_short, y_train_short)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(X_test_short)


# SCORING the results
print('Training ACCURACY:', logreg_fit.score(X_train_short, y_train_short).round(4))
print('Testing  ACCURACY:', logreg_fit.score(X_test_short, y_test_short).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_short,
                                          y_score = logreg_pred).round(4))

Training ACCURACY: 0.7409
Testing  ACCURACY: 0.7413
AUC Score        : 0.8012


In [28]:
# calling the visual_cm function
#visual_cm(true_y = y_test_short,
#          pred_y = logreg_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

In [29]:
training_acc_logreg_short = logreg_fit.score(X_train_short, y_train_short).round(4)
testing_acc_logreg_short = logreg_fit.score(X_test_short, y_test_short).round(4)
auc_score_logreg_short = roc_auc_score(y_true  = y_test_short,
                                          y_score = logreg_pred).round(4)
false_positives_logreg_short = 121
false_negatives_logreg_short = 5

performance_df = performance_df.append(
                          {'Model'             : 'Logistic Regression Short',
                          'Training Accuracy'  : training_acc_logreg_short,
                          'Testing Accuracy'   : testing_acc_logreg_short,
                          'AUC Value'          : auc_score_logreg_short,
                          'False Positives'    : false_positives_logreg_short,
                          'False Negatives'    : false_negatives_logreg_short},
                          ignore_index = True)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
0,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51
1,Classification Tree Pruned,0.7951,0.7885,0.7766,80,27
2,Logistic Regression Significant,0.7978,0.7864,0.7463,53,51
3,Logistic Regression Short,0.7409,0.7413,0.8012,121,5


Tuning the Hyperparameters of the short logistic Regression model

In [30]:
########################################
# GridSearchCV
########################################

# declaring a hyperparameter space
# C_space          = pd.np.arange(0.1, 3.0, 0.1)
# warm_start_space = [True, False]
# 

# creating a hyperparameter grid
# param_grid = {'C'          : C_space,
#               'warm_start' : warm_start_space}
# 

# INSTANTIATING the model object without hyperparameters
# lr_tuned = LogisticRegression(solver = 'lbfgs',
#                              max_iter = 1000,
#                              random_state = 802)


# GridSearchCV object
# lr_tuned_cv = GridSearchCV(estimator  = lr_tuned,
#                            param_grid = param_grid,
#                            cv         = 3,
#                            scoring    = make_scorer(roc_auc_score,
#                                                    needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
# lr_tuned_cv.fit(chef_data_short, chef_target_short)


# PREDICT step is not needed


# printing the optimal parameters and best score
# print("Tuned Parameters  :", lr_tuned_cv.best_params_)
# print("Tuned CV AUC   #    :", lr_tuned_cv.best_score_.round(4))

The output of the Gridsearch is C: 0.7 and warm_start = True. 

In [31]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
# lr_tuned = lr_tuned_cv.best_estimator_


# FIT step is not needed


# PREDICTING based on the testing set
# lr_tuned_pred = lr_tuned.predict(X_test_short)


# SCORING the results
# print('Training ACCURACY:', lr_tuned.score(X_train_short, y_train_short).round(4))
# print('Testing  ACCURACY:', lr_tuned.score(X_test_short, y_test_short).round(4))
# print('AUC Score        :', roc_auc_score(y_true  = y_test_short,
#                                  y_score = lr_tuned_pred).round(4))

In [32]:
# calling the visual_cm function
#visual_cm(true_y = y_test_short,
#          pred_y = lr_tuned_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

Both, the AUC score and the confusion matrix show that there was no improvement of the model based on the tuned Hyperparameters. The AUC score, as well as the false positives and false negatives stay exactly the same.

----------------------------------------

In the following section, we check the performance of additional classification models.

Checking the Performance of a Ridge Classifer


In [33]:

# INSTANTIATING a ridge regression model
ridge = RidgeClassifier(alpha = 150)


# FITTING the training data
ridge_fit = ridge.fit(X_train_sig, y_train_sig)


# PREDICTING based on the testing set
ridge_pred = ridge_fit.predict(X_test_sig)


# SCORING the results
print('Training ACCURACY:', ridge_fit.score(X_train_sig, y_train_sig).round(4))
print('Testing  ACCURACY:', ridge_fit.score(X_test_sig, y_test_sig).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_sig,
                                          y_score = ridge_pred).round(4))

Training ACCURACY: 0.7862
Testing  ACCURACY: 0.7885
AUC Score        : 0.7173


Creating the confusion matrix of the ridge classifier

In [34]:
# calling the visual_cm function
#visual_cm(true_y = y_test_sig,
#          pred_y = ridge_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

The ridge classifier results in 
- training accuracy: 0.7882
- testing accuracy: 0.7864
- AUC Score: 0.7226
- False Positives: 33
- False Negatives: 71

In [35]:
training_acc_ridge = ridge_fit.score(X_train_sig, y_train_sig).round(4)
testing_acc_ridge = ridge_fit.score(X_test_sig, y_test_sig).round(4)
auc_score_ridge = roc_auc_score(y_true  = y_test_sig,
                                          y_score = ridge_pred).round(4)
false_positives_ridge = 33
false_negatives_ridge = 71

performance_df = performance_df.append(
                          {'Model'             : 'Ridge Classifier',
                          'Training Accuracy'  : training_acc_ridge,
                          'Testing Accuracy'   : testing_acc_ridge,
                          'AUC Value'          : auc_score_ridge,
                          'False Positives'    : false_positives_ridge,
                          'False Negatives'    : false_negatives_ridge},
                          ignore_index = True)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
0,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51
1,Classification Tree Pruned,0.7951,0.7885,0.7766,80,27
2,Logistic Regression Significant,0.7978,0.7864,0.7463,53,51
3,Logistic Regression Short,0.7409,0.7413,0.8012,121,5
4,Ridge Classifier,0.7862,0.7885,0.7173,33,71


Checking the Performance of a Perceptron Classifier. Instead of testing on the significant variables, we test the Perceptron Classifier on the two variables IS_FOLLOWER and IS_TECH


In [36]:

# INSTANTIATING a Perceptron classification model
perceptron = Perceptron()


# FITTING the training data
perceptron_fit = perceptron.fit(X_train_short, y_train_short)


# PREDICTING based on the testing set
perceptron_pred = perceptron_fit.predict(X_test_short)


# SCORING the results
print('Training ACCURACY:', perceptron_fit.score(X_train_short, y_train_short).round(4))
print('Testing  ACCURACY:', perceptron_fit.score(X_test_short, y_test_short).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_short,
                                          y_score = perceptron_pred).round(4))

Training ACCURACY: 0.7409
Testing  ACCURACY: 0.7413
AUC Score        : 0.8012


The result is the same as with the logistic regression model. Hence, we do not append the performance of the Perceptron classifier

The perceptron classifier results in 
- training accuracy: 0.7409
- testing accuracy: 0.7413
- AUC Score: 0.8012


Checking the Performance of a Passive Aggressive Classifier

In [37]:

# INSTANTIATING a ridge regression model
pac = PassiveAggressiveClassifier()


# FITTING the training data
pac_fit = pac.fit(X_train_sig, y_train_sig)


# PREDICTING based on the testing set
pac_pred = pac_fit.predict(X_test_sig)


# SCORING the results
print('Training ACCURACY:', pac_fit.score(X_train_sig, y_train_sig).round(4))
print('Testing  ACCURACY:', pac_fit.score(X_test_sig, y_test_sig).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_sig,
                                          y_score = pac_pred).round(4))

Training ACCURACY: 0.7101
Testing  ACCURACY: 0.7187
AUC Score        : 0.7914


In [38]:
# calling the visual_cm function
#visual_cm(true_y = y_test_sig,
#          pred_y = pac_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

The Passive Aggressive Classifier is not able to predict Cross Sell Success. We do not append the performance to our performance comparison dataframe

The passive aggressive classifier results in 
- training accuracy: 0.6792
- testing accuracy: 0.6838
- AUC Score: 0.5064
- False Positives: 0
- False Negatives: 154

WE can use a KNN Classifier to test for cross_sales_success. 

First we need to find the optimal number neighbors, using the user defined function "opt_neighbors"

In [39]:
# determining the optimal number of neighbors
#opt_neighbors = optimal_neighbors(X_data = X_train_sig, y_data = y_train_sig, response_type = 'class')

The function returns 13 as the optimal number of neighbors. Subsequently we plug n = 13 into the KNN classifier

In [40]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
scaler.fit(X_train_sig, y_train_sig)


# TRANSFORMING the data
X_scaled     = scaler.transform(chef_data_sig)


# converting to a DataFrame
X_scaled_df  = pd.DataFrame(X_scaled) 


# train-test split with the scaled data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            X_scaled_df,
            chef_target_sig,
            random_state = 802,
            test_size = 0.25,
            stratify = chef_target_sig)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = 13)


# FITTING the training data
knn_fit = knn_opt.fit(X_train_sig, y_train_sig)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(X_test_sig)


# SCORING the results
print('Training ACCURACY:', knn_fit.score(X_train_sig, y_train_sig).round(4))
print('Testing  ACCURACY:', knn_fit.score(X_test_sig, y_test_sig).round(4))
      
print('AUC Score        :', roc_auc_score(y_true  = y_test_sig,
                                          y_score = knn_pred).round(4))

Training ACCURACY: 0.7951
Testing  ACCURACY: 0.7823
AUC Score        : 0.7416


In [41]:
# calling the visual_cm function
#visual_cm(true_y = y_test_sig,
#          pred_y = knn_pred,
#          labels = ['Cross Sell Success', 'Not Cross Sell Success'])

The KNN classifier results in 
- training accuracy: 0.7855
- testing accuracy: 0.7002
- AUC Score: 0.5964
- False Positives: 48
- False Negatives: 58

In [42]:
training_acc_knn = knn_fit.score(X_train_sig, y_train_sig).round(4)
testing_acc_knn = knn_fit.score(X_test_sig, y_test_sig).round(4)
auc_score_knn = roc_auc_score(y_true  = y_test_sig,
                                          y_score = knn_pred).round(4)
false_positives_knn = 48
false_negatives_knn = 58

performance_df = performance_df.append(
                          {'Model'             : 'KNN Classifier',
                          'Training Accuracy'  : training_acc_knn,
                          'Testing Accuracy'   : testing_acc_knn,
                          'AUC Value'          : auc_score_knn,
                          'False Positives'    : false_positives_knn,
                          'False Negatives'    : false_negatives_knn},
                          ignore_index = True)

performance_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,AUC Value,False Positives,False Negatives
0,Classification Tree Tuned,0.8053,0.7988,0.7655,47,51
1,Classification Tree Pruned,0.7951,0.7885,0.7766,80,27
2,Logistic Regression Significant,0.7978,0.7864,0.7463,53,51
3,Logistic Regression Short,0.7409,0.7413,0.8012,121,5
4,Ridge Classifier,0.7862,0.7885,0.7173,33,71
5,KNN Classifier,0.7951,0.7823,0.7416,48,58


In [43]:
# We save the resulting model performance dataframe
#performance_df.to_excel('Performance Summary Final.xlsx')

The final result comparison shows that multiple models perform around an AUC value of 0.8
With tuning models further, using random forests, etc. I would most likely be able to increase the performance. 

Given the business context however, my goal is to keep the model actionable and interpretable. As mentioned earlier, it makes sense to optimize for a low number of false negatives. The fewer customers we predict not to buy but that actually buy, the fewer missed opportunities we have. A higher number of false positives surely decreases the efficiency of the promotional campaign but the effectiveness is still high since we catch almost all sales opportunities. 

Given this reasoning, I select the "Logistic Regression Short" as the best model. With an AUC of >0.8 and only 5 False negatives, this is the best fit for the business usecase. The discrepancy between train and test accuracy also does not indicate an overfit model. Another benefit is that this model only has 2 explanatory variables. The insights and recommendations drawn from this are very actionable. Details can be found in my findings summary.

In [44]:
#Final Model:
print(performance_df.loc[3,:])

Model                Logistic Regression Short
Training Accuracy                       0.7409
Testing Accuracy                        0.7413
AUC Value                               0.8012
False Positives                            121
False Negatives                              5
Name: 3, dtype: object
