Hult International Business School 
Assignment: Individual Regression Assignment
Student   : Mosiuwa Tshabalala
Subject   : Machine Learning

In [None]:
#importing the relevant packages for data science essentials, modeling and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Name of the Excel workbook holding the raw data. No directory is given, so
# it is resolved relative to the current working directory.
file = 'Apprentice_Chef_Dataset.xlsx'

# Reading the workbook into the DataFrame the following cells work on
ac_dataset = pd.read_excel(file)

In [None]:
# Feature Engineering: Removing outliers using standard deviation detection
# A row is kept only when it lies strictly within 3 standard deviations of the
# mean on EVERY listed column. The limits are always computed on the full
# ac_dataset. The masks are combined so that one column's filter no longer
# overwrites the result of the previous one.
# NOTE(review): the cells visible in this notebook keep modelling on
# ac_dataset, not new_acdata -- confirm which frame is meant to be used.
outlier_columns = ['TOTAL_PHOTOS_VIEWED',
                   'AVG_TIME_PER_SITE_VISIT',
                   'CONTACTS_W_CUSTOMER_SERVICE',
                   'AVG_PREP_VID_TIME']

within_limits = pd.Series(True, index = ac_dataset.index)

for column in outlier_columns:
    mean = ac_dataset[column].mean()
    std = ac_dataset[column].std()
    cut_off = std * 3
    lower, upper = mean - cut_off, mean + cut_off

    # strict inequalities, as in the per-column filters this loop replaces
    within_limits &= (ac_dataset[column] < upper) & (ac_dataset[column] > lower)

new_acdata = ac_dataset[within_limits]

In [None]:
# Feature Engineering: adding up column pairs that describe the same kind of event
# (late + early deliveries, cancellations after + before noon)
late_plus_early = ac_dataset['LATE_DELIVERIES'].add(ac_dataset['EARLY_DELIVERIES'])
after_plus_before = ac_dataset['CANCELLATIONS_AFTER_NOON'].add(ac_dataset['CANCELLATIONS_BEFORE_NOON'])

ac_dataset['Total_Deliveries'] = late_plus_early
ac_dataset['Total_Cancellations'] = after_plus_before

In [None]:
# Feature Engineering: Initializing the split of email domains
# Every address is split on '@' in one vectorized call instead of a Python
# loop over iterrows(). With expand = True each piece becomes its own column
# (0 = part before the '@', 1 = domain) and the row index of ac_dataset is
# kept. Missing addresses give missing pieces instead of raising.
email_df = ac_dataset['EMAIL'].str.split('@', expand = True)

# showing only the first rows rather than dumping the whole frame
email_df.head()


In [None]:
# Feature Engineering: attaching the e-mail domain to a freshly loaded copy of
# the workbook and counting how often each domain occurs
email_df.columns = ['0', 'Personal_EMAIL']

# only the second piece of the split address is carried over
domain_column = email_df['Personal_EMAIL']

ac_email = pd.concat([pd.read_excel(file), domain_column], axis = 1)

ac_email['Personal_EMAIL'].value_counts()

In [None]:
# Feature Engineering: Processing the domains and assigning different groups for correlation analysis
personal_email_domains = ['@gmail.com', '@qq.com', '@yahoo.com', '@protonmail.com','@hotmail.com','@live.com','@aol.com']

# '@pfizer.com' and '@visa.com' were listed without the leading '@' and could
# therefore never equal '@' + domain.
# NOTE(review): '@goldmansacs.com' is kept exactly as written -- presumably it
# matches the spelling in the data; verify.
professional_email_domains = ['@intel.com', '@homedepot.com', '@goldmansacs.com', '@cisco.com', '@unitedtech.com',
                              '@jpmorgan.com', '@pfizer.com', '@visa.com','@walmart.com','@disney.com','@pg.com','@caterpillar.com','@mmm.com',
                              '@verizon.com','@boeing.com','@exxon.com','@travelers.com','@unitedhealth.com','@microsoft.com',
                              '@chevron.com','@ibm.com','@dupont.com','@ge.org','@apple.com','@nike.com','@mcdonalds.com',
                              '@jnj.com','@merck.com', '@cocacola.com','@amex.com']

# '@passport.com' used to appear in both the professional and the fake list.
# The professional test ran first, so the fake entry was unreachable.
# NOTE(review): it is now kept in the fake list only -- confirm the intended group.
fake_email_domains = ['@me.com','@msn.com', '@passport.com']

# Lookup table from '@domain' to its group. Groups are inserted from lowest to
# highest priority so that, should a domain ever be listed twice again, the
# precedence stays Personal > Professional > Fake.
domain_to_group = {}

for group_label, group_domains in [('Fake', fake_email_domains),
                                   ('Professional', professional_email_domains),
                                   ('Personal', personal_email_domains)]:
    for listed_domain in group_domains:
        domain_to_group[listed_domain] = group_label

placeholder_lst = []

for domain in ac_email['Personal_EMAIL']:

    # Exactly one label is appended per row. Unlisted or missing domains are
    # labelled 'Unknown', so the list always has one entry per row and labels
    # cannot shift onto the wrong customers.
    if isinstance(domain, str):
        placeholder_lst.append(domain_to_group.get('@' + domain, 'Unknown'))

    else:
        placeholder_lst.append('Unknown')

# the index is passed explicitly so labels align with the rows they came from
ac_email['domain_group'] = pd.Series(placeholder_lst, index = ac_email.index)

ac_email['domain_group'].value_counts()



In [None]:
# Feature Engineering: Using one hot encoding to create columns and merging new columns into the original dataset
# dtype = int gives plain 0/1 indicator columns regardless of the pandas version
oh_email = pd.get_dummies(ac_email['domain_group'], dtype = int)

# the e-mail frame swaps its text columns for the indicator columns
ac_email = ac_email.drop(['domain_group', 'EMAIL'], axis = 1).join(oh_email)

# The indicator columns are also joined onto ac_dataset, the frame the models
# are built from. Previously this line overwrote ac_email with ac_dataset, so
# the encoded columns were thrown away and never reached the dataset.
# Dropping same-named columns first keeps the join safe if the columns are
# already present. EMAIL is left in ac_dataset here because a later cell
# drops it.
ac_dataset = ac_dataset.drop(columns = oh_email.columns, errors = 'ignore').join(oh_email)

In [None]:
# storing the base-10 logarithm of REVENUE as its own column in the dataset
revenue = ac_dataset['REVENUE']
ac_dataset['log_Revenue'] = np.log10(revenue)

In [None]:
# removing the text columns and the component columns that were combined into
# Total_Deliveries / Total_Cancellations, all in a single drop
columns_to_drop = ['NAME', 'FAMILY_NAME', 'FIRST_NAME', 'EMAIL',
                   'CANCELLATIONS_BEFORE_NOON', 'CANCELLATIONS_AFTER_NOON',
                   'EARLY_DELIVERIES', 'LATE_DELIVERIES']

ac_dataset = ac_dataset.drop(columns_to_drop, axis = 1)

print(ac_dataset.columns)

In [None]:
#initializing model
import sklearn.linear_model

In [None]:
# Model 1: Processing the Lasso Model to fit, split & create predictions 
ac_target = ac_dataset.loc[ : , 'log_Revenue']

x_train, x_test, y_train, y_test = train_test_split(
                ac_dataset,
                ac_target,
                test_size = 0.25,
                random_state = 219)

lasso_model = sklearn.linear_model.Lasso(alpha = 0.000000008, normalize = True)

lasso_fit = lasso_model.fit(x_train, y_train)

lasso_pred = lasso_fit.predict(x_test)

print('Lasso Training Score :', lasso_model.score(x_train, y_train).round(4))
print('Lasso Testing Score :', lasso_model.score(x_test, y_test).round(4))

lasso_train_score = lasso_model.score(x_train, y_train).round(4)
lasso_test_score =  lasso_model.score(x_test, y_test).round(4)

print('Lasso Train-Test Gap :', abs(lasso_train_score - lasso_test_score).round(4))
lasso_test_gap = abs(lasso_train_score - lasso_test_score).round(4)

In [None]:
lasso_model_values = zip(ac_dataset.columns, lasso_fit.coef_.round(decimals = 2))

lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 2))]

for val in lasso_model_values:
    lasso_model_lst.append(val)
    
for pair in lasso_model_lst:
    print(pair)

In [None]:
lasso_coeff_sum = 0
for feature, coefficient in lasso_model_lst:
    lasso_coeff_sum += coefficient
    if coefficient == 0:
        lasso_model_lst.remove((feature, coefficient))
        
for pair in lasso_model_lst:
    print(pair)

In [None]:
# Model 2: Processing the ARD Model to fit, split & create predictions 
ard_model = sklearn.linear_model.ARDRegression(normalize = False)

ard_fit = ard_model.fit(x_train, y_train)

ard_predict = ard_fit.predict(x_test)

print('Training Score:', ard_model.score(x_train, y_train).round(4))
print('Test Score:', ard_model.score(x_test, y_test).round(4))

ard_train_score = ard_model.score(x_train, y_train).round(4)
ard_test_score =  ard_model.score(x_test, y_test).round(4)

print('ARD Train-Test Gap :', abs(ard_train_score - ard_test_score).round(4))
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)

In [None]:
ard_model_values = zip(ac_dataset.columns, ard_fit.coef_.round(decimals = 5))

ard_model_lst = [('intercept', ard_fit.intercept_.round(decimals = 2))]

for val in ard_model_values:
    ard_model_lst.append(val)
    
for pair in ard_model_lst:
    print(pair)

In [None]:
ard_coeff_sum = 0
for feature, coefficient in ard_model_lst:
    ard_coeff_sum +=coefficient 
    if coefficient == 0:
        ard_model_lst.remove((feature, coefficient))
for pair in ard_model_lst:
    print(pair)
    

In [None]:
# Model 3: Processing the OLS Model to fit, & presenting results 
# log_Revenue is regressed on the five explanatory variables named in the
# formula. The model is specified on the whole of ac_dataset (no train/test
# split is applied in this cell).
lm_OLS = smf.ols(formula = """  log_Revenue ~ TOTAL_PHOTOS_VIEWED  +
                                                  TOTAL_MEALS_ORDERED  +
                                                  UNIQUE_MEALS_PURCH +
                                                  CONTACTS_W_CUSTOMER_SERVICE +
                                                  AVG_PREP_VID_TIME 
                            """,
                        data = ac_dataset)


# fitting the specified model to the data
results = lm_OLS.fit()


# printing a summary of the results
print(results.summary())

In [None]:
# Model 4: Processing the KNN model creating the scaled dataframe
# Only explanatory variables are scaled. log_Revenue (the response) and
# REVENUE (the column it is computed from) are left out so the neighbours are
# not found by comparing the very values being predicted.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell -- consider fitting it on the training rows only.
knn_features = ac_dataset.drop(columns = ['REVENUE', 'log_Revenue'])

scaler = StandardScaler()

scaler.fit(knn_features)

x_scaled = scaler.transform(knn_features)

# column names are carried over so the summary below can be read per feature
x_scaled_df = pd.DataFrame(x_scaled, columns = knn_features.columns)

x_scaled_df.describe().round(2)

In [None]:
# splitting the scaled data and the response: 25% held out, seed 219
knn_split = train_test_split(x_scaled,
                             ac_target,
                             test_size = 0.25,
                             random_state = 219)

x_train_knn, x_test_knn, y_train_knn, y_test_knn = knn_split

In [None]:
# trying every neighbour count from 1 to 20 and recording the value returned
# by score() on the training set and on the test set
neighbors_settings = range(1, 21)

training_accuracy = []
test_accuracy = []

for k in neighbors_settings:
    candidate = KNeighborsRegressor(n_neighbors = k).fit(x_train_knn, y_train_knn)

    training_accuracy.append(candidate.score(x_train_knn, y_train_knn))
    test_accuracy.append(candidate.score(x_test_knn, y_test_knn))

# plotting both score curves against the neighbour count
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(neighbors_settings, training_accuracy, label = "training accuracy")
ax.plot(neighbors_settings, test_accuracy,     label = "test accuracy")
ax.set_ylabel("Accuracy of Model")
ax.set_xlabel("n_neighbors optimal spread")
ax.legend()
plt.show()

# list positions start at 0 while the neighbour counts start at 1, hence the + 1
best_test_score = max(test_accuracy)
opt_neighbors = test_accuracy.index(best_test_score) + 1
print(f"""The optimal number of neighbors is {opt_neighbors}""")

In [None]:
# Instantiating the model 
# opt_neighbors (found by the neighbour search above) is used instead of a
# hard-coded count, so the model follows the data it is run on.
knn_stand = KNeighborsRegressor(algorithm = 'auto',
                                n_neighbors = opt_neighbors)

knn_stand_fit = knn_stand.fit(x_train_knn, y_train_knn)

knn_stand_pred = knn_stand_fit.predict(x_test_knn)

# built-in round() works whether score() hands back a Python float or a NumPy scalar
knn_stand_score_train = round(knn_stand.score(x_train_knn, y_train_knn), 4)
knn_stand_score_test  = round(knn_stand.score(x_test_knn, y_test_knn), 4)
knn_stand_test_gap = round(abs(knn_stand_score_train - knn_stand_score_test), 4)

print('KNN Training Score:', knn_stand_score_train)
print('KNN Testing Score :',  knn_stand_score_test)
print('KNN Train-Test Gap:', knn_stand_test_gap)

In [None]:
# Printing a comparison table of the four models followed by the conclusion.
# The Lasso, ARD and KNN scores, gaps, model sizes and coefficient sums are
# interpolated from the variables computed in the cells above.
# NOTE(review): the OLS row ("0.575", "6 Variables"), the "Final Chosen Model"
# column and the closing paragraph are literals typed into the string. They
# are not derived from `results` or from the computed scores, so they must be
# re-checked by hand whenever the models are re-run.
print(f"""
Model Type   Training Score\tTesting Score\tTrain-Test Gap\tModel Size\tModel Coefficients\tFinal Chosen Model  
========================================================================================================================
Lasso        {lasso_train_score}\t\t{lasso_test_score}\t\t{lasso_test_gap}\t\t{len(lasso_model_lst)}\t\t{lasso_coeff_sum}\t\t\t "No"
ARD          {ard_train_score}\t\t{ard_test_score}\t\t{ard_test_gap}\t\t{len(ard_model_lst)}\t\t{ard_coeff_sum}\t\t\t "No"
*KNN*        {knn_stand_score_train} \t\t{knn_stand_score_test}\t\t{knn_stand_test_gap}\t\t"NA"\t\t"NA"\t\t\t "Yes"
OLS          "NA"\t\t"0.575"\t\t" NA"\t\t"NA"\t\t"6 Variables"\t\t "No"

Based on the model performance we have decided to choose the KNN as our most 
optimal model. The Lasso and ARD models show a high testing score but seem too
good to be true and could possibly signal overfitting of the model.  
The KNN has been optimized through the neighbors settings feature.

""")
