# Logistic and Delivery : Happy Customer

Data Description:
Y = target attribute (Y) with values indicating 0 (unhappy) and 1 (happy) customers.

X1 = my order was delivered on time.

X2 = contents of my order was as I expected.

X3 = I ordered everything I wanted to order.

X4 = I paid a good price for my order.

X5 = I am satisfied with my courier.

X6 = the app makes ordering easy for me.

Attributes X1 to X6 indicate the responses for each question and have values from 1 to 5 where the smaller number indicates less and the higher number indicates more towards the answer.

Goal(s):

Predict if a customer is happy or not based on the answers they give to questions asked.

Success Metrics:

Reach 73% accuracy score or above.

Find which questions/features are more important when predicting a customer’s happiness.

## Imports

In [47]:
# for data processing and manipulation
import pandas as pd
import numpy as np

# scikit-learn modules for feature selection and model evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, SelectKBest, SelectFromModel, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# libraries for visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

## Load the dataset

In [48]:
# Load the dataset
df = pd.read_csv('ACME-HappinessSurvey2020.csv')

# Print datatypes
print(df.dtypes)

# Describe columns
df.describe(include='all')

Y     int64
X1    int64
X2    int64
X3    int64
X4    int64
X5    int64
X6    int64
dtype: object


Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.547619,4.333333,2.531746,3.309524,3.746032,3.650794,4.253968
std,0.499714,0.8,1.114892,1.02344,0.875776,1.147641,0.809311
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,4.0,2.0,3.0,3.0,3.0,4.0
50%,1.0,5.0,3.0,3.0,4.0,4.0,4.0
75%,1.0,5.0,3.0,4.0,4.0,4.0,5.0
max,1.0,5.0,5.0,5.0,5.0,5.0,5.0


In [49]:
# Preview the dataset
df.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,0,3,3,3,4,2,4
1,0,3,2,3,5,4,3
2,1,5,3,3,3,3,5
3,0,5,4,3,3,3,5
4,0,5,4,3,3,3,5


## Model Performance

In [50]:
# Split feature and target vectors
X = df.drop("Y", 1)
Y = df["Y"]

In [51]:
def fit_model(X, Y):
    '''Use a RLogisticRegression.'''
    
    # define the model to use
    model = RandomForestClassifier(criterion='entropy', random_state=47)
    
    # Train the model
    model.fit(X, Y)
    
    return model

In [52]:
def calculate_metrics(model, X_test, Y_test):
    '''Get model evaluation metrics on the test set.'''
    
    # Get model predictions
    y_predict_r = model.predict(X_test)
    
    # Calculate evaluation metrics for assesing performance of the model.
    acc = accuracy_score(Y_test, y_predict_r)
    roc = roc_auc_score(Y_test, y_predict_r)
    prec = precision_score(Y_test, y_predict_r)
    rec = recall_score(Y_test, y_predict_r)
    f1 = f1_score(Y_test, y_predict_r)
    
    return acc, roc, prec, rec, f1

In [53]:
def train_and_get_metrics(X, Y):
    '''Train model and get evaluation metrics'''
    
    # Split train and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,stratify=Y, random_state = 123)

    # All features of dataset are float values. You normalize all features of the train and test dataset here.
    #scaler = StandardScaler().fit(X_train)
    #X_train_scaled = scaler.transform(X_train)
    #X_test_scaled = scaler.transform(X_test)

    # Call the fit model function to train the model on the normalized features and the diagnosis values
    model = fit_model(X_train, Y_train)

    # Make predictions on test dataset and calculate metrics.
    acc, roc, prec, rec, f1 = calculate_metrics(model, X_test, Y_test)

    return acc, roc, prec, rec, f1

In [54]:
def evaluate_model_on_features(X, Y):
    '''Train model and display evaluation metrics.'''
    
    # Train the model, predict values and get metrics
    acc, roc, prec, rec, f1 = train_and_get_metrics(X, Y)

    # Construct a dataframe to display metrics.
    display_df = pd.DataFrame([[acc, roc, prec, rec, f1, X.shape[1]]], columns=["Accuracy", "ROC", "Precision", "Recall", "F1 Score", 'Feature Count'])
    
    return display_df

In [55]:
# Calculate evaluation metrics
all_features_eval_df = evaluate_model_on_features(X, Y)
all_features_eval_df.index = ['All features']

# Initialize results dataframe
results = all_features_eval_df

# Check the metrics
results.head()

Unnamed: 0,Accuracy,ROC,Precision,Recall,F1 Score,Feature Count
All features,0.653846,0.654762,0.692308,0.642857,0.666667,6


In [56]:
# Remove the features with high correlation to other features
subset_feature_corr_names = ["X1","X2","X5", "X6"]

# Calculate and check evaluation metrics
subset_feature_eval_df = evaluate_model_on_features(df[subset_feature_corr_names], Y)
subset_feature_eval_df.index = ['Subset features']

# Append to results and display
results = results.append(subset_feature_eval_df)
results.head(n=10)

Unnamed: 0,Accuracy,ROC,Precision,Recall,F1 Score,Feature Count
All features,0.653846,0.654762,0.692308,0.642857,0.666667,6
Subset features,0.730769,0.732143,0.769231,0.714286,0.740741,4


In [57]:
df_features = df[["X1","X2","X3","X4","X5","X6"]]
df_features = df_features.replace(1, "Low")
df_features = df_features.replace(2, "Low")
df_features = df_features.replace(3, "Medium")
df_features = df_features.replace(4, "High")
df_features = df_features.replace(5, "High")
print(df_features.head())

       X1      X2      X3      X4      X5      X6
0  Medium  Medium  Medium    High     Low    High
1  Medium     Low  Medium    High    High  Medium
2    High  Medium  Medium  Medium  Medium    High
3    High    High  Medium  Medium  Medium    High
4    High    High  Medium  Medium  Medium    High


In [58]:
LowMedHigh_onehot = pd.get_dummies(df_features)
print(LowMedHigh_onehot.head())

   X1_High  X1_Low  X1_Medium  X2_High  X2_Low  X2_Medium  X3_High  X3_Low  \
0        0       0          1        0       0          1        0       0   
1        0       0          1        0       1          0        0       0   
2        1       0          0        0       0          1        0       0   
3        1       0          0        1       0          0        0       0   
4        1       0          0        1       0          0        0       0   

   X3_Medium  X4_High  X4_Low  X4_Medium  X5_High  X5_Low  X5_Medium  X6_High  \
0          1        1       0          0        0       1          0        1   
1          1        1       0          0        1       0          0        0   
2          1        0       0          1        0       0          1        1   
3          1        0       0          1        0       0          1        1   
4          1        0       0          1        0       0          1        1   

   X6_Low  X6_Medium  
0       0          0 

In [59]:

# Calculate and check evaluation metrics
LowMedHigh_onehot_eval_df = evaluate_model_on_features(LowMedHigh_onehot, Y)
LowMedHigh_onehot_eval_df.index = ['LowMedHigh_onehot']

# Append to results and display
results = results.append(LowMedHigh_onehot_eval_df)
results.head(n=10)

Unnamed: 0,Accuracy,ROC,Precision,Recall,F1 Score,Feature Count
All features,0.653846,0.654762,0.692308,0.642857,0.666667,6
Subset features,0.730769,0.732143,0.769231,0.714286,0.740741,4
LowMedHigh_onehot,0.538462,0.541667,0.583333,0.5,0.538462,18


In [60]:
df_features_1to5 = df[["X1","X2","X3","X4","X5","X6"]]
df_features_1to5 = df_features_1to5.replace(1, "one")
df_features_1to5 = df_features_1to5.replace(2, "two")
df_features_1to5= df_features_1to5.replace(3, "three")
df_features_1to5 = df_features_1to5.replace(4, "four")
df_features_1to5 = df_features_1to5.replace(5, "five")
print(df_features_1to5.head())

      X1     X2     X3     X4     X5     X6
0  three  three  three   four    two   four
1  three    two  three   five   four  three
2   five  three  three  three  three   five
3   five   four  three  three  three   five
4   five   four  three  three  three   five


In [61]:
df_features_1to5_onehot = pd.get_dummies(df_features_1to5)
print(df_features_1to5_onehot.head())

   X1_five  X1_four  X1_one  X1_three  X2_five  X2_four  X2_one  X2_three  \
0        0        0       0         1        0        0       0         1   
1        0        0       0         1        0        0       0         0   
2        1        0       0         0        0        0       0         1   
3        1        0       0         0        0        1       0         0   
4        1        0       0         0        0        1       0         0   

   X2_two  X3_five  ...  X5_five  X5_four  X5_one  X5_three  X5_two  X6_five  \
0       0        0  ...        0        0       0         0       1        0   
1       1        0  ...        0        1       0         0       0        0   
2       0        0  ...        0        0       0         1       0        1   
3       0        0  ...        0        0       0         1       0        1   
4       0        0  ...        0        0       0         1       0        1   

   X6_four  X6_one  X6_three  X6_two  
0        1       

In [62]:
# Calculate and check evaluation metrics
OnetoFive_onehot_eval_df = evaluate_model_on_features(df_features_1to5_onehot, Y)
OnetoFive_onehot_eval_df.index = ['OnetoFive_onehot']

# Append to results and display
results = results.append(OnetoFive_onehot_eval_df)
results.head(n=10)

Unnamed: 0,Accuracy,ROC,Precision,Recall,F1 Score,Feature Count
All features,0.653846,0.654762,0.692308,0.642857,0.666667,6
Subset features,0.730769,0.732143,0.769231,0.714286,0.740741,4
LowMedHigh_onehot,0.538462,0.541667,0.583333,0.5,0.538462,18
OnetoFive_onehot,0.653846,0.654762,0.692308,0.642857,0.666667,29
