In [1]:
# Load Packages

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Self-defined functions

def precision(label, confusion_matrix):
    col = confusion_matrix[:, label]
    return confusion_matrix[label, label] / col.sum()
    
def recall(label, confusion_matrix):
    row = confusion_matrix[label, :]
    return confusion_matrix[label, label] / row.sum()

def accuracy(confusion_matrix):
    diagonal_sum = confusion_matrix.trace()
    sum_of_all_elements = confusion_matrix.sum()
    return diagonal_sum / sum_of_all_elements 

def precision_macro_average(confusion_matrix):
    rows, columns = confusion_matrix.shape
    sum_of_precisions = 0
    for label in range(rows):
        sum_of_precisions += precision(label, confusion_matrix)
    return sum_of_precisions / rows

def recall_macro_average(confusion_matrix):
    rows, columns = confusion_matrix.shape
    sum_of_recalls = 0
    for label in range(columns):
        sum_of_recalls += recall(label, confusion_matrix)
    return sum_of_recalls / columns

In [5]:
# Read Data

os.chdir('C:\\File\\Uber')
data = pd.read_excel('Uber Bridge Safety Raw Export.xlsx')

In [224]:
# Clean-up
# Remove all character columns
# Remove primary KPI answer columns from predictors

target_primary_1 = data.filter(regex = '8.0. Strongly agree')
target_primary_2 = data.filter(regex = '8.0. Somewhat agree')
data.drop(list(data.filter(regex='8.0')),axis =1, inplace=True)
predict_data = data.select_dtypes(exclude=[object])

In [225]:
# Set Primary KPI column as target
# fill up blank with 0

predict_data['target'] = target_primary_2
predict_data = predict_data[predict_data['target'].notna()]
predict_data.fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [226]:
# Split into train and test

x = predict_data.drop('target',axis=1)
y = predict_data['target']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [218]:
# Build Random Forest, visualize result in confusion matrix

random_clf = RandomForestClassifier(500)
random_clf.fit(x_train,y_train)
y_pred_random = random_clf.predict(x_test)
random_cm = metrics.confusion_matrix(y_test, y_pred_random)
random_cm

array([[2698,  419],
       [ 423, 1458]], dtype=int64)

In [219]:
accuracy(random_cm)

0.8315326130452181

In [154]:
# Build Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier()
gb_clf.fit(x_train,y_train)

y_pred = gb_clf.predict(x_test)

gb_cm = metrics.confusion_matrix(y_test,y_pred)
gb_cm

array([[2667,  450],
       [ 372, 1509]], dtype=int64)

In [155]:
accuracy(gb_cm)

0.8355342136854742

In [144]:
# feature importance
# Primary KPI - Strongly Agree

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(gb_clf,random_state=1).fit(x_test,y_test)
eli5.show_weights(perm,feature_names = x_test.columns.tolist())

Weight,Feature
0.0133  ± 0.0035,8.1. Strongly agree
0.0124  ± 0.0015,8.3. Strongly agree
0.0098  ± 0.0029,8.2. Strongly agree
0.0053  ± 0.0023,8.4. Strongly agree
0.0003  ± 0.0004,10. Annual driver screening for criminal offenses and driving violations
0.0003  ± 0.0005,6. 4-5 trips
0.0002  ± 0.0003,6. 0 trips
0.0002  ± 0.0002,2. Juno
0.0002  ± 0.0005,15.3. Never
0.0001  ± 0.0002,15.0. A few times a week


In [156]:
# feature importance
# Primary KPI - Somewhat Agree

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(gb_clf,random_state=1).fit(x_test,y_test)
eli5.show_weights(perm,feature_names = x_test.columns.tolist())

# People Watch TV a lot tend to somewhat agree with the idea that Uber is committed to Safety

Weight,Feature
0.0215  ± 0.0027,8.3. Somewhat agree
0.0188  ± 0.0060,8.1. Somewhat agree
0.0119  ± 0.0027,8.2. Somewhat agree
0.0111  ± 0.0047,8.4. Somewhat agree
0.0082  ± 0.0020,"13. Connected TV (Smart TV, Roku, Apple TV, Chromecast, Gaming Console, etc.)"
0.0066  ± 0.0021,14. In the last 24 hours
0.0026  ± 0.0018,11. Somewhat favorable
0.0022  ± 0.0008,10. 24/7 support for urgent safety incidents
0.0020  ± 0.0014,10. Technology that senses a possible crash or unexpected stop so Uber can reach out to you and offer assistance
0.0017  ± 0.0004,10. Masking your phone number whenever you call/text your driver so your information stays private
