In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from collections import Counter

from helper import clean_churn_df, model_baseline, model_baseline_no_cv, score_model_no_cv, score_model
from helper import split_with_dupe_rows_in_train, rf_no_cv_iterx

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sqlalchemy import create_engine

# Plotting setup: ggplot style sheet, figures rendered inline in the notebook.
plt.style.use('ggplot')
%matplotlib inline


# Emit inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# autoreload mode 2: re-import modules before running each cell, so edits to
# local modules such as helper.py take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [12]:
# Read the `hr` table from the local Postgres `churn` database
# (alternative source kept for reference: pd.read_csv('HR_comma_sep.csv')).
engine = create_engine('postgresql://nmp256@localhost:5432/churn')
churn_df = (
    pd.read_sql_query('select * from hr', con=engine)
    .drop('index', axis='columns')  # discard the stored 'index' column
)

# Partition into train / validation / holdout sets; the split logic lives in
# helper.split_with_dupe_rows_in_train.
(X_train, X_val, X_holdout,
 y_train, y_val, y_holdout) = split_with_dupe_rows_in_train(churn_df)


In [13]:
# Fit a random forest with default hyperparameters and evaluate it on the
# validation split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
# Rows of the matrix are true classes, columns are predicted classes.
# The label names the model that is actually being evaluated here.
print("Random forest confusion matrix: \n\n", confusion_matrix(y_val, rf.predict(X_val)))

kNN confusion matrix: 

 [[1631    8]
 [  14  347]]


In [14]:
import pickle

In [15]:
# Serialize the fitted forest to rf.pkl in the working directory.
with open("rf.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

In [16]:
# Remove the in-memory model; the cells below work with the copy reloaded
# from rf.pkl (rf_model) instead.
del rf

In [17]:
# Restore the forest from rf.pkl under a new name, rf_model.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# were produced by a trusted source (here, this notebook itself).
with open("rf.pkl", "rb") as model_file:
    rf_model = pickle.load(model_file)

In [18]:
# Peek at the first validation rows to see what a model input looks like.
X_val.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
6371,0.7,0.73,4,240,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
2487,0.82,0.68,2,285,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
5148,0.55,0.62,5,197,2,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
4279,0.76,0.62,4,197,3,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
9759,0.91,0.77,4,167,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [19]:
# Column names in the order the validation frame stores them.
X_val.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'IT', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'sales', 'support',
       'technical', 'high', 'low', 'medium'],
      dtype='object')

In [46]:
# Sample observation #1, keyed by feature name (same names as X_val.columns).
feature_dict = {
    'satisfaction_level': 0.7,
    'last_evaluation': 0.7,
    'number_project': 4,
    'average_montly_hours': 240,
    'time_spend_company': 2,
    'Work_accident': 1,
    'promotion_last_5years': 0,
    # remaining 0/1 indicator columns
    'IT': 0, 'RandD': 0, 'accounting': 0, 'hr': 0, 'management': 0,
    'marketing': 0, 'product_mng': 0, 'sales': 0, 'support': 0, 'technical': 1,
    'high': 0, 'low': 0, 'medium': 1,
}

In [40]:
# Sample observation #2, keyed by feature name (same names as X_val.columns).
feat_dict2 = {
    'satisfaction_level': 0.44,
    'last_evaluation': 0.5,
    'number_project': 2,
    'average_montly_hours': 156,
    'time_spend_company': 3,
    'Work_accident': 0,
    'promotion_last_5years': 0,
    # remaining 0/1 indicator columns
    'IT': 0, 'RandD': 0, 'accounting': 0, 'hr': 0, 'management': 0,
    'marketing': 0, 'product_mng': 0, 'sales': 1, 'support': 0, 'technical': 0,
    'high': 0, 'low': 0, 'medium': 1,
}

In [35]:
# Hand-typed feature vector, in X_val column order.
pred1 = [
    0.7, 0.7, 4, 240, 2, 1, 0,      # satisfaction_level ... promotion_last_5years
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,   # IT ... technical
    0, 0, 1,                        # high, low, medium
]
# Sixth row of the validation frame, kept for later inspection.
pred2 = X_val.iloc[5]
# Nesting the list yields a single-row 2-D array, which is what the model takes.
# Only the last expression is displayed; the probabilities are computed but not shown.
rf_model.predict_proba(np.array([pred1]))
rf_model.predict(np.array([pred1]))

array([0])

In [None]:
# Plan for the prediction helper:
#   1. take the features in as a dict
#   2. run a prediction
#   3. return the input together with the result

In [42]:
# Feature order used when turning a feature dict into a model input row
# (matches the X_val.columns listing).
feature_names = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'IT',
    'RandD',
    'accounting',
    'hr',
    'management',
    'marketing',
    'product_mng',
    'sales',
    'support',
    'technical',
    'high',
    'low',
    'medium',
]


In [43]:
def make_api_prediction(feature_dict):
    """
    Score a single observation with ``rf_model`` and summarize the result.

    Input:
    feature_dict: a dictionary of the form {"feature_name": value}. It must
                  contain every name listed in ``feature_names``; any other
                  keys are ignored. Values may be numbers or numeric strings,
                  and an empty string is treated as 0.

    The values are fed to the model in the order given by ``feature_names``,
    which has to match the column order the model was trained on.

    Output:
    Returns a dictionary with the following keys
      all_probs: a list of dictionaries with keys 'name', 'prob', sorted from
                 the most to the least probable class. 'name' is the class
                 label taken from ``rf_model.classes_`` and 'prob' is the
                 predicted probability of that class.
      most_likely_class_name: label of the most likely class
      most_likely_class_prob: float (probability of the most likely class)

    Raises:
    KeyError: if one or more names from ``feature_names`` are missing.
    ValueError: if a value cannot be converted to float.
    """
    missing = [name for name in feature_names if name not in feature_dict]
    if missing:
        raise KeyError(
            "feature_dict is missing required features: " + ", ".join(missing))

    x_input = [feature_dict[name] for name in feature_names]
    x_input = [0 if val == '' else float(val) for val in x_input]

    # predict_proba returns one row per sample; take the only row as a plain
    # 1-D array (rather than a .flat iterator) so it can be indexed and sorted.
    pred_probs = rf_model.predict_proba([x_input])[0]

    # scikit-learn classifiers expose their labels as `classes_`, aligned with
    # the columns of predict_proba; there is no `target_names` attribute.
    # tolist() turns the labels into native Python values.
    class_labels = rf_model.classes_.tolist()

    probs = [{'name': class_labels[index], 'prob': float(pred_probs[index])}
             for index in np.argsort(pred_probs)[::-1]]

    response = {
        'all_probs': probs,
        'most_likely_class_name': probs[0]['name'],
        'most_likely_class_prob': probs[0]['prob'],
    }

    return response

In [55]:
# Line up the sample's values in feature_names order.
x_input = [feature_dict[column] for column in feature_names]

In [56]:
# Show the ordered values before any type conversion.
x_input

[0.7, 0.7, 4, 240, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]

In [57]:
# Convert every value to float; an empty string is replaced by 0.
x_input = [float(raw) if raw != '' else 0 for raw in x_input]


In [58]:
# Show the values after conversion to float.
x_input

[0.7,
 0.7,
 4.0,
 240.0,
 2.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0]

In [65]:
# Class probabilities for the single sample. `.flat` hands back NumPy's flat
# iterator over the result rather than a plain array.
# NOTE(review): `[0]` or `.ravel()` would give an ordinary 1-D ndarray instead.
pred_probs = rf_model.predict_proba([x_input]).flat

In [77]:
# Predicted class label for the single sample (first and only element).
rf_model.predict([x_input])[0]

0

In [71]:
# Debugging probe: pull the next value from the flat iterator.
# NOTE(review): raises StopIteration once the iterator has been exhausted,
# as in the recorded output.
next(pred_probs)
# alternative to .flat: .ravel

StopIteration: 

In [66]:
# Pair each class label with its predicted probability, most probable first.
# The fitted model in this notebook is rf_model, and scikit-learn classifiers
# expose their labels as `classes_` (aligned with predict_proba's columns).
probs = [{'name': rf_model.classes_[index], 'prob': pred_probs[index]}
         for index in np.argsort(pred_probs)[::-1]]

In [79]:
# List the expected features, one per line.
for feature_name in feature_names:
    print(feature_name)

satisfaction_level
last_evaluation
number_project
average_montly_hours
time_spend_company
Work_accident
promotion_last_5years
IT
RandD
accounting
hr
management
marketing
product_mng
sales
support
technical
high
low
medium


In [80]:
# Inspect the validation row saved earlier as pred2 (X_val.iloc[5]).
pred2

satisfaction_level         0.44
last_evaluation            0.50
number_project             2.00
average_montly_hours     156.00
time_spend_company         3.00
Work_accident              0.00
promotion_last_5years      0.00
IT                         0.00
RandD                      0.00
accounting                 0.00
hr                         0.00
management                 0.00
marketing                  0.00
product_mng                0.00
sales                      1.00
support                    0.00
technical                  0.00
high                       0.00
low                        0.00
medium                     1.00
Name: 923, dtype: float64