In [51]:
%%javascript
// Fetch and execute the remote table-of-contents helper with jQuery's $.getScript.
// NOTE(review): this runs third-party JavaScript from an external host every time the
// cell executes; it presumably fills the <div id="toc"> defined in the notebook — confirm,
// and consider vendoring the script if the notebook must work offline.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import show
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline 
from sklearn import preprocessing
import pickle

def missing_values_table(df):
    """Summarise missing values per column of ``df``.

    Prints the number of columns in ``df`` and the number of columns kept in
    the summary, then returns the summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with 'Missing Values' (null count) and
        '% of Total Values' (null count as a percentage of ``len(df)``).
        Only rows whose percentage is not 0 are kept, sorted by percentage
        in descending order and rounded to one decimal.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # concat of two unnamed Series yields integer column labels 0 and 1.
    summary = pd.concat([null_counts, null_share], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_columns = df.shape[1]
    n_columns_with_missing = summary.shape[0]
    print("Your selected dataframe has " + str(n_columns) + " columns.\n"
          "There are " + str(n_columns_with_missing) +
          " columns that have missing values.")

    return summary
    


def plot_single_var(df, var):
    """Show a count plot of ``df[var]`` with a percentage label on every bar.

    Bars follow the order of ``df[var].value_counts()``. Each label is the bar
    height divided by ``len(df)``, formatted as a whole percentage, centred
    horizontally on the bar and drawn 3 data units above its top.
    """
    sns.set(style="darkgrid")

    n_rows = float(len(df))
    category_order = df[var].value_counts().index
    ax = sns.countplot(x=var, data=df, order=category_order)

    for bar in ax.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        ax.text(label_x,
                bar_height + 3,
                '{0:.0%}'.format(bar_height / n_rows),
                ha="center")
    show()
    
def plot_category_compare(var,group,title,df):
    """Show a stacked horizontal bar chart of ``group`` shares within each ``var`` level.

    The data is a row-normalised crosstab of ``df[var]`` against ``df[group]``.
    Every bar segment is annotated with its width formatted as a whole
    percentage. The legend is placed to the right of the axes.
    """
    palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#708090",
               "#FFC0CB", "#C71585", "#7B68EE", '#4169E1', '#6495ED']

    tb = pd.crosstab(index=df[var], columns=[df[group]], normalize='index')
    ax = tb.plot(x=tb.index, kind='barh', stacked=True, mark_right=True,
                 color=palette[:tb.shape[1]], title=title)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    n_levels = tb.shape[0]
    for position, segment in enumerate(ax.patches):
        # The label's y coordinate is the patch position modulo the number of
        # crosstab rows; this presumably relies on matplotlib listing the
        # patches one crosstab column at a time — confirm if labels look off.
        row = position % n_levels
        ax.text(segment.get_x() + segment.get_width() / 2., row,
                '{0:.0%}'.format(segment.get_width()),
                ha="center")
    show()
    
from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
from sklearn.preprocessing import scale, StandardScaler, Imputer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix
from sklearn.model_selection import KFold # import KFold
def cross_val(cols, model, X=None, y=None, n_splits=5, random_state=1234):
    """Estimate ROC-AUC and PR-AUC of ``model`` with shuffled K-fold cross-validation.

    Parameters
    ----------
    cols : list-like
        Column labels of the feature subset to train on.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is refitted
        in place on every fold.
    X, y : optional
        Training features (DataFrame) and labels. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used, which is what the
        original two-argument call did.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 1234
        Seed for the fold shuffling.

    Returns
    -------
    list
        ``[mean ROC-AUC, mean PR-AUC]`` over the folds.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # drop=True gives a clean 0..n-1 index whatever the original index was
    # called; the old reset_index() + drop('index') failed on a named index.
    X_copy = X.reset_index(drop=True)
    y_values = np.asarray(y).ravel()

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    auc = []
    pr_auc = []
    for train_index, test_index in kf.split(X_copy):
        X_tr, X_t = X_copy.loc[train_index, cols], X_copy.loc[test_index, cols]
        y_tr, y_t = y_values[train_index], y_values[test_index]
        model.fit(X_tr, y_tr)
        y_pred = model.predict_proba(X_t)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_t, y_pred)
        precision, recall, _ = metrics.precision_recall_curve(y_t, y_pred)
        auc.append(metrics.auc(fpr, tpr))
        # `reorder=True` was deprecated in scikit-learn 0.20 and removed in
        # 0.22. It is not needed here: recall from precision_recall_curve is
        # already monotonic, which is all metrics.auc requires.
        pr_auc.append(metrics.auc(recall, precision))
    return [np.mean(auc), np.mean(pr_auc)]

def pred_model_res(X_train, y_train,X_test, y_test, model):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for scoring.
    model : estimator exposing ``fit`` and ``predict_proba``; fitted in place.

    Returns
    -------
    list
        ``[ROC-AUC, PR-AUC]`` computed from the second column of
        ``predict_proba`` on the test data.
    """
    model = model.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_score)
    # `reorder=True` was deprecated in scikit-learn 0.20 and removed in 0.22;
    # recall is already monotonic, so metrics.auc accepts it directly.
    return [metrics.auc(fpr, tpr), metrics.auc(recall, precision)]

def search_model(x_train, y_train, est, param_grid, n_jobs = -1, cv = 5, refit=False,
                 scoring='roc_auc', verbose=50):
    """Run an exhaustive grid search and report the best result.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    est : estimator to tune.
    param_grid : dict or list of dicts describing the grid.
    n_jobs : int, default -1
        Passed to ``GridSearchCV``.
    cv : int or CV splitter, default 5
        Passed to ``GridSearchCV``.
    refit : bool, default False
        Passed to ``GridSearchCV``; with False no ``best_estimator_`` is fitted.
    scoring : str, default 'roc_auc'
        Scoring metric (previously hard-coded).
    verbose : int, default 50
        Verbosity of the search (previously hard-coded).

    Returns
    -------
    GridSearchCV
        The fitted search object. Its best score and best parameter set are
        also printed.
    """
    # The `iid` argument is no longer passed: it was deprecated in
    # scikit-learn 0.22 and removed in 0.24, where passing it raises
    # TypeError. Older releases defaulted to the iid=True behaviour used
    # previously, so results there are unchanged.
    model = GridSearchCV(estimator=est,
                         param_grid=param_grid,
                         scoring=scoring,
                         verbose=verbose,
                         n_jobs=n_jobs,
                         refit=refit,
                         cv=cv)
    model.fit(x_train, y_train)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    return model
    
    
def cross_val_select(X_train_trans, y_train_trans, model, n_splits=10, random_state=1234):
    """Cross-validate ``model`` on an already-transformed feature matrix.

    Parameters
    ----------
    X_train_trans : array-like supporting ``X[index_array]`` row selection
        Feature matrix (for example the output of ``SelectKBest.transform``).
    y_train_trans : array-like
        Labels aligned row-for-row with ``X_train_trans``.
    model : estimator exposing ``fit`` and ``predict_proba``; refitted per fold.
    n_splits : int, default 10
        Number of shuffled folds.
    random_state : int, default 1234
        Seed for the fold shuffling.

    Returns
    -------
    list
        ``[mean ROC-AUC, mean PR-AUC]`` over the folds.
    """
    # Bug fix: the labels now come from the `y_train_trans` argument. The
    # previous version ignored it and read the notebook-level `y_train`.
    y_values = np.asarray(y_train_trans).ravel()

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    auc = []
    pr_auc = []
    for train_index, test_index in kf.split(X_train_trans):
        X_tr, X_t = X_train_trans[train_index], X_train_trans[test_index]
        y_tr, y_t = y_values[train_index], y_values[test_index]
        model.fit(X_tr, y_tr)
        y_pred = model.predict_proba(X_t)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_t, y_pred)
        precision, recall, _ = metrics.precision_recall_curve(y_t, y_pred)
        auc.append(metrics.auc(fpr, tpr))
        # `reorder=True` was deprecated in scikit-learn 0.20 and removed in 0.22.
        pr_auc.append(metrics.auc(recall, precision))
    return [np.mean(auc), np.mean(pr_auc)]


def eeo_analysis(X_val, y_pred, cols):
    """Compare predicted scores between EEOC_CODE groups 1 and 2.

    Draws an error-bar plot of the mean prediction per ``EEOC_CODE`` (bars are
    two standard errors), then attributes the gap between groups 1 and 2 to
    the features in ``cols``: a linear regression of the prediction on
    ``cols`` is fitted on group 1, and each feature's contribution is
    ``(group-1 mean - group-2 mean) * group-1 coefficient``.

    Parameters
    ----------
    X_val : pandas.DataFrame
        Validation features; must contain 'EEOC_CODE' and every column in ``cols``.
    y_pred : array-like
        Predictions, positionally aligned with the rows of ``X_val``.
    cols : list-like
        Feature columns used in the regression.

    Returns
    -------
    pandas.DataFrame
        Single column 'diff'. The first row (index 0) is the rounded
        difference in mean prediction between groups 1 and 2; the remaining
        rows are per-feature contributions, sorted in descending order.

    Raises
    ------
    ValueError
        If group 1 or group 2 has no rows.
    """
    X_val_eeo = X_val.reset_index()
    X_val_eeo = pd.concat([X_val_eeo, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1)

    by_code = X_val_eeo.groupby('EEOC_CODE')['y_pred']
    mean = by_code.mean()
    # Standard error of the mean: group std divided by sqrt(rows in the group).
    std_err = by_code.std() / np.sqrt(by_code.size())
    plt.errorbar(mean.index, mean, xerr=0.5, yerr=2 * std_err, linestyle='')
    plt.show()

    race1 = X_val_eeo[X_val_eeo['EEOC_CODE'] == 1]
    race2 = X_val_eeo[X_val_eeo['EEOC_CODE'] == 2]
    if race1.empty or race2.empty:
        raise ValueError("eeo_analysis needs rows for both EEOC_CODE 1 and 2")

    # Only the group-1 regression feeds the result. The former group-2 fit
    # was computed but never used, so it has been dropped.
    regr = linear_model.LinearRegression()
    regr.fit(race1[cols], race1['y_pred'])
    race1_coef = regr.coef_

    contributions = (race1[cols].mean() - race2[cols].mean()) * race1_coef
    tb = contributions.sort_values(ascending=False).to_frame('diff')

    avg_diff = round(race1['y_pred'].mean() - race2['y_pred'].mean(), 4)
    avg_diff = pd.DataFrame([avg_diff], columns=['diff'])
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat stacks the same rows in the same order.
    return pd.concat([avg_diff, tb])

In [4]:
# Load the raw CSV into `data`.
# NOTE(review): the cells below use `df`, `cols` and `df_cat`, none of which is
# created in the visible cells — the step that derives them from `data` must be
# restored for Restart & Run All to work.
data = pd.read_csv('data.csv', low_memory=False)

## Modeling

In [24]:
# Stratified 80/20 train/test split on the `performance` target.
# NOTE(review): `df` is not defined in the visible cells (only `data` is loaded).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['performance']),
    df['performance'],
    test_size=0.2,
    random_state=1234,
    stratify=df['performance'],
)


In [26]:
# Balanced logistic regression on every candidate feature set in `cols`:
# collect cross-validated and held-out ROC-AUC / PR-AUC for each set.
auc_cv, prauc_cv, auc_test, prauc_test = [], [], [], []
for i in range(len(cols)):
    log_model = linear_model.LogisticRegression(class_weight='balanced', random_state=1234)
    auc_cv_temp, prauc_cv_temp = cross_val(cols[i], log_model)
    auc_test_temp, prauc_test_temp = pred_model_res(
        X_train[cols[i]], y_train, X_test[cols[i]], y_test, log_model)
    auc_cv.append(auc_cv_temp)
    prauc_cv.append(prauc_cv_temp)
    auc_test.append(auc_test_temp)
    prauc_test.append(prauc_test_temp)

In [None]:
# PR-AUC per feature set: cross-validated on the first line, held-out test on the second.
print(prauc_cv, prauc_test, sep='\n')

In [29]:
# CV vs. test ROC-AUC of the logistic model across the candidate feature sets.
# NOTE(review): the x axis lists feature sets, yet the title says "K Selection" —
# it looks copied from the K-selection plot; confirm the intended title.
x_axis = ['1.Orig Var', '2.Log Var', '3.Min-Max Var', '4.Categorical Var', '5.Scale Var']
plt.plot(x_axis, auc_cv, label='CV AUC')
plt.plot(x_axis, auc_test, label='Test AUC')
plt.legend()
plt.title('Customer - K Selection for AUC Performance')

In [33]:
# K selection for cat
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# For every K from 30 up to the number of categorical columns: keep the K
# columns with the highest chi2 score, then record CV and test metrics of a
# balanced logistic regression on the reduced matrix.
auc_cv_k, prauc_cv_k, auc_test_k, prauc_test_k = [], [], [], []
for i in range(30, X_train[cols[3]].shape[1] + 1):
    log_model = linear_model.LogisticRegression(class_weight='balanced', random_state=1234)
    var_select = SelectKBest(score_func=chi2, k=i)
    fit = var_select.fit(X_train[cols[3]], y_train)
    X_train_trans = fit.transform(X_train[cols[3]])
    X_test_trans = fit.transform(X_test[cols[3]])

    auc_cv_temp, prauc_cv_temp = cross_val_select(X_train_trans, y_train, log_model)
    auc_test_temp, prauc_test_temp = pred_model_res(
        X_train_trans, y_train, X_test_trans, y_test, log_model)

    auc_cv_k.append(auc_cv_temp)
    prauc_cv_k.append(prauc_cv_temp)
    auc_test_k.append(auc_test_temp)
    prauc_test_k.append(prauc_test_temp)

In [None]:
# CV vs. test ROC-AUC as a function of the number of selected features K.
plt.plot(list(range(30, len(cols[3]) + 1)), auc_cv_k, label='CV AUC')
plt.plot(list(range(30, len(cols[3]) + 1)), auc_test_k, label='Test AUC')
plt.legend()
plt.title('customer - Categorized - K Selection for AUC Performance')

In [None]:
# auc_cv_k[0] was measured with K=30, so list position p corresponds to K = 30 + p.
# (The previous `+ 30 + 1` selected one feature more than the best-scoring K.)
k = int(np.argmax(auc_cv_k)) + 30

# Rank the categorical columns by chi2 score in this cell, rather than reading
# `featureScores`, which is only built in a later cell (and that cell needs `k`).
chi2_scores = pd.Series(
    SelectKBest(score_func=chi2, k=k).fit(X_train[cols[3]], y_train).scores_,
    index=X_train[cols[3]].columns,
)
best_k = chi2_scores.sort_values(ascending=False).index[:k].values

In [41]:
# Fit the chi2 selector with the chosen K and export every column's score.
var_select = SelectKBest(score_func=chi2, k=k)
fit = var_select.fit(X_train[cols[3]], y_train)

# One row per categorical column: its name ('Var') and chi2 score ('Score').
featureScores = pd.DataFrame({
    'Var': X_train[cols[3]].columns,
    'Score': fit.scores_,
})
featureScores.to_csv('log_cat_K_select_79k.csv')

In [None]:
# Final balanced logistic regression on the selected `best_k` columns, scored on the test set.
log_model = linear_model.LogisticRegression(random_state = 1234, class_weight='balanced')
log_model = log_model.fit(X_train[best_k],y_train)
y_pred = log_model.predict_proba(X_test[best_k])[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred)
print('AUC: ', metrics.auc(fpr, tpr))
precision, recall, thresholds = metrics.precision_recall_curve(y_test,y_pred)
# `reorder=True` was deprecated in scikit-learn 0.20 and removed in 0.22;
# recall is already monotonic, so metrics.auc accepts it directly.
print('PR-AUC: ', metrics.auc(recall, precision))

In [120]:
## save log_cat model
filename = 'logistic_model.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open() previously left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(log_model, model_file)

#### Random Forest

##### Parameter Tuning

Categorical variables

In [None]:
np.random.seed(1234)
# Only max_depth is searched in this run; the commented entries are grids tried earlier.
param_grid = {
               #'min_samples_leaf' : [50,56,57,58,59,60,61,62,63,64,65] #[95,97,98,99,100,111,112,113,114,115]#
               #'n_estimators':[10, 150, 200, 300, 500, 1000],
               #'criterion':['gini', 'entropy'],
               'max_depth': [1,2,3,4,5,6,7,8,9,10,15,20],
               #'max_features': ["auto", "sqrt", "log2", None],
               #'min_samples_split': [0.001,0.002,0.003,0.004]
}
# random_state is set on the estimator itself: the search runs with n_jobs=-1,
# and the global numpy seed above is not a reliable seed for forests fitted
# in worker processes, so scores would otherwise vary between runs.
model_rf = search_model(X_train[cols[3]], y_train, est = RandomForestClassifier(
    n_estimators = 500,
    #criterion = 'entropy',
    #max_depth = 7,
    #min_samples_leaf = 57,
    #max_features = 'sqrt',
    #min_samples_split = 0.003,
    class_weight="balanced",
    random_state=1234), param_grid = param_grid)

In [None]:
# Random forest with the hand-picked hyper-parameters. random_state makes the
# CV/test scores, and the model pickled afterwards, reproducible regardless of
# the state earlier cells left in the global RNG.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=100,
    max_features=None,
    min_samples_split=0.05,
    class_weight="balanced",
    random_state=1234)

auc_cv_temp, prauc_cv_temp = cross_val(cols[3], rf)
auc_test_temp, prauc_test_temp = pred_model_res(X_train[cols[3]], y_train, X_test[cols[3]], y_test, rf)

# NOTE(review): these lists were filled by the logistic feature-set loop.
# Appending here is not idempotent — re-running this cell adds another entry,
# and the feature-set plot then no longer matches its five x labels.
auc_cv.append(auc_cv_temp)
prauc_cv.append(prauc_cv_temp)
auc_test.append(auc_test_temp)
prauc_test.append(prauc_test_temp)
print(auc_cv_temp, prauc_cv_temp, auc_test_temp, prauc_test_temp)

In [27]:
## save model
filename = 'randomforest_cat.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open() previously left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

#### XGBoost

In [17]:
import xgboost as xgb

# Display the installed xgboost version as the cell output, to record the
# library version this analysis was run with.
xgb.__version__

'0.81'

In [74]:
from bayes_opt import BayesianOptimization
import xgboost as xgb

# Training DMatrix on the categorical feature set, with labels as an (n_rows, 1) column.
# NOTE(review): no visible cell reads `xgtrain`; xgb_evaluate builds its own DMatrix.
xgtrain = xgb.DMatrix(
    data=X_train[cols[3]],
    label=y_train.values.reshape(X_train[cols[3]].shape[0], 1),
)


def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma):
    """Objective for the Bayesian optimiser: best 5-fold CV AUC for one parameter set.

    Parameters
    ----------
    min_child_weight, max_depth : float
        Proposed values; truncated to int before being passed to xgboost.
    colsample_bytree, subsample, gamma : float
        Passed to xgboost unchanged.

    Returns
    -------
    float
        Maximum of the 'test-auc-mean' column returned by ``xgb.cv``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train`` and ``cols``.
    """
    # `verbose_eval` used to be set here, but it is an argument of xgb.cv /
    # xgb.train rather than a booster parameter, so it has been removed.
    params = {
        'objective': 'binary:logistic',
        'eta': 0.01,
        'max_depth': int(max_depth),
        'min_child_weight': int(min_child_weight),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'gamma': gamma,
    }

    features = X_train[cols[3]]
    # NOTE(review): missing=-1 is used only here; the later xgb.cv / xgb.train
    # cells build their DMatrix without it — confirm which is intended.
    dtrain = xgb.DMatrix(features,
                         label=y_train.values.reshape(features.shape[0], 1),
                         missing=-1)

    cv_results = xgb.cv(params, dtrain,
                        num_boost_round=100000,
                        nfold=5,
                        metrics={'auc'},
                        maximize=True,
                        stratified=True,
                        shuffle=True,
                        seed=1234,
                        early_stopping_rounds=50)

    return cv_results['test-auc-mean'].max()


In [None]:
# Search bounds for each hyper-parameter. random_state seeds the optimiser so
# the explored points can be reproduced; a later cell hard-codes the best
# parameters found here, so the search should be repeatable.
# NOTE(review): assumes the installed bayes_opt accepts `random_state` — confirm.
xgb_BO = BayesianOptimization(xgb_evaluate,
                              {'max_depth': (6, 20),
                               'min_child_weight': (0, 30),
                               'colsample_bytree': (0.1, 1),
                               'subsample': (0.7, 1),
                               'gamma': (0, 2)
                               },
                              random_state=1234
                              )

xgb_BO.maximize(init_points=5, n_iter=40)

In [None]:
# Tabulate every evaluated parameter set with its score, best first.
_bo_res = xgb_BO.res
if isinstance(_bo_res, dict):
    # bayes_opt < 1.0: {'all': {'params': [...], 'values': [...]}, 'max': {...}}
    xgb_BO_scores = pd.DataFrame(_bo_res['all']['params'])
    xgb_BO_scores['score'] = list(_bo_res['all']['values'])
else:
    # bayes_opt >= 1.0: list of {'target': ..., 'params': {...}} (res['all'] no longer exists)
    xgb_BO_scores = pd.DataFrame([step['params'] for step in _bo_res])
    xgb_BO_scores['score'] = [step['target'] for step in _bo_res]
xgb_BO_scores = xgb_BO_scores.sort_values(by='score', ascending=False)
xgb_BO_scores.head()

In [None]:
# train model with bayes optimation
import xgboost as xgb
# Best parameters reported by the Bayesian optimisation.
xgb_params = {
    'max_depth': 6,
    # xgb_evaluate truncates min_child_weight with int() before calling xgboost,
    # so the score reported for 29.503411 was obtained with 29. The same
    # truncation is applied here to reproduce the tuned configuration.
    'min_child_weight': int(29.503411),
    'subsample': 0.941049,
    'colsample_bytree': 0.314091,
    'gamma': 1.994352,
    'objective': 'binary:logistic',
    'eta': 0.01,
    'seed': 1234}

print(xgb_params)

# 5-fold stratified CV with early stopping to find the number of boosting rounds.
# NOTE(review): xgb_evaluate built its DMatrix with missing=-1; this one does not — confirm.
cv_results = xgb.cv(xgb_params, xgb.DMatrix(X_train[cols[3]], label=y_train),
                    num_boost_round=1000000,
                    nfold=5,
                    maximize=True,
                    stratified=True,
                    shuffle=True,
                    verbose_eval=20,
                    seed=1234,
                    early_stopping_rounds=100,
                    metrics={'auc'})

In [None]:
# Number of rows in the CV results and the best mean test AUC among them.
best_xgb_iteration = cv_results.shape[0]
best_xgb_score = cv_results['test-auc-mean'].max()

print('best score:', best_xgb_score, 'best iterations:', best_xgb_iteration)

In [None]:
# Train on the full training set for the number of rounds found by CV, then score the test set.
model = xgb.train(xgb_params,
                  xgb.DMatrix(X_train[cols[3]], label=y_train),
                  num_boost_round=best_xgb_iteration
                 )

y_pred = model.predict(xgb.DMatrix(X_test[cols[3]]))

fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred)
print('AUC: ', metrics.auc(fpr, tpr))
precision, recall, thresholds = metrics.precision_recall_curve(y_test,y_pred)
# `reorder=True` was deprecated in scikit-learn 0.20 and removed in 0.22;
# recall is already monotonic, so metrics.auc accepts it directly.
print('PR-AUC: ', metrics.auc(recall, precision))
# NOTE(review): `df_cat` is not defined in the visible cells. The concat aligns
# on index labels and adds another 'xgb_cat' column on every re-run.
df_cat = pd.concat([df_cat, pd.DataFrame(y_pred, columns=['xgb_cat']).reset_index(drop=True)],axis = 1)

In [None]:
# Manual 5-fold CV of the tuned XGBoost configuration on the categorical feature set.
# drop=True gives a clean 0..n-1 index whatever the original index was called.
X_copy = X_train.reset_index(drop=True)
y_copy = y_train.reset_index(drop=True)
kf = KFold(n_splits=5, random_state=1234, shuffle=True)  # 5 shuffled folds

auc = []
pr_auc = []
# NOTE(review): the loop rebinds `model` and `y_pred`, so after this cell they
# hold the last fold's booster and predictions, not the full-data ones.
for train_index, test_index in kf.split(X_copy):
    X_tr, X_t = X_copy.loc[train_index, cols[3]], X_copy.loc[test_index, cols[3]]
    y_tr, y_t = y_copy.iloc[train_index], y_copy.iloc[test_index]
    model = xgb.train(xgb_params,
                      xgb.DMatrix(X_tr, label=y_tr.values.ravel()),
                      num_boost_round=best_xgb_iteration
                     )
    y_pred = model.predict(xgb.DMatrix(X_t))
    fpr, tpr, thresholds = metrics.roc_curve(y_t, y_pred)
    precision, recall, thresholds = metrics.precision_recall_curve(y_t, y_pred)
    auc.append(metrics.auc(fpr, tpr))
    # `reorder=True` was deprecated in scikit-learn 0.20 and removed in 0.22.
    pr_auc.append(metrics.auc(recall, precision))
np.mean(auc)