In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [211]:
X_train_sc_df = pd.read_csv('../Warren/X_train_sc.csv')
X_train_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,seniorcitizen,tenure,monthlycharges,totalcharges
0,0,1,1,1,0,0,2,2,0,2,0,0,0,0,0,-0.427069,-0.017944,-0.107867,-0.183246
1,0,1,0,1,2,1,2,0,0,2,2,0,0,1,1,-0.427069,-0.875263,1.056297,-0.468087
2,0,1,1,1,0,2,1,1,1,1,1,1,2,0,1,-0.427069,1.165973,-1.501185,-0.428393
3,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,-0.427069,-1.038562,0.174811,-0.781296
4,0,0,1,1,0,2,1,1,1,1,1,1,1,0,1,-0.427069,0.676076,-1.51122,-0.601398


In [None]:
X_train_sc_df.corr()

In [5]:
X_val_sc_df = pd.read_csv('../Warren/X_val_sc.csv')
X_val_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,seniorcitizen,tenure,monthlycharges,totalcharges
0,0,1,1,1,0,2,1,1,1,1,1,1,2,0,3,-0.427069,0.022881,-1.489476,-0.712634
1,1,1,1,1,2,0,2,0,0,2,2,2,0,0,0,-0.427069,-0.344542,0.547812,-0.157005
2,0,0,0,1,0,0,2,0,0,2,2,0,0,1,2,-0.427069,-1.120212,-0.009181,-0.853459
3,1,0,0,1,0,1,0,0,0,2,0,0,0,0,2,-0.427069,-1.28351,0.313641,-0.976974
4,0,0,0,0,1,0,2,2,2,2,2,2,2,1,0,-0.427069,1.370097,-0.071069,0.823734


In [6]:
y_train_enc_df = pd.read_csv('../Warren/y_train_enc.csv')
y_train_enc_df.head()

Unnamed: 0,churn
0,0
1,1
2,0
3,0
4,0


In [7]:
y_val_enc_df = pd.read_csv('../Warren/y_val_enc.csv')
y_val_enc_df.head()

Unnamed: 0,churn
0,0
1,0
2,0
3,0
4,0


In [69]:
logreg = LogisticRegression(penalty= 'l1', C=0.2, solver='liblinear', random_state=10)
logreg.fit(X_train_sc_df, y_train_enc_df)
train_preds = logreg.predict(X_train_sc_df)
val_preds = logreg.predict(X_val_sc_df)
print('Training Precision: ', precision_score(y_train_enc_df, train_preds))
print('Validation Precision: ', precision_score(y_val_enc_df, val_preds))
print('Training F1: ', f1_score(y_train_enc_df, train_preds))
print('Validation F1: ', f1_score(y_val_enc_df, val_preds))

Training Precision:  0.6625560538116592
Validation Precision:  0.6355140186915887
Training F1:  0.606776180698152
Validation F1:  0.6080476900149031


  return f(**kwargs)


In [139]:
def modeling_function(model, X_train, y_train, X_val, y_val):

    # fit model on training data
    model.fit(X_train, y_train)

    # make predictions on training and validation data
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Print accuracy score
    print('Training Precision: ', precision_score(y_train, train_preds))
    print('Validation Precision: ', precision_score(y_val, val_preds))
    print('Training F1: ', f1_score(y_train, train_preds))
    print('Validation F1: ', f1_score(y_val, val_preds))
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Validation Accuracy: ', accuracy_score(y_val, val_preds))
    # return fitted model
    return model

In [34]:
modeling_function(logreg, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6625560538116592
Validation Precision:  0.6355140186915887
Training F1:  0.606776180698152
Validation F1:  0.6080476900149031


  return f(**kwargs)


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

In [35]:
sm = SMOTE(random_state=10)

In [36]:
X_train_res, y_train_res = sm.fit_resample(X_train_sc_df, y_train_enc_df)

In [37]:
y_train_res.value_counts()

churn
1        2905
0        2905
dtype: int64

In [38]:
modeling_function(logreg, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.7709779179810725
Validation Precision:  0.5027726432532348
Training F1:  0.8046090534979423
Validation F1:  0.6105499438832772


  return f(**kwargs)


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

In [57]:
#ridge_model = Ridge(random_state=10)
#ridge_model.fit(X_train_sc_df, y_train_enc_df)
#train_preds_ridge = ridge_model.predict(X_train_sc_df)
#val_preds_ridge = ridge_model.predict(X_val_sc_df)
#print('Training Precision: ', precision_score(y_train_enc_df, train_preds_ridge_sm))
#print('Validation Precision: ', precision_score(y_val_enc_df, val_preds_ridge_sm))
#print('Training F1: ', f1_score(y_train_enc_df, train_preds_ridge))
#print('Validation F1: ', f1_score(y_val_enc_df, val_preds_ridge))

In [56]:
#train_preds_ridge_sm = ridge_model.predict(X_train_res)
#val_preds_ridge_sm = ridge_model.predict(X_val_sc_df)
#print('Training Precision: ', precision_score(y_train_res, train_preds_ridge_sm))
#print('Validation Precision: ', precision_score(y_val_enc_df, val_preds_ridge_sm))
#print('Training F1: ', f1_score(y_train_res, train_preds_ridge_sm))
#print('Validation F1: ', f1_score(y_val_enc_df, val_preds_ridge_sm))

In [58]:
#modeling_function(ridge_model, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

In [55]:
knn = KNeighborsClassifier()

In [59]:
modeling_function(knn, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

  model.fit(X_train, y_train)


Training Precision:  0.8140082401412596
Validation Precision:  0.4551971326164875
Training F1:  0.877677296525464
Validation F1:  0.5594713656387665


KNeighborsClassifier()

In [60]:
modeling_function(knn, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

  model.fit(X_train, y_train)


Training Precision:  0.6995753715498938
Validation Precision:  0.5714285714285714
Training F1:  0.6596596596596596
Validation F1:  0.5476190476190477


KNeighborsClassifier()

In [61]:
tree = DecisionTreeClassifier(random_state=10)

In [62]:
modeling_function(tree, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.9996549344375432
Validation Precision:  0.46064814814814814
Training F1:  0.9984490780630708
Validation F1:  0.5089514066496164


DecisionTreeClassifier(random_state=10)

In [63]:
modeling_function(tree, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  1.0
Validation Precision:  0.49441340782122906
Training F1:  0.9957203994293866
Validation F1:  0.5


DecisionTreeClassifier(random_state=10)

In [64]:
forest = RandomForestClassifier(random_state=10)

In [65]:
modeling_function(forest, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

  model.fit(X_train, y_train)


Training Precision:  0.9975945017182131
Validation Precision:  0.5754475703324808
Training F1:  0.9984522785898539
Validation F1:  0.6072874493927126


RandomForestClassifier(random_state=10)

In [66]:
modeling_function(forest, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

  model.fit(X_train, y_train)


Training Precision:  0.9943342776203966
Validation Precision:  0.6630824372759857
Training F1:  0.9957446808510638
Validation F1:  0.5882352941176471


RandomForestClassifier(random_state=10)

In [70]:
param_grid_knn = {'n_neighbors': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
                  'leaf_size': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
                  'metric': ['minkowski', 'euclidean', 'manhattan']}

In [71]:
gs_knn = GridSearchCV(knn, param_grid_knn, scoring='precision')

In [78]:
gs_knn.fit(X_train_res, y_train_res)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45,
                                       50],
                         'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45,
                                         50]},
             scoring='precision')

In [79]:
gs_knn.best_params_

{'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 3}

In [81]:
best_knn_sm = gs_knn.best_estimator_

In [82]:
modeling_function(best_knn_sm, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.8607867240319607
Validation Precision:  0.4749034749034749
Training F1:  0.9095632407858418
Validation F1:  0.5668202764976958


KNeighborsClassifier(leaf_size=10, metric='manhattan', n_neighbors=3)

In [84]:
gs_knn_no_sm = gs_knn.fit(X_train_sc_df, y_train_enc_df)

In [75]:
gs_knn_no_sm.best_params_

{'leaf_size': 3, 'metric': 'manhattan', 'n_neighbors': 20}

In [76]:
best_knn = gs_knn.best_estimator_

In [77]:
modeling_function(best_knn, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6662983425414365
Validation Precision:  0.6209150326797386
Training F1:  0.6149923508414075
Validation F1:  0.5792682926829268


KNeighborsClassifier(leaf_size=3, metric='manhattan', n_neighbors=20)

In [85]:
param_grid_forest = {'n_estimators': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
                     'criterion': ['gini', 'entropy'],
                     'max_depth': [3, 5, 8, 10, 15, 20],
                     'min_samples_split': [2, 4, 6, 8, 10],
                     'min_samples_leaf': [2, 4, 6, 8, 10]}

In [86]:
gs_forest = GridSearchCV(forest, param_grid_forest, scoring='precision')

In [87]:
gs_forest.fit(X_train_res, y_train_res)

In [88]:
gs_forest.best_params_

{'criterion': 'gini',
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 30}

In [89]:
best_random = gs_forest.best_estimator_

In [90]:
modeling_function(best_random, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.9241086032057573
Validation Precision:  0.5606407322654462
Training F1:  0.9476685675947668
Validation F1:  0.6226175349428209


RandomForestClassifier(max_depth=15, min_samples_leaf=2, n_estimators=30,
                       random_state=10)

In [91]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [92]:
import xgboost as xgb

In [207]:
 boost_model = xgb.XGBClassifier(early_stopping_rounds=10, eta=.1, gamma=.1, alpha=1, max_leaves=10)

In [208]:
modeling_function(boost_model, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Training Precision:  0.8783697047496791
Validation Precision:  0.5652173913043478
Training F1:  0.9091513037701378
Validation F1:  0.6277001270648029
Training Accuracy:  0.9058519793459553
Validation Accuracy:  0.7781983345950038


XGBClassifier(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=10,
              eta=0.1, gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=6, max_leaves=10, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, random_state=0, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [202]:
modeling_function(boost_model, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Training Precision:  0.8458149779735683
Validation Precision:  0.6411960132890365
Training F1:  0.7820773930753563
Validation F1:  0.5929339477726574
Training Accuracy:  0.8919464781620803
Validation Accuracy:  0.7993943981831946


XGBClassifier(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=10,
              eta=0.1, gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=6, max_leaves=6, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, random_state=0, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [105]:
logreg.coef_

array([[-0.09542229,  0.        , -0.20908884, -1.00181051,  0.10036993,
         0.22760317, -0.27529663, -0.1832553 , -0.10676363, -0.24078737,
         0.00694504,  0.        , -0.76995869,  0.26026604,  0.04381296,
         0.10776281, -1.13071954,  0.73786007,  0.36940274]])

In [107]:
logreg_coefs = pd.DataFrame(np.abs(logreg.coef_.T), columns=['coefficient'], index=X_train_sc_df.columns)
logreg_coefs.sort_values('coefficient')

Unnamed: 0,coefficient
partner,0.0
streamingmovies,0.0
streamingtv,0.006945
paymentmethod,0.043813
gender,0.095422
multiplelines,0.10037
deviceprotection,0.106764
seniorcitizen,0.107763
onlinebackup,0.183255
dependents,0.209089


In [212]:
boost_params = {'eta': [.01, .1, 1, 2, 5],
                'gamma': [.01, .1, 1, 2, 5],
                'alpha': [.01, .1, 1, 2, 5],
                'max_leaves': [2, 5, 10, 15, 20],
                'max_depth': [1, 5, 10, 15, 20, 25]}

In [213]:
gs_boost = GridSearchCV(boost_model, boost_params, scoring='precision')