In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [10]:
X_train_sc_df = pd.read_csv('../Warren/X_train_sc.csv')
X_train_sc_df.drop('customerid', axis=1, inplace=True)
X_train_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,...,paperlessbilling,paymentmethod,single_parent,phone_and_internet,seniorcitizen,tenure,monthlycharges,totalcharges,totchg_per_tenure,monthly_div_tot
0,0,1,1,1,0,0,2,2,0,2,...,0,0,1,1,0,-0.017944,-0.107867,-0.183246,-0.209143,-0.438551
1,0,1,0,1,2,1,2,0,0,2,...,1,1,0,1,0,-0.875263,1.056297,-0.468087,1.547428,-0.274448
2,0,1,1,1,0,2,1,1,1,1,...,0,1,1,0,0,1.165973,-1.501185,-0.428393,-1.431795,-0.502593
3,0,0,0,1,0,1,0,0,0,0,...,1,0,0,1,0,-1.038562,0.174811,-0.781296,0.301796,-0.070922
4,0,0,1,1,0,2,1,1,1,1,...,0,1,0,0,0,0.676076,-1.51122,-0.601398,-1.521564,-0.480899


In [11]:
X_val_sc_df = pd.read_csv('../Warren/X_val_sc.csv')
X_val_sc_df.drop('customerid', axis=1, inplace=True)
X_val_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,...,paperlessbilling,paymentmethod,single_parent,phone_and_internet,seniorcitizen,tenure,monthlycharges,totalcharges,totchg_per_tenure,monthly_div_tot
0,0,1,1,1,0,2,1,1,1,1,...,0,3,1,0,0,0.022881,-1.489476,-0.712634,-1.4713,-0.449101
1,1,1,1,1,2,0,2,0,0,2,...,0,0,1,1,0,-0.344542,0.547812,-0.157005,0.518759,-0.405672
2,0,0,0,1,0,0,2,0,0,2,...,1,2,0,1,0,-1.120212,-0.009181,-0.853459,0.198523,0.097378
3,1,0,0,1,0,1,0,0,0,2,...,0,2,0,1,0,-1.28351,0.313641,-0.976974,0.313909,3.02712
4,0,0,0,0,1,0,2,2,2,2,...,1,0,0,0,0,1.370097,-0.071069,0.823734,-0.062752,-0.502407


In [5]:
y_train_enc_df = pd.read_csv('../Warren/y_train_enc.csv')
y_train_enc_df.head()

Unnamed: 0,churn
0,0
1,1
2,0
3,0
4,0


In [6]:
y_val_enc_df = pd.read_csv('../Warren/y_val_enc.csv')
y_val_enc_df.head()

Unnamed: 0,churn
0,0
1,0
2,0
3,0
4,0


In [7]:
def modeling_function(model, X_train, y_train, X_val, y_val):

    # fit model on training data
    model.fit(X_train, y_train)

    # make predictions on training and validation data
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Print accuracy score
    print('Training Precision: ', precision_score(y_train, train_preds))
    print('Validation Precision: ', precision_score(y_val, val_preds))
    print('Training F1: ', f1_score(y_train, train_preds))
    print('Validation F1: ', f1_score(y_val, val_preds))
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Validation Accuracy: ', accuracy_score(y_val, val_preds))
    # return fitted model
    return model

In [15]:
logreg = LogisticRegression(random_state=10)

In [16]:
modeling_function(logreg, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6840882694541232
Validation Precision:  0.66996699669967
Training F1:  0.6145018257694315
Validation F1:  0.6217457886676875
Training Accuracy:  0.8134309517798536
Validation Accuracy:  0.8130204390613172


LogisticRegression(random_state=10)

In [17]:
logreg_tune = LogisticRegression(penalty= 'l1', C=0.2, solver='liblinear', random_state=10)

In [18]:
modeling_function(logreg_tune, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6923076923076923
Validation Precision:  0.67
Training F1:  0.6206896551724138
Validation F1:  0.6184615384615385
Training Accuracy:  0.8167129512749306
Validation Accuracy:  0.8122634367903103


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

In [19]:
sm = SMOTE(random_state=10)

In [20]:
X_train_res, y_train_res = sm.fit_resample(X_train_sc_df, y_train_enc_df)

In [25]:
y_train_enc_df.value_counts(normalize=True)

churn
0        0.733401
1        0.266599
dtype: float64

In [26]:
y_train_res.value_counts(normalize=True)

churn
1        0.5
0        0.5
dtype: float64

In [21]:
modeling_function(logreg_tune, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.7742238946378175
Validation Precision:  0.5143953934740882
Training F1:  0.8103052182474565
Validation F1:  0.6153846153846153
Training Accuracy:  0.8010327022375215
Validation Accuracy:  0.7464042392127176


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

In [30]:
logreg_tune_coef = pd.DataFrame(np.abs(logreg_tune.coef_.T), columns=['coefficient'], index=X_train_res.columns)

In [41]:
logreg_tune_coef

Unnamed: 0,coefficient
gender,0.427574
partner,0.267539
dependents,0.687879
phoneservice,0.0
multiplelines,0.105697
internetservice,0.025147
onlinesecurity,0.35101
onlinebackup,0.267458
deviceprotection,0.108715
techsupport,0.273795


In [56]:
#weights = {'gender':1.0, 'partner':1.0, 'dependents':1.0, 'phoneservice':0.0, 'multiplelines':1.0, 'internetservice':1.0, 'onlinesecurity':1.0, 'deviceprotection':1.0, 'techsupport':1.0, 10:1.0, 11:1.0, 12:1.0, 13:1.0, 14:0.0, 15:1.0, 16:0.0 ,17:2.0, 18:1.0, 19:1.0, 20:2.0, 21:1.0, 22:1.0, 23:1.0}

In [64]:
weights = {0:1.0, 1:3.0}

In [65]:
param_grid = {'penalty': ['l1', 'l2'],
              'C': [.001, .01, .1, 1],
              'class_weight': [None, weights],
              'max_iter': [10, 50, 100, 200]}

In [66]:
gs = GridSearchCV(logreg, param_grid, scoring='precision')

In [67]:
gs.fit(X_train_res, y_train_res)

GridSearchCV(estimator=LogisticRegression(random_state=10),
             param_grid={'C': [0.001, 0.01, 0.1, 1],
                         'class_weight': [None, {0: 1.0, 1: 3.0}],
                         'max_iter': [10, 50, 100, 200],
                         'penalty': ['l1', 'l2']},
             scoring='precision')

In [69]:
gs.best_params_

{'C': 1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2'}

In [53]:
modeling_function(best_logreg, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.7769423558897243
Validation Precision:  0.5144508670520231
Training F1:  0.8135148433655897
Validation F1:  0.6144994246260069
Training Accuracy:  0.804302925989673
Validation Accuracy:  0.7464042392127176


LogisticRegression(C=1, random_state=10)

In [62]:
logreg_weights = LogisticRegression(random_state=10, class_weight=weights, C=0.2)

In [63]:
modeling_function(logreg_weights, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.7754782063342741
Validation Precision:  0.5114942528735632
Training F1:  0.8116179849031835
Validation F1:  0.6123853211009174
Training Accuracy:  0.8024096385542169
Validation Accuracy:  0.7441332323996972


LogisticRegression(C=0.2, class_weight={0: 2.0, 1: 2.0}, random_state=10)

In [35]:
import xgboost as xgb

In [37]:
boost = xgb.XGBClassifier()

In [39]:
modeling_function(boost, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.9613899613899614
Validation Precision:  0.631578947368421
Training F1:  0.9521988527724665
Validation F1:  0.5871559633027522
Training Accuracy:  0.9747538500378692
Validation Accuracy:  0.7956093868281605


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)