# Notebook for Creating the Best Models With all the Features in the DF

Importing packages and bringing in the data.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
X_train_sc_df = pd.read_csv('../Warren/X_train_sc.csv')
X_train_sc_df.drop('customerid', axis=1, inplace=True)
X_train_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,...,single_parent,phone_and_internet,seniorcitizen,tenure,monthlycharges,totalcharges,totchg_per_tenure,monthly_div_tot,num_of_internet_services,num_of_services
0,0,1,1,1,0,0,2,2,0,2,...,1,1,0,-0.017944,-0.107867,-0.183246,-0.209143,-0.438551,0.854371,0.531661
1,0,1,0,1,2,1,2,0,0,2,...,0,1,0,-0.875263,1.056297,-0.468087,1.547428,-0.274448,0.854371,0.531661
2,0,1,1,1,0,2,1,1,1,1,...,1,0,0,1.165973,-1.501185,-0.428393,-1.431795,-0.502593,-1.095781,-1.107179
3,0,0,0,1,0,1,0,0,0,0,...,0,1,0,-1.038562,0.174811,-0.781296,0.301796,-0.070922,-1.095781,-1.107179
4,0,0,1,1,0,2,1,1,1,1,...,0,0,0,0.676076,-1.51122,-0.601398,-1.521564,-0.480899,-1.095781,-1.107179


In [5]:
X_val_sc_df = pd.read_csv('../Warren/X_val_sc.csv')
X_val_sc_df.drop('customerid', axis=1, inplace=True)
X_val_sc_df.head()

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,...,single_parent,phone_and_internet,seniorcitizen,tenure,monthlycharges,totalcharges,totchg_per_tenure,monthly_div_tot,num_of_internet_services,num_of_services
0,0,1,1,1,0,2,1,1,1,1,...,1,0,0,0.022881,-1.489476,-0.712634,-1.4713,-0.449101,-1.095781,-1.107179
1,1,1,1,1,2,0,2,0,0,2,...,1,1,0,-0.344542,0.547812,-0.157005,0.518759,-0.405672,1.504422,1.077941
2,0,0,0,1,0,0,2,0,0,2,...,0,1,0,-1.120212,-0.009181,-0.853459,0.198523,0.097378,0.854371,0.531661
3,1,0,0,1,0,1,0,0,0,2,...,0,1,0,-1.28351,0.313641,-0.976974,0.313909,3.02712,-0.44573,-0.560899
4,0,0,0,0,1,0,2,2,2,2,...,0,0,0,1.370097,-0.071069,0.823734,-0.062752,-0.502407,2.154473,2.170501


In [6]:
y_train_enc_df = pd.read_csv('../Warren/y_train_enc.csv')
y_train_enc_df.head()

Unnamed: 0,churn
0,0
1,1
2,0
3,0
4,0


In [7]:
y_val_enc_df = pd.read_csv('../Warren/y_val_enc.csv')
y_val_enc_df.head()

Unnamed: 0,churn
0,0
1,0
2,0
3,0
4,0


# Modeling

Creating a function to fit, predict, and return scores for my models. 

In [8]:
def modeling_function(model, X_train, y_train, X_val, y_val):

    # fit model on training data
    model.fit(X_train, y_train)

    # make predictions on training and validation data
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Print accuracy score
    print('Training Precision: ', precision_score(y_train, train_preds))
    print('Validation Precision: ', precision_score(y_val, val_preds))
    print('Training F1: ', f1_score(y_train, train_preds))
    print('Validation F1: ', f1_score(y_val, val_preds))
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Validation Accuracy: ', accuracy_score(y_val, val_preds))
    # return fitted model
    return model

### Initial Logistic Regression Model

In [9]:
logreg = LogisticRegression(random_state=10)

In [10]:
modeling_function(logreg, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6840882694541232
Validation Precision:  0.6666666666666666
Training F1:  0.6145018257694315
Validation F1:  0.6186830015313934
Training Accuracy:  0.8134309517798536
Validation Accuracy:  0.8115064345193036


LogisticRegression(random_state=10)

#### Tuning the Logistic Regression

In [11]:
logreg_tune = LogisticRegression(penalty= 'l1', C=0.2, solver='liblinear', random_state=10)

In [12]:
modeling_function(logreg_tune, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.6915017462165308
Validation Precision:  0.67
Training F1:  0.6203655352480417
Validation F1:  0.6184615384615385
Training Accuracy:  0.8164604897753093
Validation Accuracy:  0.8122634367903103


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

#### Smoting the data to see if it helped the scores (It made the scores worse for validation)

In [13]:
sm = SMOTE(random_state=10)

In [14]:
X_train_res, y_train_res = sm.fit_resample(X_train_sc_df, y_train_enc_df)

In [15]:
y_train_enc_df.value_counts(normalize=True)

churn
0        0.733401
1        0.266599
dtype: float64

In [16]:
y_train_res.value_counts(normalize=True)

churn
1        0.5
0        0.5
dtype: float64

In [17]:
modeling_function(logreg_tune, X_train_res, y_train_res, X_val_sc_df, y_val_enc_df)

Training Precision:  0.7737386399247884
Validation Precision:  0.5134099616858238
Training F1:  0.81003937007874
Validation F1:  0.6146788990825688
Training Accuracy:  0.8006884681583477
Validation Accuracy:  0.7456472369417109


LogisticRegression(C=0.2, penalty='l1', random_state=10, solver='liblinear')

In [18]:
logreg_tune_coef = pd.DataFrame(np.abs(logreg_tune.coef_.T), columns=['coefficient'], index=X_train_res.columns)

In [19]:
logreg_tune_coef

Unnamed: 0,coefficient
gender,0.419721
partner,0.266899
dependents,0.697416
phoneservice,0.0
multiplelines,0.111374
internetservice,0.029988
onlinesecurity,0.344061
onlinebackup,0.253006
deviceprotection,0.110095
techsupport,0.243313


#### Running a Grid Search for the Logistic Regression model

In [20]:
weights = {0:1.0, 1:1.0}

In [21]:
param_grid = {'penalty': ['l1', 'l2'],
              'C': [.001, .01, .1, 1],
              'class_weight': [None, weights],
              'max_iter': [10, 50, 100, 200]}

In [22]:
gs = GridSearchCV(logreg, param_grid, scoring='precision')

In [23]:
gs.fit(X_train_res, y_train_res)

GridSearchCV(estimator=LogisticRegression(random_state=10),
             param_grid={'C': [0.001, 0.01, 0.1, 1],
                         'class_weight': [None, {0: 1.0, 1: 1.0}],
                         'max_iter': [10, 50, 100, 200],
                         'penalty': ['l1', 'l2']},
             scoring='precision')

In [24]:
gs.best_params_

{'C': 1, 'class_weight': None, 'max_iter': 200, 'penalty': 'l2'}

Changing the weights and regularization

In [25]:
logreg_weights = LogisticRegression(random_state=10, class_weight=weights, C=.001)

In [26]:
modeling_function(logreg_weights, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.8067226890756303
Validation Precision:  0.7608695652173914
Training F1:  0.40764331210191085
Validation F1:  0.430327868852459
Training Accuracy:  0.7886897248169654
Validation Accuracy:  0.789553368660106


LogisticRegression(C=0.001, class_weight={0: 1.0, 1: 1.0}, random_state=10)

#### Creating a XGBoost model

In [27]:
import xgboost as xgb

In [28]:
boost = xgb.XGBClassifier()

In [29]:
modeling_function(boost, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.9623188405797102
Validation Precision:  0.6394557823129252
Training F1:  0.9526542324246772
Validation F1:  0.5838509316770186
Training Accuracy:  0.9750063115374905
Validation Accuracy:  0.7971233913701741


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

#### Creating a SVC model

In [30]:
from sklearn.svm import SVC

In [31]:
vec_gs = SVC(random_state=10)

In [32]:
vec_gs.fit(X_train_sc_df, y_train_enc_df)

SVC(random_state=10)

In [33]:
y_pred_try = vec_gs.predict(X_val_sc_df)

In [34]:
print(confusion_matrix(y_val_enc_df, y_pred_try))
print(classification_report(y_val_enc_df, y_pred_try))

[[894  77]
 [178 172]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.88       971
           1       0.69      0.49      0.57       350

    accuracy                           0.81      1321
   macro avg       0.76      0.71      0.72      1321
weighted avg       0.80      0.81      0.80      1321



In [35]:
vec = SVC(kernel='poly', degree=8, C=.01, random_state=10)

In [36]:
vec.fit(X_train_sc_df, y_train_enc_df)

SVC(C=0.01, degree=8, kernel='poly', random_state=10)

In [37]:
y_pred = vec.predict(X_val_sc_df)

In [38]:

print(confusion_matrix(y_val_enc_df, y_pred))
print(classification_report(y_val_enc_df, y_pred))

[[954  17]
 [296  54]]
              precision    recall  f1-score   support

           0       0.76      0.98      0.86       971
           1       0.76      0.15      0.26       350

    accuracy                           0.76      1321
   macro avg       0.76      0.57      0.56      1321
weighted avg       0.76      0.76      0.70      1321



#### Running a Grid Search on the SVC parameters

In [39]:
vec_params = {'kernel':['linear', 'poly', 'rbf'],
              'C': [.0001, .001, .01, .1, 1],
              'degree': [3, 4, 5, 6, 7, 8],
              'max_iter': [-1, 5, 10, 50, 100],
              'shrinking': [True, False],
              'gamma': ['scale', 'auto'],
              'probability': [True, False]}

In [40]:
vec_grid = GridSearchCV(vec_gs, vec_params, scoring='precision')

In [41]:
vec_grid.fit(X_train_sc_df, y_train_enc_df)

GridSearchCV(estimator=SVC(random_state=10),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1],
                         'degree': [3, 4, 5, 6, 7, 8],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf'],
                         'max_iter': [-1, 5, 10, 50, 100],
                         'probability': [True, False],
                         'shrinking': [True, False]},
             scoring='precision')

In [42]:
vec_grid.best_params_

{'C': 0.01,
 'degree': 4,
 'gamma': 'auto',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': True,
 'shrinking': True}

In [43]:
best_vec = vec_grid.best_estimator_

In [44]:
modeling_function(best_vec, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.9523809523809523
Validation Precision:  0.8125
Training F1:  0.03714020427112349
Validation F1:  0.07103825136612021
Training Accuracy:  0.7381974248927039
Validation Accuracy:  0.7426192278576835


SVC(C=0.01, degree=4, gamma='auto', kernel='poly', probability=True,
    random_state=10)

## Best Model (Not used in presentation model because of complexity and roc_auc_score)

In [83]:
vec_best = SVC(kernel='poly', degree=2, C=.01, random_state=10)

In [55]:
modeling_function(vec_best, X_train_sc_df, y_train_enc_df, X_val_sc_df, y_val_enc_df)

Training Precision:  0.8741258741258742
Validation Precision:  0.8181818181818182
Training F1:  0.2085070892410342
Validation F1:  0.2222222222222222
Training Accuracy:  0.7604140368593789
Validation Accuracy:  0.761544284632854


SVC(C=0.01, kernel='poly', random_state=10)

In [58]:
try_X_train_df = pd.read_csv('new_X_train_sc.csv')
try_X_train_df.drop('customerid', axis=1, inplace=True)
try_X_val_df = pd.read_csv('new_X_val_sc.csv')
try_X_val_df.drop('customerid', axis=1, inplace=True)
try_y_train_df = pd.read_csv('new_y_train_enc.csv')
try_y_val_df = pd.read_csv('new_y_val_enc.csv')

In [84]:
modeling_function(vec_best, try_X_train_df, try_y_train_df, try_X_val_df, try_y_val_df)

Training Precision:  0.9010989010989011
Validation Precision:  0.8717948717948718
Training F1:  0.14298169136878816
Validation F1:  0.1748071979434447
Training Accuracy:  0.7518303458722545
Validation Accuracy:  0.757002271006813


SVC(C=0.01, degree=2, kernel='poly', random_state=10)

In [91]:
roc_auc_score(try_y_val_df, vec_best.predict(try_X_val_df))

0.5459967632779168

In [85]:
from sklearn.metrics import roc_auc_score

In [86]:
X_test_df = pd.read_csv('new_X_test_sc.csv')
X_test_df.drop('customerid', axis=1, inplace=True)
y_test_df = pd.read_csv('new_y_test_enc.csv')

In [87]:
 test_preds = vec_best.predict(X_test_df)

In [88]:
modeling_function(vec_best, try_X_train_df, try_y_train_df, X_test_df, y_test_df)

Training Precision:  0.9010989010989011
Validation Precision:  0.8205128205128205
Training F1:  0.14298169136878816
Validation F1:  0.12749003984063745
Training Accuracy:  0.7518303458722545
Validation Accuracy:  0.7512776831345827


SVC(C=0.01, degree=2, kernel='poly', random_state=10)

In [82]:
roc_auc_score(y_test_df, test_preds)

0.5310903633102264