In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc


from regression_module import *
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
df = pd.read_csv('data/final_df.csv')

In [32]:
df = df.iloc[:,2:]

In [39]:
y = df.churn
X = df.drop('churn', axis = 1)
cols = X.columns

In [36]:
mm = MinMaxScaler()
scaled_X = mm.fit_transform(X)

In [42]:
scaled_df = pd.DataFrame(scaled_X)
scaled_df.columns = cols
scaled_df

Unnamed: 0,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,phoneservice_No,...,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0.013889,0.115423,0.003437,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.472222,0.385075,0.217564,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.027778,0.354229,0.012453,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.625000,0.239303,0.211951,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.027778,0.521891,0.017462,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.333333,0.662189,0.229194,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.000000,0.845274,0.847792,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,0.152778,0.112935,0.039892,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,0.055556,0.558706,0.035303,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df, target, test_size = .25, random_state = 33)

In [49]:
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_hat_train = clf.predict(X_train)
y_hat_test = clf.predict(X_test)

In [51]:
# Find residual differences between train data and predicted train data
residuals = np.abs(y_train - y_hat_train)
# Print value counts of our predicted values 
print(pd.Series(residuals).value_counts())
print('----------------------------------')
# Print normalized value counts of our predicted values
print(pd.Series(residuals).value_counts(normalize = True))

0    4393
1     889
Name: churn, dtype: int64
----------------------------------
0    0.831693
1    0.168307
Name: churn, dtype: float64


In [52]:
# Repeat previous step with test data
residuals = np.abs(y_test - y_hat_test)
print(pd.Series(residuals).value_counts())
print('---------------------------------')
print(pd.Series(residuals).value_counts(normalize = True))

0    1326
1     435
Name: churn, dtype: int64
---------------------------------
0    0.752981
1    0.247019
Name: churn, dtype: float64


In [60]:
print_metrics(y_train, y_hat_train, y_test, y_hat_test)

Training Precision:  0.6929012345679012
Testing Precision:  0.5488069414316703


Training Recall:  0.646508279337653
Testing Recall:  0.5270833333333333


Training Accuracy:  0.8316925407042787
Testing Accuracy:  0.7529812606473595


Training F1-Score:  0.668901303538175
Testing F1-Score:  0.5377258235919234


In [71]:
find_best_k(X_train, y_train, X_test, y_test)

Best Value for K: 31
F1-Score: 0.5904139433551199


In [75]:
grid_params = {'n_neighbors': [3, 5, 11, 19],
               'weights':['uniform', 'distance'],
               'metric':['euclidean', 'manhattan']
              }
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv = 3, n_jobs = -1)
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   13.1s finished


In [76]:
gs_results.best_score_

0.7856872396819387

In [77]:
gs_results.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [78]:
gs_results.best_params_

{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}

In [79]:
y_hat_train2 = clf.predict(X_train)
y_hat_test2 = clf.predict(X_test)

In [80]:
# Find residual differences between train data and predicted train data
residuals2 = np.abs(y_train - y_hat_train2)
# Print value counts of our predicted values 
print(pd.Series(residuals2).value_counts())
print('----------------------------------')
# Print normalized value counts of our predicted values
print(pd.Series(residuals2).value_counts(normalize = True))

0    4393
1     889
Name: churn, dtype: int64
----------------------------------
0    0.831693
1    0.168307
Name: churn, dtype: float64


In [81]:
# Repeat previous step with test data
residuals2 = np.abs(y_test - y_hat_test2)
print(pd.Series(residuals2).value_counts())
print('---------------------------------')
print(pd.Series(residuals2).value_counts(normalize = True))

0    1326
1     435
Name: churn, dtype: int64
---------------------------------
0    0.752981
1    0.247019
Name: churn, dtype: float64


In [82]:
print_metrics(y_train, y_hat_train2, y_test, y_hat_test2)

Training Precision:  0.6929012345679012
Testing Precision:  0.5488069414316703


Training Recall:  0.646508279337653
Testing Recall:  0.5270833333333333


Training Accuracy:  0.8316925407042787
Testing Accuracy:  0.7529812606473595


Training F1-Score:  0.668901303538175
Testing F1-Score:  0.5377258235919234
