In [102]:
import joblib
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [42]:
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("customer_churn_data.csv")

In [5]:
data.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'MonthlyCharges',
       'ContractType', 'InternetService', 'TotalCharges', 'TechSupport',
       'Churn'],
      dtype='object')

In [6]:
y = data[["Churn"]]
X = data[["Age", "Gender", "Tenure", "MonthlyCharges"]]

In [12]:
X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,0,4,88.35
1,43,0,0,36.67
2,51,1,2,63.79
3,60,1,8,102.34
4,42,0,32,69.01
...,...,...,...,...
995,42,0,41,37.14
996,62,0,9,80.93
997,51,1,15,111.72
998,39,0,68,65.67


In [11]:
X["Gender"] = X["Gender"].apply(lambda x: 1 if x == 'Female' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Gender"] = X["Gender"].apply(lambda x: 1 if x == 'Female' else 0)


In [13]:
y

Unnamed: 0,Churn
0,Yes
1,Yes
2,Yes
3,Yes
4,Yes
...,...
995,Yes
996,Yes
997,Yes
998,Yes


In [15]:
y["Churn"] = y["Churn"].apply(lambda x:1 if x == "Yes" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["Churn"] = y["Churn"].apply(lambda x:1 if x == "Yes" else 0)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
X_train

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
554,48,0,21,67.59
148,50,1,11,118.30
452,35,1,33,66.22
618,49,0,5,116.92
206,50,0,17,119.75
...,...,...,...,...
376,53,1,24,95.21
265,32,0,24,104.01
135,60,1,35,119.16
932,30,0,2,71.96


In [22]:
scaler = StandardScaler()

In [23]:
X_train = scaler.fit_transform(X_train)

In [24]:
X_train

array([[ 0.33670116, -1.05925253,  0.114604  , -0.27111756],
       [ 0.53938056,  0.94406194, -0.41720109,  1.71671137],
       [-0.98071497,  0.94406194,  0.7527701 , -0.32482148],
       ...,
       [ 1.55277759,  0.94406194,  0.85913111,  1.75042332],
       [-1.48741348, -1.05925253, -0.89582566, -0.09981383],
       [ 0.13402176,  0.94406194,  0.22096501, -0.91987656]])

In [25]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [26]:
X_test = scaler.fit_transform(X_test)

In [27]:
X_test

array([[-0.17497509,  0.85972695, -0.39008203,  1.74109294],
       [-0.28038177,  0.85972695,  0.44376065, -1.32056676],
       [-0.80741516, -1.16316   ,  1.69452467, -0.46758398],
       [ 0.66827834,  0.85972695,  0.18318481,  1.73504073],
       [ 1.30071842,  0.85972695, -0.7548882 , -0.65444584],
       [-0.91282184,  0.85972695,  0.07895448,  0.99213245],
       [ 0.03583827,  0.85972695, -0.65065787, -0.59997598],
       [-0.5966018 , -1.16316   ,  0.96491233, -0.18993903],
       [ 0.66827834,  0.85972695, -0.9112337 , -0.38360962],
       [-0.49119512, -1.16316   ,  0.18318481, -0.8810253 ],
       [-0.28038177, -1.16316   , -0.39008203,  0.21858488],
       [ 1.08990506, -1.16316   ,  1.38183367, -1.57400288],
       [-1.96688863, -1.16316   , -0.4421972 ,  1.66544037],
       [ 0.8790917 ,  0.85972695,  0.28741515, -0.99299111],
       [-0.06956841,  0.85972695,  0.02683931, -1.18401386],
       [-1.22904187, -1.16316   , -0.12950619,  1.54401798],
       [-1.54526191,  0.

In [31]:
def modelperformance(prediction):
    print(f"Accuracy score on the model is: {accuracy_score(y_test, prediction)}")

In [36]:
log_model = LogisticRegression()
log_model

In [43]:
log_model.fit(X_train, y_train)

In [44]:
log_model_pred = log_model.predict(X_test)

In [45]:
modelperformance(log_model_pred)

Accuracy score on the model is: 0.89


In [51]:
knc = KNeighborsClassifier()
knc

In [63]:
param_grid = {
    'n_neighbors':[3,5,7,9,11],
    'weights': ['uniform', 'distance']
}

In [64]:
gridkn = GridSearchCV(knc,param_grid,cv=5)

In [71]:
gridkn.best_params_

{'n_neighbors': 5, 'weights': 'uniform'}

In [65]:
gridkn.fit(X_train, y_train)

In [69]:
kn_pred = gridkn.predict(X_test)

In [70]:
modelperformance(kn_pred)

Accuracy score on the model is: 0.88


In [73]:
svc = SVC()
svc

In [74]:
param_grid = {
    "C":[0.01, 0.1, 0.5, 1],
    "kernel": ['linear', 'rbf', 'poly']
}

In [81]:
gridsvc = GridSearchCV(svc, param_grid, cv=5)

In [82]:
gridsvc.fit(X_train, y_train)

In [83]:
gridsvc.best_params_

{'C': 0.01, 'kernel': 'linear'}

In [84]:
svc_pred = gridsvc.predict(X_test)

In [85]:
modelperformance(svc_pred)

Accuracy score on the model is: 0.9


In [90]:
dt = DecisionTreeClassifier()
dt

In [96]:
param_grid = {
    'criterion':['entropy', 'gini'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [97]:
griddt = GridSearchCV(dt, param_grid, cv=5)

In [98]:
griddt.fit(X_train, y_train)

In [99]:
griddt.best_params_

{'criterion': 'gini',
 'max_depth': 30,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'splitter': 'random'}

In [100]:
dt_pred = griddt.predict(X_test)

In [101]:
modelperformance(dt_pred)

Accuracy score on the model is: 0.845


In [103]:
rfc = RandomForestClassifier()
rfc

In [112]:
param_grid = {
    'n_estimators': [32, 64, 128, 256],
    'max_features': [2,3,4],
    'bootstrap': [True, False]
}

In [113]:
gridrfc = GridSearchCV(rfc, param_grid, cv=5)

In [114]:
gridrfc.fit(X_train, y_train)

In [115]:
gridrfc.best_params_

{'bootstrap': True, 'max_features': 2, 'n_estimators': 256}

In [116]:
rfc_pred = gridrfc.predict(X_test)

In [117]:
modelperformance(rfc_pred)

Accuracy score on the model is: 0.885


In [122]:
best_model = gridsvc.best_estimator_
best_model

In [123]:
joblib.dump(best_model, 'model.pkl')

['model.pkl']