In [2]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
from matplotlib import pyplot as plt


from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mutual_info_score, roc_curve, auc, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression



In [3]:
# Load the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv('data.csv')

In [4]:
"""
All the pre-processing and cleaning
"""

# Column names are normalised first so the steps below only use the lowercase
# names and the cell can be re-run on an already-normalised frame.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# errors='coerce' turns every non-numeric entry into NaN; those are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Detected after the numeric conversion, so totalcharges is not treated as text.
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Feature lists consumed by train() and predict().
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
    'paymentmethod',
]

# Give string values the same lowercase / underscore form as the column names.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [5]:
# Hold out 20% of the rows as the test set.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: 33% of them become the validation set.
df_train, df_val = train_test_split(df_train_full, test_size= 0.33, random_state= 11)

# pop() hands back the target column and removes it from the frame in place,
# so the feature frames no longer carry the label.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values


In [6]:
def train(df, y, C):
    """Fit a one-hot encoder and a logistic-regression model.

    Parameters
    ----------
    df : DataFrame containing the columns listed in the module-level
        ``categorical`` and ``numerical`` lists.
    y : target values, one per row of ``df``.
    C : value passed to ``LogisticRegression`` as its ``C`` argument.

    Returns
    -------
    (dv, model) : the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    records = df[categorical + numerical].to_dict(orient='records')

    # Learn the feature mapping and encode the rows in a single step.
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(records)

    model = LogisticRegression(solver='liblinear', C=C)
    model.fit(X, y)

    return dv, model

In [7]:
def predict(df, dv, model):
    """Score every row of ``df`` with a fitted vectorizer/model pair.

    The columns named in the module-level ``categorical`` and ``numerical``
    lists are converted to dict records, encoded with ``dv`` and passed to
    ``model.predict_proba``.

    Returns column 1 of the ``predict_proba`` output, one value per row.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]


In [8]:
def predict_single(customer, dv, model):
    """Score one customer record.

    ``customer`` is wrapped in a one-element list, encoded with
    ``dv.transform`` and passed to ``model.predict_proba``; the value in
    row 0, column 1 of the result is returned.
    """
    features = dv.transform([customer])
    return model.predict_proba(features)[0, 1]

In [10]:
# Tune C with K-fold cross-validation on df_train_full:
# for each candidate C, train on all folds but one, score the held-out fold,
# and report the mean AUC across folds together with its standard deviation.

nfolds = 5
kfold = KFold(n_splits=nfolds, shuffle=True, random_state=1)    # shuffled split into `nfolds` parts

for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    aucs = []                                                   # one AUC per fold for this C

    for train_idx, val_idx in kfold.split(df_train_full):
        # Fold-local names: the loop must not overwrite the notebook-level
        # df_train / df_val / y_train / y_val split or a trained dv / model.
        df_fold_train = df_train_full.iloc[train_idx]
        df_fold_val = df_train_full.iloc[val_idx]

        y_fold_train = df_fold_train.churn.values
        y_fold_val = df_fold_val.churn.values

        fold_dv, fold_model = train(df_fold_train, y_fold_train, C)
        y_fold_pred = predict(df_fold_val, fold_dv, fold_model)

        # Not called `auc`: that name is the function imported from sklearn.metrics.
        fold_auc = roc_auc_score(y_fold_val, y_fold_pred)
        aucs.append(fold_auc)

    print('C=%s, auc = %0.3f ± %0.3f' % (C, np.mean(aucs), np.std(aucs)))



C=0.001, auc = 0.825 ± 0.013
C=0.01, auc = 0.839 ± 0.009
C=0.1, auc = 0.841 ± 0.008
C=0.5, auc = 0.841 ± 0.007
C=1, auc = 0.841 ± 0.007
C=10, auc = 0.841 ± 0.007


In [11]:
# Final evaluation: retrain on df_train_full with C=0.5 and report the ROC AUC
# on the held-out test set. The printed figure is ROC AUC, not accuracy.

y_train = df_train_full.churn.values
y_test = df_test.churn.values

dv, model = train(df_train_full, y_train, C=0.5)
y_pred = predict(df_test, dv, model)

# Not called `auc`: that name is the function imported from sklearn.metrics.
test_auc = roc_auc_score(y_test, y_pred)
print('auc = %.3f' % test_auc)

auc = 0.859


<h3>Saving the model for external use</h3>

In [12]:
# Persist the fitted vectorizer and model as one tuple so they are always
# loaded as a matching pair. The file is written once; the context manager
# closes it even if pickling raises.
with open('churn-model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)