# Importing all the libraries and scripts

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import warnings
import random
import pickle

warnings.filterwarnings('ignore')
sys.path.append('..')

from scripts import dataframe as dfr
from scripts import matrix as mx
from scripts import regression as rgr
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score


# Training the model

In [31]:
# Load the churn CSV and pass it straight through the project's refining helper.
df = dfr.data_frame_refining(pd.read_csv('../week_3/churn_data.csv'))

# Values in totalcharges that cannot be parsed as numbers become NaN, then 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn == 'yes', 0 otherwise.
df['churn'] = (df['churn'] == 'yes').astype(int)

# 80/20 split into full-train/test, then 75/25 of full-train into train/val
# (i.e. 60/20/20 of the whole frame), both with a fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Give each partition a fresh 0..n-1 index (df_full_train keeps its original index).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Target arrays, one per partition.
y_train, y_val, y_test = (
    part.churn.values for part in (df_train, df_val, df_test)
)

# Columns fed to the model as numbers.
numerical_cols = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# Columns fed to the model as categories (order kept as in the original list).
categorical_cols = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and add-ons
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [41]:
# K-fold cross-validation of the training pipeline on df_full_train.
C = 1
n_splits = 5
# n_splits is passed through so changing the constant above actually changes the splitter.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):

    # Fold-local names: the notebook-level df_train/df_val/y_train/y_val from the
    # earlier train/val split are left untouched instead of being silently
    # replaced by the last fold.
    df_fold_train = df_full_train.iloc[train_idx]
    df_fold_val = df_full_train.iloc[val_idx]

    y_fold_train = df_fold_train.churn.values
    y_fold_val = df_fold_val.churn.values

    dv, model = dfr.train(df_fold_train, y_fold_train, categorical_cols, numerical_cols, C)
    y_pred = dfr.predict(df_fold_val, categorical_cols, numerical_cols, dv, model)

    # Stored as fold_auc so the `auc` function imported from sklearn.metrics
    # is not shadowed by a float.
    fold_auc = roc_auc_score(y_fold_val, y_pred)
    scores.append(fold_auc)

    # Running mean/std over the folds seen so far (one line per fold).
    print(f'C={C} +- mean_score={np.mean(scores)} +- std_score={np.std(scores)}')

C=1 +- mean_score=0.8420100321058787 +- std_score=0.0
C=1 +- mean_score=0.843274210530492 +- std_score=0.0012641784246132937
C=1 +- mean_score=0.839184613161207 +- std_score=0.005874950610734013
C=1 +- mean_score=0.8369315667648107 +- std_score=0.006412093082751174
C=1 +- mean_score=0.839320284003923 +- std_score=0.007464303742159675


In [42]:
# Final model: fit on the whole full-train partition, then score once on the
# held-out test partition.
C = 1.0
dv, model = dfr.train(df_full_train, df_full_train.churn.values, categorical_cols, numerical_cols, C)
y_pred_final = dfr.predict(df_test, categorical_cols, numerical_cols, dv, model)
# Stored as test_auc so the `auc` function imported from sklearn.metrics
# is not overwritten by a float.
test_auc = roc_auc_score(y_test, y_pred_final)
test_auc

0.8572386167896259

# Saving the model

In [33]:
# File name for the pickled artefacts; it embeds the current value of C.
output = f'model_C=({C}).bin'
output

'model_C=(1.0).bin'

In [34]:
# Serialise the vectorizer and the model together as one (dv, model) tuple.
with open(output, 'wb') as model_file:
    pickle.dump((dv, model), model_file)

In [35]:
# Path of the pickled (dv, model) tuple that the loading cell reads back in.
f_in = 'model_C=(1.0).bin'

# Loading the model

In [36]:
# The file handle gets its own name: reusing `f_in` as the `as` target would
# rebind the path string to a (closed) file object, so running this cell a
# second time would fail with a TypeError in open().
with open(f_in, 'rb') as model_file:
    # NOTE: pickle.load can execute arbitrary code; only load files you
    # produced yourself or otherwise trust.
    dv, model = pickle.load(model_file)

In [37]:
# One hand-written customer record used to exercise the loaded model.
# Keys follow the same order as before: categorical fields, then numerical ones.
customer = dict(
    # categorical fields
    gender='female',
    seniorcitizen=0,
    partner='yes',
    dependents='no',
    phoneservice='no',
    multiplelines='no_phone_service',
    internetservice='dsl',
    onlinesecurity='no',
    onlinebackup='yes',
    deviceprotection='no',
    techsupport='no',
    streamingtv='no',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    # numerical fields
    tenure=1,
    monthlycharges=29.85,
    totalcharges=29.85,
)

In [38]:
# Encode the single customer record (wrapped in a one-element list) with the
# loaded vectorizer.
# NOTE(review): presumably dv is the DictVectorizer fitted inside dfr.train — confirm.
X = dv.transform([customer])

In [39]:
# Row 0, column 1 of predict_proba for the encoded customer.
# NOTE(review): presumably the churn probability, since churn is encoded as 1
# before training — verify against model.classes_.
model.predict_proba(X)[0,1]

0.6363584152747782