Import Libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')

Load Dataset

In [3]:
# Location of the raw Telco churn CSV. Set the TELCO_CHURN_CSV environment
# variable to read a copy stored elsewhere; the fallback is the path this
# notebook was written against, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:/Users/Godwin/Documents/Workflow/MLZoomcamp/Classification/Customer-churn/Telco-Customer-Churn.csv",
)

data = pd.read_csv(DATA_PATH)

# Normalize column names: spaces become underscores, everything lower-case.
data.columns = data.columns.str.replace(' ', '_').str.lower()

# Apply the same normalization to the values of every object-typed column.
categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()

for col in categorical_col:
    data[col] = data[col].str.replace(' ', '_').str.lower()

EDA

In [4]:
# Convert 'totalcharges' to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce') instead of raising.
data['totalcharges'] = pd.to_numeric(data['totalcharges'], errors='coerce')

# Fill the resulting gaps with the column mean. The result is assigned back to
# the frame: calling fillna(inplace=True) on data['totalcharges'] operates on
# an intermediate Series and, under pandas Copy-on-Write, leaves `data`
# unchanged.
# NOTE(review): the mean is taken over the full dataset, before the train/test
# split further down — confirm that is acceptable for the evaluation.
data['totalcharges'] = data['totalcharges'].fillna(data['totalcharges'].mean())

In [99]:
# Encode the target as 1 ('yes') / 0 (anything else). Guarded so re-running
# the cell is harmless: once the column is numeric, comparing it with 'yes'
# again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data['churn']):
    data['churn'] = (data['churn'] == 'yes').astype(int)

# Feature lists used by train() / predict(). 'customerid' is excluded from the
# object-typed columns; filtering (rather than list.remove) does not raise if
# the column is absent.
# NOTE(review): 'seniorcitizen' is integer-typed, so it ends up in neither
# list and is not used as a feature — confirm that is intended.
categorical_col = [
    name for name in data.dtypes[data.dtypes == 'object'].index.tolist()
    if name != 'customerid'
]
numerical_col = ['tenure', 'totalcharges', 'monthlycharges']


In [100]:
categorical_col

['gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [101]:
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def metric(actual, predicted, t):
    """Compute confusion-matrix counts and derived rates at threshold ``t``.

    Parameters
    ----------
    actual : array-like of 0/1 labels (NumPy array or pandas Series).
    predicted : array-like of scores, compared against ``t``.
    t : decision threshold; a score ``>= t`` counts as a positive prediction.

    Returns
    -------
    tuple
        ``(tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score)``.
        A ratio whose denominator is zero (for example precision when nothing
        is predicted positive) is returned as NaN.
    """
    actual_positive = (actual == 1)
    actual_negative = (actual == 0)

    predicted_positive = (predicted >= t)
    predicted_negative = (predicted < t)

    tp = (actual_positive & predicted_positive).sum()
    tn = (actual_negative & predicted_negative).sum()
    fp = (actual_negative & predicted_positive).sum()
    fn = (actual_positive & predicted_negative).sum()

    tpr = _safe_div(tp, tp + fn)
    fpr = _safe_div(fp, fp + tn)

    precision = _safe_div(tp, tp + fp)
    recall = _safe_div(tp, tp + fn)
    # Harmonic mean of precision and recall; NaN propagates if either is NaN
    # or if both are zero.
    f1_score = 2 * _safe_div(precision * recall, precision + recall)

    return tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score

In [102]:
def model_rates(y_test, prediction, threshold=None, scores=None):
    """Record confusion counts at one threshold and return the running table.

    Parameters
    ----------
    y_test : array-like of 0/1 labels.
    prediction : array-like of scores compared against the threshold.
    threshold : decision threshold. When omitted, the notebook-level variable
        ``t`` is used (it must already exist, typically as a loop variable).
    scores : list that accumulates one ``(threshold, tp, fp, fn, tn)`` tuple
        per call. When omitted, the notebook-level list ``score`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per accumulated tuple, with the count columns plus
        ``true_positive_rate`` and ``false_positive_rate``.

    Side effects
    ------------
    Appends one tuple to ``scores`` (or to the global ``score``).
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working; a NameError here means `t` / `score` were never defined.
    if threshold is None:
        threshold = t
    if scores is None:
        scores = score

    actual_positive = (y_test == 1)
    actual_negative = (y_test == 0)

    predicted_positive = (prediction >= threshold)
    predicted_negative = (prediction < threshold)

    true_positive = (actual_positive & predicted_positive).sum()
    true_negative = (actual_negative & predicted_negative).sum()
    false_positive = (actual_negative & predicted_positive).sum()
    false_negative = (actual_positive & predicted_negative).sum()

    scores.append((threshold, true_positive, false_positive, false_negative, true_negative))

    df = pd.DataFrame(
        scores,
        columns=['threshold', 'true_positive', 'false_positive', 'false_negative', 'true_negative'],
    )
    # Item assignment is required to create columns; attribute assignment
    # (df.name = ...) would only set a Python attribute on the frame.
    df['true_positive_rate'] = df.true_positive / (df.true_positive + df.false_negative)
    df['false_positive_rate'] = df.false_positive / (df.true_negative + df.false_positive)

    return df

In [103]:
# Module-level vectorizer: train() fits this very instance and returns it.
dv = DictVectorizer(sparse=False)


def train(data, y, c):
    """Fit the shared DictVectorizer and a logistic regression on ``data``.

    Parameters
    ----------
    data : DataFrame containing the columns listed in ``categorical_col``
        and ``numerical_col``.
    y : target values aligned with the rows of ``data``.
    c : value passed to ``LogisticRegression`` as ``C``.

    Returns
    -------
    tuple
        ``(dv, model)`` — the fitted module-level vectorizer and the fitted
        classifier.
    """
    feature_rows = data[categorical_col + numerical_col].to_dict(orient='records')
    # fit() returns the vectorizer itself, so the transform can be chained.
    design_matrix = dv.fit(feature_rows).transform(feature_rows)

    classifier = LogisticRegression(max_iter=1000, C=c)
    classifier.fit(design_matrix, y)
    return dv, classifier

def predict(data, dv, model):
    """Score every row of ``data`` with a fitted vectorizer and model.

    The feature columns (``categorical_col + numerical_col``) are turned into
    record dicts, encoded with ``dv.transform`` and passed to
    ``model.predict_proba``; the second column of that result is returned.
    """
    feature_rows = data[categorical_col + numerical_col].to_dict(orient='records')
    encoded = dv.transform(feature_rows)
    probabilities = model.predict_proba(encoded)
    return probabilities[:, 1]

In [104]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
train_data,test_data = train_test_split(data, test_size = 0.2, random_state = 1)

# pop() removes the target column from each frame and returns it, so the
# frames no longer carry 'churn' afterwards.
y_train = train_data.pop('churn')
y_test = test_data.pop('churn')

# Fit on the training part, then score the held-out rows.
dv, model = train(train_data, y_train, c = 1)
prediction = predict(test_data, dv, model)

# Confusion counts and derived rates at a 0.5 decision threshold.
tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score = metric(y_test, prediction, 0.5)
# Confusion matrix laid out as [[tn, fp], [fn, tp]]: rows are the actual
# class (0, 1), columns the predicted class (0, 1).
cm = np.array([[tn, fp], [fn, tp]])

In [105]:
# Area under the ROC curve for the held-out predictions; unlike the metrics
# above it does not depend on a single decision threshold.
roc_auc_score(y_test,prediction)

0.8581973739803048

In [66]:
import pickle

# Output file for the serialized artifacts, relative to the working directory.
out = 'Churn.bin'

# The vectorizer and the model are stored together as one tuple so they can
# be restored as a pair.
with open(out, 'wb') as f:
    pickle.dump((dv,model), f)

In [68]:
# Restore the (vectorizer, model) pair written above; this rebinds the global
# `dv` and `model` names.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# this notebook (or another trusted source) produced.
with open(out, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [111]:
# Take the row at position 8 of the full dataset as a plain dict, e.g. as a
# sample input for the saved model. It comes from `data`, so it still
# contains 'customerid' and the 'churn' label.
# NOTE(review): this row may have been part of the training split — confirm
# before treating its prediction as an out-of-sample check.
test = data.iloc[8].to_dict()

In [113]:
test

{'customerid': '7892-pookp',
 'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'tenure': 28,
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'monthlycharges': 104.8,
 'totalcharges': 3046.05,
 'churn': 1}