Import Libraries

In [17]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, KFold, RepeatedStratifiedKFold
from tpot import TPOTClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

Load Dataset

In [3]:
path = './data/Telco-Customer-Churn.csv'
data = pd.read_csv(path)
data.columns = data.columns.str.replace(' ', '_').str.lower()

categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()

for col in categorical_col:
    data[col] = data[col].str.replace(' ', '_').str.lower()
#data.head()

EDA

In [4]:
#data.info()

In [5]:
#data.describe()

In [6]:
data['churn'] = (data.churn=='yes').astype(int)
categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
numerical_col = ['tenure', 'totalcharges', 'monthlycharges']

categorical_col.remove('customerid')

In [7]:
# train_data, test_data = train_test_split(data, test_size=0.2,
#                                          random_state=0)

# train_x = train_data.drop(['churn'], axis = 1)
# test_x = test_data.drop(['churn'], axis = 1)

# train_y = train_data.pop('churn')
# test_y = test_data.pop('churn')

In [8]:
dv = DictVectorizer(sparse = False)
dv.fit(data[categorical_col + numerical_col].to_dict(orient = 'records'))
feature_names = dv.feature_names_
train_x = dv.transform(data[categorical_col + numerical_col].to_dict(orient = 'records'))
train_y = data['churn']

new_df = pd.DataFrame(train_x, columns=feature_names)

In [20]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, 
                             random_state=1)
model = TPOTClassifier(generations=3, population_size=30, 
                       cv=cv, scoring='accuracy', 
                       verbosity=2, random_state=1,
                        n_jobs=-1)

model.fit(new_df, train_y)
model.export('churn_best_model.py')

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]

In [10]:
model.export('churn_best_model.py')

In [18]:
pred = model.predict(new_df)
accuracy_score(train_y, pred), precision_score(train_y, pred), recall_score(train_y, pred), f1_score(train_y, pred)

(0.7346301292063041, 0.0, 0.0, 0.0)

In [19]:
confusion_matrix(train_y, pred)

array([[5174,    0],
       [1869,    0]], dtype=int64)

In [14]:
data['churn'].value_counts()

0    5174
1    1869
Name: churn, dtype: int64

array([0, 0, 0, ..., 0, 0, 0])

In [101]:
def metric(actual, predicted, t):

    accuracy = (predicted == actual).mean()
    actual_positive = (actual == 1)
    actual_negative = (actual == 0)

    predicted_positive = (predicted >= t)
    predicted_negative = (predicted < t)



    tp = (actual_positive & predicted_positive).sum()
    tn = (actual_negative & predicted_negative).sum()
    fp = (actual_negative & predicted_positive).sum()
    fn = (actual_positive & predicted_negative).sum()

    tpr = tp/ (tp + fn)
    fpr = fp/ (fp + tn)

    precision = tp/(tp + fp)
    recall = tp/(tp +fn)
    f1_score = 2 * ((precision * recall)/ (precision + recall))

    return tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score#, accuracy

In [102]:
def model_rates(y_test, prediction):

    actual_positive = (y_test == 1)
    actual_negative = (y_test == 0)

    predicted_positive = (prediction >= t)
    predicted_negative = (prediction < t)

    true_positive = (actual_positive & predicted_positive).sum()
    true_negative = (actual_negative & predicted_negative).sum()
    false_positive = (actual_negative & predicted_positive).sum()
    false_negative = (actual_positive & predicted_negative).sum()

    score.append((t,true_positive, false_positive, false_negative, true_negative))

    df = pd.DataFrame(score, columns= ['threshold','true_positive', 'false_positive', 'false_negative', 'true_negative'])
    df.true_positive_rate = df.true_positive/(df.true_positive + df.false_negative)
    df.false_positive_rate = df.false_positive/(df.true_negative + df.false_positive)

    return df

In [103]:
dv = DictVectorizer(sparse = False)
def train(data, y, c):
    dv.fit(data[categorical_col + numerical_col].to_dict(orient = 'records'))
    X_train = dv.transform(data[categorical_col + numerical_col].to_dict(orient = 'records'))

    model = LogisticRegression(C = c, max_iter = 1000)
    model.fit(X_train, y)
    return dv, model

def predict(data, dv, model):
    X_test = dv.transform(data[categorical_col + numerical_col].to_dict(orient = 'records'))
    prediction = model.predict_proba(X_test)[:,1]
    return prediction

In [104]:

dv, model = train(train_data, y_train, c = 1)
prediction = predict(test_data, dv, model)

tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score = metric(y_test, prediction, 0.5)
cm = np.array([[tn, fp], [fn, tp]])

In [105]:
roc_auc_score(y_test,prediction)

0.8581973739803048

In [66]:
import pickle

out = 'Churn.bin'

with open(out, 'wb') as f:
    pickle.dump((dv,model), f)

In [68]:
with open(out, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [111]:
test = data.iloc[8].to_dict()