Import Libraries

In [1]:
import pandas as pd
import numpy as np

import mlflow
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe
from hyperopt.pyll import scope

from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, KFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('Telcom Churn')

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1', creation_time=1687473013668, experiment_id='1', last_update_time=1687473013668, lifecycle_stage='active', name='Telcom Churn', tags={}>

Load Dataset

In [4]:
path = './data/Telco-Customer-Churn.csv'
data = pd.read_csv(path)
data.columns = data.columns.str.replace(' ', '_').str.lower()

categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
for col in categorical_col:
    data[col] = data[col].str.replace(' ', '_').str.lower()
#data.head()

EDA

In [5]:
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [6]:
data['churn'] = (data.churn=='yes').astype(int)
categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
numerical_col = ['tenure', 'totalcharges', 'monthlycharges']

categorical_col.remove('customerid')

In [7]:
train_data, test_data = train_test_split(data, test_size=0.25,
                                         random_state=0)

train_x = train_data.drop(['churn'], axis = 1)
test_x = test_data.drop(['churn'], axis = 1)

train_y = train_data.pop('churn')
test_y = test_data.pop('churn')

In [8]:
dv = DictVectorizer(sparse = False)
dv.fit(train_x[categorical_col + numerical_col].to_dict(orient = 'records'))

train_x = dv.transform(train_x[categorical_col + numerical_col].to_dict(orient = 'records'))
test_x = dv.transform(test_x[categorical_col + numerical_col].to_dict(orient = 'records'))

In [9]:
def evaluation(y_true, y_pred, data_category):

    accuracy_ = accuracy_score(y_true, y_pred)
    precision_ = precision_score(y_true, y_pred)
    recall_ = recall_score(y_true, y_pred)
    f1score_ = f1_score(y_true, y_pred)

    if data_category == 'train':

        out = {"Train accuracy Score" : accuracy_, 
            "Train precision Score" :precision_, 
            "Train recall Score" : recall_, 
            "Train f1 Score" : f1score_}
    else:
        out = {"Test accuracy Score" : accuracy_, 
            "Test precision Score" :precision_, 
            "Test recall Score" : recall_, 
            "Test f1 Score" : f1score_}
    return out


In [None]:
def linear_model(c_values = range(1, 101, 10), model_tag = None):

    for c_value in c_values:

        with mlflow.start_run():
            
            mlflow.set_tag('Developer', 'Godwin')
            mlflow.set_tag('Model Type', model_tag)
            mlflow.set_tag('Model Name', 'Logistic Regression')
            mlflow.log_param('C Value', c_value)

            model = LogisticRegression(C = c_value)
            model.fit(train_x, train_y)

            train_pred = model.predict(train_x)
            train_output_eval = evaluation(train_y, train_pred, 'train')
            test_pred = model.predict(test_x)
            test_output_eval = evaluation(test_y, test_pred, 'test')   
            data.to_csv('new_data.csv', header=False)    
            
            mlflow.log_metrics(train_output_eval)
            mlflow.log_metrics(test_output_eval)
            mlflow.log_artifact('new_data.csv', 'data.csv')

In [13]:
def single_tree_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('Model Type', 'Base Model')
        mlflow.set_tag("model", "Decision Tree")
        mlflow.log_params(params)

        model = DecisionTreeClassifier(**params)
        model.fit(train_x, train_y)
        train_pred = model.predict(train_x)
        train_output_eval = evaluation(train_y, train_pred, 'train')
        test_pred = model.predict(test_x)
        test_output_eval = evaluation(test_y, test_pred, 'test')   
        data.to_csv('new_data.csv', header=False)    
        mlflow.log_metrics(train_output_eval)
        mlflow.log_metrics(test_output_eval)
        mlflow.log_artifact('new_data.csv', 'data.csv')

    return {"loss": test_output_eval['Test accuracy Score'], 'status': STATUS_OK}

def random_forest_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('Model Type', 'Base Model')
        mlflow.set_tag("model", "Random Forest")
        mlflow.log_params(params)

        model = RandomForestClassifier(**params)
        model.fit(train_x, train_y)
        train_pred = model.predict(train_x)
        train_output_eval = evaluation(train_y, train_pred, 'train')
        test_pred = model.predict(test_x)
        test_output_eval = evaluation(test_y, test_pred, 'test')   
        data.to_csv('new_data.csv', header=False)    
        mlflow.log_metrics(train_output_eval)
        mlflow.log_metrics(test_output_eval)
        mlflow.log_artifact('new_data.csv', 'data.csv')

    return {"loss": test_output_eval['Test accuracy Score'], 'status': STATUS_OK}


def single_tree():

    space = {
    "max_depth": hp.randint("max_depth", 1, 15),
    'min_samples_split': hp.randint("min_samples_split", 2, 15),
    'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    }

    best_result = fmin(
    fn= single_tree_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
    )
    return best_result

def random_forest(): 

    space = {
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'min_samples_split': hp.randint("min_samples_split", 2, 15),
    'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    }

    best_result = fmin(
    fn=random_forest_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
    )
    return best_result


In [16]:
train = xgb.DMatrix(train_x, label=train_y)
valid = xgb.DMatrix(test_x, label=test_y)
def xgboost_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('Model Type', 'Base Model')
        mlflow.set_tag("model", "Xgboost")
        mlflow.log_params(params)
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        train_pred = model.predict(train)
        train_output_eval = evaluation(train_y, train_pred, 'train')
        test_pred = model.predict(valid)
        test_output_eval = evaluation(test_y, test_pred, 'test')   
        data.to_csv('new_data.csv', header=False)    
        mlflow.log_metrics(train_output_eval)
        mlflow.log_metrics(test_output_eval)
        mlflow.log_artifact('new_data.csv', 'data.csv')
    return {'loss': test_output_eval['Test accuracy Score'], 'status': STATUS_OK}


def xgboost_dev():

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'binary:logistic',
        'seed': 42
    }

    best_result = fmin(
        fn= xgboost_objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=Trials()
    )
    return best_result

In [10]:
linear_model()

In [11]:
single_tree()

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:59<00:00,  1.19s/trial, best loss: 0.7370812038614424]


{'criterion': 0, 'max_depth': 1, 'min_samples_leaf': 8, 'min_samples_split': 3}

In [14]:
random_forest()

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [04:18<00:00,  5.17s/trial, best loss: 0.7370812038614424]


{'criterion': 1,
 'max_depth': 65.0,
 'min_samples_leaf': 7,
 'min_samples_split': 8,
 'n_estimators': 3}

In [17]:
xgboost_dev()

[0]	validation-logloss:0.65273                        
[1]	validation-logloss:0.61922                        
[2]	validation-logloss:0.59169                        
[3]	validation-logloss:0.56816                        
[4]	validation-logloss:0.54839                        
[5]	validation-logloss:0.53190                        
[6]	validation-logloss:0.51766                        
[7]	validation-logloss:0.50551                        
[8]	validation-logloss:0.49503                        
[9]	validation-logloss:0.48650                        
[10]	validation-logloss:0.47842                       
[11]	validation-logloss:0.47223                       
[12]	validation-logloss:0.46715                       
[13]	validation-logloss:0.46259                       
[14]	validation-logloss:0.45916                       
[15]	validation-logloss:0.45631                       
[16]	validation-logloss:0.45293                       
[17]	validation-logloss:0.45112                       
[18]	valid

ERROR [hyperopt.fmin] job exception: Classification metrics can't handle a mix of binary and continuous targets


  0%|          | 0/50 [01:43<?, ?trial/s, best loss=?]


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
def metric(actual, predicted, t):

    accuracy = (predicted == actual).mean()
    actual_positive = (actual == 1)
    actual_negative = (actual == 0)

    predicted_positive = (predicted >= t)
    predicted_negative = (predicted < t)



    tp = (actual_positive & predicted_positive).sum()
    tn = (actual_negative & predicted_negative).sum()
    fp = (actual_negative & predicted_positive).sum()
    fn = (actual_positive & predicted_negative).sum()

    tpr = tp/ (tp + fn)
    fpr = fp/ (fp + tn)

    precision = tp/(tp + fp)
    recall = tp/(tp +fn)
    f1_score = 2 * ((precision * recall)/ (precision + recall))

    return tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score#, accuracy

In [None]:
def model_rates(y_test, prediction):

    actual_positive = (y_test == 1)
    actual_negative = (y_test == 0)

    predicted_positive = (prediction >= t)
    predicted_negative = (prediction < t)

    true_positive = (actual_positive & predicted_positive).sum()
    true_negative = (actual_negative & predicted_negative).sum()
    false_positive = (actual_negative & predicted_positive).sum()
    false_negative = (actual_positive & predicted_negative).sum()

    score.append((t,true_positive, false_positive, false_negative, true_negative))

    df = pd.DataFrame(score, columns= ['threshold','true_positive', 'false_positive', 'false_negative', 'true_negative'])
    df.true_positive_rate = df.true_positive/(df.true_positive + df.false_negative)
    df.false_positive_rate = df.false_positive/(df.true_negative + df.false_positive)

    return df

In [103]:
dv = DictVectorizer(sparse = False)
def train(data, y, c):
    dv.fit(data[categorical_col + numerical_col].to_dict(orient = 'records'))
    X_train = dv.transform(data[categorical_col + numerical_col].to_dict(orient = 'records'))

    model = LogisticRegression(C = c, max_iter = 1000)
    model.fit(X_train, y)
    return dv, model

def predict(data, dv, model):
    X_test = dv.transform(data[categorical_col + numerical_col].to_dict(orient = 'records'))
    prediction = model.predict_proba(X_test)[:,1]
    return prediction

In [104]:

dv, model = train(train_data, y_train, c = 1)
prediction = predict(test_data, dv, model)

tn, fp, fn, tp, precision, recall, tpr, fpr, f1_score = metric(y_test, prediction, 0.5)
cm = np.array([[tn, fp], [fn, tp]])

In [105]:
roc_auc_score(y_test,prediction)

0.8581973739803048

In [66]:
import pickle

out = 'Churn.bin'

with open(out, 'wb') as f:
    pickle.dump((dv,model), f)

In [68]:
with open(out, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [111]:
test = data.iloc[8].to_dict()