In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [3]:
##For this churn problem - the data is being loaded from https://www.kaggle.com/blastchar/telco-customer-churn

file = '../WA_Fn-UseC_-Telco-Customer-Churn.csv'

churn_data = pd.read_csv(file)

In [13]:
churn_data.columns = [col.title() for col in churn_data.columns]

In [15]:
churn_data.columns

Index(['Customerid', 'Gender', 'Seniorcitizen', 'Partner', 'Dependents',
       'Tenure', 'Phoneservice', 'Multiplelines', 'Internetservice',
       'Onlinesecurity', 'Onlinebackup', 'Deviceprotection', 'Techsupport',
       'Streamingtv', 'Streamingmovies', 'Contract', 'Paperlessbilling',
       'Paymentmethod', 'Monthlycharges', 'Totalcharges', 'Churn'],
      dtype='object')

Our baseline is going to be randomly assigning a class - 0/1 for No churn/Churn respectively

I am selecting three models here - linear regression for its class probability estimations, knn as it based on distances and random forest for its entropy. We can later make ensemble models combining them.

In [112]:
churn_data.dtypes

Customerid            object
Gender              category
Seniorcitizen       category
Partner             category
Dependents          category
Tenure                 int64
Phoneservice        category
Multiplelines       category
Internetservice     category
Onlinesecurity      category
Onlinebackup        category
Deviceprotection    category
Techsupport         category
Streamingtv         category
Streamingmovies     category
Contract            category
Paperlessbilling    category
Paymentmethod       category
Monthlycharges       float64
Totalcharges         float64
Churn               category
dtype: object

In [113]:
#reengineering some features to reduce the distance issues with some classification models.
churn_data['Tenure_in_yrs'] = churn_data['Tenure']//12
churn_data['Monthlycharges_100s'] = round(churn_data['Monthlycharges']/100,2)
churn_data['Totalcharges'] = pd.to_numeric(churn_data['Totalcharges'], errors='coerce').fillna(0)
churn_data['Totalcharges_1000s'] = round(churn_data['Totalcharges']/1000, 2)

In [None]:
#replace 'No Internet Service' and 'No Phone Service'  as No
churn_data = churn_data.replace{'No phone service':'No', 'No internet service':'No'}

In [None]:
def transform_categorical_data(df, cols, drop_orgnl = True, drop_first=True, convert_ascat = True):
    """
    Transform categorical data in the columns into dummy data using pandas.
    ----------
    df : Dataframe 
    cols : categorical columns to convert into.
    drop_orgnl : Boolean, if set to True drops the original column.
    drop_first : Boolean if True drops one of the dummy column
    convert_ascat : convert columns into categorical.
    
    Return
    -------
    Dataframe with the original frame concatenated with dummy data
    """

    if convert_ascat == True:
        df[cols] = df[cols].astype('category')
    
    dummy_data = pd.get_dummies(df[cols],prefix=cols,drop_first=drop_first)
    #concatenate the dummy data and dataframe. Drop the original columns 
    
    df_cat = pd.concat([df,dummy_data],axis=1)
    if drop_orgnl == True:
        df_cat.drop(columns=cols, inplace = True)
    
    return df_cat

In [None]:
churn_data_tf = transform_categorical_data(churn_data)

In [None]:
#creating a baseline predictor that randomly assigns a customer as churn or no churn
def baseline_pred(X, y_true):
    from numpy.random import rand, randint
    y_pred = [randint(0,2) for i in range(X.shape[0])]
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred)
    accuracy = round(tp/X.shape[0],3)
    print('baseline accuracy is {}'.format(accuracy))    

baseline_pred(X, Y)

In [152]:
#defining the metrics we want to use to compare for each model
def classification_metrics(y_true, y_pred, model, y_prob, ret_results = True, print_res = True):
    """
    Calculates different classification metrics for the model, and prints out the results
    ----------
    y_true : actual output values 
    y_pred : predicted values based on the model
    model : Classification model with already training data fitted on the model
    ret_results : To return different metrics as a dictionary
    print_res : prints out the metrics as well as roc curve.
    
    Return
    -------
    Returns a dictionary with different metrics if ret_results is set to True
    """
    import matplotlib.pyplot as plt
    
    #balanced accuracy score
    from sklearn.metrics import balanced_accuracy_score, confusion_matrix, accuracy_score, roc_curve, auc
    acc = round(accuracy_score(y_true, y_pred),3)
    bal_acc = round(balanced_accuracy_score(y_true, y_pred),3)
    #sensitivity and specificity - identifiying churning customers more important than stable customers
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = round(tp/(tp+fn), 3)
    specificity = round(tn/(tn+fp), 3)
    
    #auc score
    fpr, tpr, thresholds = roc_curve(y_true, y_prob[:,1])
    auc_score = round(auc(fpr, tpr), 3)
    res_dict = {'Accuracy':acc, 'Balanced acc':bal_acc, 'sensitivity': sensitivity, 
                'specificity':specificity, 'auc':auc_score}
    
    if print_res == True:
        print(model)
        print(res_dict)
        
        plt.grid()
        plt.plot(fpr,tpr)
        plt.fill_between(fpr, tpr, color = 'silver')
        plt.annotate('Area under Curve {}'.format(auc_score), (0,0.9), fontsize = 10)
        plt.title('ROC Curve')
        plt.show()
    
    if ret_results == True:
        return res_dict


In [151]:
# need to generalize this function with any model and its additional parameters


def model_CVsplit_metrics(X, Y, model, size = 0.4, random_st = 16):
    """
    Fits the classification model with a train test split and calculates and displays the metrics
    ----------
    X : input data set 
    Y : output labels 
    model : Classification model with parameters tuned.
    size : Size of the test for the train_test_split
    random_st : random state value
    
    Return
    -------
    Returns the model with fitted data
    """
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=size, random_state = random_st)
    
    model = model.fit(X_train,y_train)
    
    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)
       
    print('test score {}'.format(test_score))
    print('training score {}'.format(train_score))
    
    y_hat = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    
    classification_metrics(y_true = y_test, y_pred=y_hat, model=model, y_prob=y_proba, ret_results= False)
    return model

In [None]:
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes

In [154]:
#applying logistic regression
logit = LogisticRegression(solver='liblinear')
logit = model_CVsplit_metrics(X, Y, model= logit)

ValueError: Found input variables with inconsistent numbers of samples: [5, 7043]

In [None]:
knn_model = KNeighborsClassifier(weights='distance')
knn_model = model_CVsplit_metrics(X,Y, knn_model)

In [None]:
rnd_forest = RandomForestClassifier()
rnd_forest = model_CVsplit_metrics(X,Y,rnd_forest)