In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

from sklearn.linear_model import LogisticRegression


In [2]:
##For this churn problem - the data is being loaded from https://www.kaggle.com/blastchar/telco-customer-churn

file = './WA_Fn-UseC_-Telco-Customer-Churn.csv'

churn_data = pd.read_csv(file)

In [5]:
churn_data.dtypes

customerID            object
gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges          object
Churn               category
dtype: object

In [None]:
#Need to build a baseline too 

Selecting a logistic regression model here as its a simpler model. 

In [27]:
#converting to categorical data - general utility function
# converts object features with less than 10 unique values into categorical values
# converts these columns into dummy data, appends to the original data, drops the original category column

def transform_categorical_data(df, drop_orgnl = True, drop_first=True):
    category_cols = [col for col in df.columns if (df[col].nunique() < 10)]
    #print('these are the category cols {}'.format(category_cols))
    #converting to categorical data makes the get_dummies run faster
    df[category_cols] = df[category_cols].astype('category')
    dummy_data = pd.get_dummies(df[category_cols],prefix=category_cols,drop_first=drop_first)
    #concatenate the dummy data and dataframe. Drop the original columns 
    
    df = pd.concat([df,dummy_data],axis=1)
    if drop_orgnl == True:
        df.drop(columns=category_cols, inplace = True)
    
    return df

In [10]:
churn_data_tf = transform_categorical_data(churn_data)

churn_data_tf.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,7590-VHVEG,0,1,29.85,29.85,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0
1,5575-GNVDE,0,34,56.95,1889.5,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2,3668-QPYBK,0,2,53.85,108.15,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,1
3,7795-CFOCW,0,45,42.3,1840.75,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,9237-HQITU,0,2,70.7,151.65,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1


In [6]:
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes

In [9]:
X['TotalCharges']= pd.to_numeric(X.TotalCharges, errors = 'coerce').fillna(0)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.4, random_state = 16)


def logitregression_model(X_df,y_df, c_values, solver='liblinear'):
    score_df = DataFrame(columns=['C-Value','Training score','Testing score'])
    
    for c in c_values:
        logreg = LogisticRegression(solver=solver, C=c).fit(X_df[0],y_df[0])
       
        score_df = score_df.append({'C-Value':c, 'Training score':logreg.score(X_df[0],y_df[0]), 
                     'Testing score':logreg.score(X_df[1],y_df[1])}, ignore_index=True)
    
    return score_df

print(logitregression_model(X_df =[X_train,X_test], y_df = [y_train,y_test], c_values = [0.01,0.1, 1, 10, 20, 30, 100]))


   C-Value  Training score  Testing score
0     0.01        0.804734       0.803407
1     0.10        0.807574       0.803407
2     1.00        0.806154       0.803407
3    10.00        0.805917       0.804116
4    20.00        0.805917       0.804116
5    30.00        0.805917       0.804116
6   100.00        0.805917       0.804116


The optimum C value for best training and test score seems to be C = 10 with 80.5% and 80.3% accuracy respectively. The accuracy scores are very close, so this could be an underfitting here.

In [13]:
#lets drop total charges , as its correlated with tenure and monthly charges.
X.drop(columns='TotalCharges',inplace=True)

KeyError: "['TotalCharges'] not found in axis"

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.4, random_state = 16)

print(logitregression_model(X_df =[X_train,X_test], y_df = [y_train,y_test], c_values = [0.01,0.1, 1, 10, 20, 30, 100]))


   C-Value  Training score  Testing score
0     0.01        0.805444       0.802697
1     0.10        0.805207       0.801632
2     1.00        0.806154       0.800213
3    10.00        0.808047       0.801987
4    20.00        0.806627       0.801278
5    30.00        0.807574       0.801278
6   100.00        0.806864       0.801632


Performance is slightly improved for training score. For C=10 its 80.8% training score & 80.19% testing score. Bias is improved but variance is reduced. 

In [22]:
#Lets try recursive feature elimation
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.4, random_state=16)

logreg = LogisticRegression(C=10).fit(X_train, y_train)
selector = RFE(logreg, n_features_to_select=7).fit(X_train,y_train)
                                                    
print('feature mask ::  {} \n Ranking of feature {}'.format(selector.support_, selector.ranking_))
#print('\ngrid score :: {}'.format(selector.grid_scores_))
print('\nTraining score {} and testing score {}'.format(selector.score(X_train,y_train), selector.score(X_test,y_test)))
                                                    

feature mask ::  [False False False False False False False False False  True False False
  True False False False False False False  True False  True False  True
  True False  True False False] 
 Ranking of feature [14 18 21 22 23  6  8 17 16  1 10  7  1  3  2 12 19 13  4  1 15  1 11  1
  1  9  1  5 20]

Training score 0.7801183431952663 and testing score 0.7775017743080199


In [32]:
churn_data_ohe = transform_categorical_data(churn_data, drop_orgnl= True, drop_first=False)
churn_data_ohe.drop(columns='TotalCharges', inplace = True)

X_ohe, Y_ohe = churn_data_ohe.iloc[:, 1:-1], churn_data_ohe.Churn_Yes

#drop Churn_no
X_ohe.drop(columns='Churn_No', inplace = True)

In [35]:
def logitreg_RFE(X,Y, test_size=0.4, rndn_state=16, num_features = 7):
    X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=test_size, random_state=rndn_state)
    
    logreg = LogisticRegression(C=10).fit(X_train, y_train)
    selector = RFE(logreg, n_features_to_select=7, ke).fit(X_train,y_train)
                                                
    print('feature mask ::  {} \n Ranking of feature {}'.format(selector.support_, selector.ranking_))
    print('\nTraining score {} and testing score {}'.format(selector.score(X_train,y_train), selector.score(X_test,y_test)))
    return (logreg, selector)

logreg, sel_rfe = logitreg_RFE(X_ohe, Y_ohe )

feature mask ::  [False False False False False False False False False False False False
 False False False  True False False False False  True False False False
 False False False False  True False False  True False False  True False
  True False  True False False False False False False] 
 Ranking of feature [36 30 33 34 38 16 11 31 17 37 19 10 35 15 25  1 14  9 23  8  1 27  6  5
 24  3 18 29  1  7 21  1 22 13  1 26  1 32  1 39 12  4  2 20 28]

Training score 0.77301775147929 and testing score 0.7792760823278921


In [38]:
X_train.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'gender_Male',
       'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [43]:
#lets try SVC - support vector classfier
from sklearn.svm import SVC
svc_model = SVC(kernel ='rbf').fit(X_train,y_train)
print('training score {}'.format(svc_model.score(X_train,y_train)))
print('testing score {}'.format(svc_model.score(X_test,y_test)))

training score 0.8168047337278107
testing score 0.7945351312987935
