In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
##For this churn problem - the data is being loaded from https://www.kaggle.com/blastchar/telco-customer-churn

file = '../WA_Fn-UseC_-Telco-Customer-Churn.csv'

churn_data = pd.read_csv(file)

In [13]:
churn_data.columns = [col.title() for col in churn_data.columns]

In [15]:
churn_data.columns

Index(['Customerid', 'Gender', 'Seniorcitizen', 'Partner', 'Dependents',
       'Tenure', 'Phoneservice', 'Multiplelines', 'Internetservice',
       'Onlinesecurity', 'Onlinebackup', 'Deviceprotection', 'Techsupport',
       'Streamingtv', 'Streamingmovies', 'Contract', 'Paperlessbilling',
       'Paymentmethod', 'Monthlycharges', 'Totalcharges', 'Churn'],
      dtype='object')

In [None]:
#Need to build a baseline first. If a customer is within 10 months of a tenure, with a monthly contract with > 80$ of 
#charges - its possible the customer is going to churn.

def baseline_pred(X,y=None,score=False):
    y_pred = np.zeroes(X.shape[0])
    
    for idx, x in X.iterrows():
        if(x.Contract == 'Month-to-month' &
           x.Tenure < 10 & x.Monthlycharges >= 80):
            y_pred.iloc[idx] = 1
            
    if score:
        if y == None:
            ValueError('Observed output cannot be empty for calculating score')
        else:
            hit_sum = [1 if y_in == y_hat else 0 for (y_in, y_hat) in zip(y,y_pred)].sum()
            print('accuracy is {.2f}'.format(hit_sum/X.shape[0]))
            

Selecting a logistic regression model first as its a simpler model. 

In [24]:
#converting to categorical data - general utility function
# converts object features with less than 10 unique values into categorical values
# converts these columns into dummy data, appends to the original data, drops the original category column

def transform_categorical_data(df, drop_orgnl = True, drop_first=True, convert_ascat = True):
    category_cols = [col for col in df.columns if (df[col].nunique() < 10)]
    
    #converting to categorical data makes the get_dummies run faster
    if convert_ascat == True:
        df[category_cols] = df[category_cols].astype('category')
        
    dummy_data = pd.get_dummies(df[category_cols],prefix=category_cols,drop_first=drop_first)
    #concatenate the dummy data and dataframe. Drop the original columns 
    
    df_cat = pd.concat([df,dummy_data],axis=1)
    if drop_orgnl == True:
        df_cat.drop(columns=category_cols, inplace = True)
    
    return df_cat

In [56]:
# need to generalize this function with any model and its additional parameters

def logitreg_traintestsplit(X, Y, size = 0.4, random_st = 16, solver='liblinear', c_vals=[0.10,1.0,100]):
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=size, random_state = random_st)
    
    score_df = DataFrame(columns=['C-Value','Training score','Testing score'])
    
    for c in c_vals:
        logreg = LogisticRegression(solver=solver, C=c).fit(X_train,y_train)
       
        score_df = score_df.append({'C-Value':c, 'Training score':logreg.score(X_train, y_train), 
                     'Testing score':logreg.score(X_test, y_test)}, ignore_index=True)
    
    return score_df, logreg

In [58]:
#applying logistic regression to whole data
churn_data['Totalcharges'] = pd.to_numeric(churn_data['Totalcharges'], errors='coerce').fillna(0)
churn_data_tf = transform_categorical_data(churn_data)

X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes


In [59]:
display(logitreg_traintestsplit(X,Y)[0])

Unnamed: 0,C-Value,Training score,Testing score
0,0.1,0.807574,0.803407
1,1.0,0.805917,0.803407
2,100.0,0.805917,0.804116


In [30]:
#Lets take out Total charges, Phone service, 
drop_cols = ['Phoneservice', 'Totalcharges']
churn_data_tf = transform_categorical_data(churn_data.drop(columns= drop_cols), convert_ascat = False)
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes
display(logitreg_traintestsplit(X,Y)[0])

Unnamed: 0,C-Value,Training score,Testing score
0,0.1,0.806154,0.801987
1,1.0,0.806391,0.800213
2,100.0,0.807574,0.801278


In [31]:
#dropping MultipleLines instead of phone service
drop_cols = ['Multiplelines', 'Totalcharges']
churn_data_tf = transform_categorical_data(churn_data.drop(columns= drop_cols), convert_ascat = False)
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes
display(logitreg_traintestsplit(X,Y)[0])

Unnamed: 0,C-Value,Training score,Testing score
0,0.1,0.807101,0.800568
1,1.0,0.807337,0.798439
2,100.0,0.807101,0.798793


In [35]:
#dropping streaming tv and streaming movies 
drop_cols += ['Streamingmovies', 'Streamingtv']
churn_data_tf = transform_categorical_data(churn_data.drop(columns= drop_cols), convert_ascat = False)
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes
display(logitreg_traintestsplit(X,Y)[0])

Unnamed: 0,C-Value,Training score,Testing score
0,0.1,0.804734,0.800213
1,1.0,0.806627,0.798793
2,100.0,0.806627,0.797019


Not much difference between C values, so taking C=1.0 lets look at confusion matrix for sensitivity and specificity

In [55]:
drop_cols

['Multiplelines', 'Totalcharges', 'Streamingmovies', 'Streamingtv']

In [68]:
#doing some RFE 
from sklearn.feature_selection import RFE
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

churn_data_tf = transform_categorical_data(churn_data, convert_ascat = False)
X, Y = churn_data_tf.iloc[:, 1:-1], churn_data_tf.Churn_Yes

X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.4, random_state=16)

logreg = LogisticRegression().fit(X_train, y_train)
selector = RFE(logreg, n_features_to_select=10).fit(X_train,y_train)

print('\nTraining score {} and testing score {}'.
      format(selector.score(X_train,y_train), selector.score(X_test,y_test)))

display(DataFrame({'Features':X_train.columns, 'Support':selector.support_, 
                   'Rank':selector.ranking_}).sort_values(by=['Rank']))




Training score 0.7810650887573964 and testing score 0.7778566359119943


Unnamed: 0,Features,Support,Rank
14,Onlinebackup_No internet service,True,1
20,Streamingtv_No internet service,True,1
19,Techsupport_Yes,True,1
15,Onlinebackup_Yes,True,1
13,Onlinesecurity_Yes,True,1
10,Internetservice_Fiber optic,True,1
22,Streamingmovies_No internet service,True,1
24,Contract_One year,True,1
25,Contract_Two year,True,1
27,Paymentmethod_Credit card (automatic),True,1


Results from RFE seem counter intuitive to observations. Eg. Tenure and charges doesnt seem to make into important features list. Could this be due to any less imbalance in data ? Or lack of proper features attributing to data ?

In [69]:
from sklearn.neighbors import KNeighborsClassifier

knn_neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

print('\nTraining score {} and testing score {}'.
      format(knn_neigh.score(X_train,y_train), knn_neigh.score(X_test,y_test)))



Training score 0.8634319526627219 and testing score 0.7491128459900639


K nearest neighbours improved the bias but with a trade off of on regularization.

In [70]:
#lets use a non parametric model to see if the features preference gets changed.
from sklearn.ensemble import RandomForestClassifier

rnd_frst = RandomForestClassifier().fit(X_train,y_train)

print('\nTraining score {} and testing score {}'.
      format(knn_neigh.score(X_train,y_train), knn_neigh.score(X_test,y_test)))



Training score 0.8634319526627219 and testing score 0.7491128459900639


In [71]:
rnd_frst.n_features_

30

In [73]:
display(pd.DataFrame({'imp':rnd_frst.feature_importances_,'feat':X_train.columns}).sort_values(by=['imp']))

Unnamed: 0,imp,feat
16,0.001266,Deviceprotection_No internet service
11,0.001466,Internetservice_No
20,0.001573,Streamingtv_No internet service
14,0.003074,Onlinebackup_No internet service
7,0.003774,Phoneservice_Yes
8,0.0038,Multiplelines_No phone service
22,0.005226,Streamingmovies_No internet service
18,0.006203,Techsupport_No internet service
12,0.00729,Onlinesecurity_No internet service
29,0.012234,Paymentmethod_Mailed check


Though accuracy hasnt been significant the features importance is in relavance to observsations.