In [2]:
import pandas as pd
import numpy as np
# filter warnings
import warnings
warnings.filterwarnings("ignore")
from  sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

In [3]:
# Executing datamerge function, to create final dataset
%run ./datamerge.py

In [4]:
final_telco_df =mergeData() 

In [5]:
final_telco_df.head()

Unnamed: 0,Customer ID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Satisfaction Score,Customer Status,Churn Label,Churn Value,Churn Score,CLTV,Churn Category,Churn Reason,ID,Population
0,8779-QRDMV,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Male,...,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data,21,68701
1,4737-AQCPU,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Male,...,3,Stayed,No,0,42,4658,,,21,68701
2,5043-TRZWM,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Female,...,5,Joined,No,0,34,3503,,,21,68701
3,8165-CBKXO,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Male,...,3,Stayed,No,0,46,5748,,,21,68701
4,9979-RGMZT,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Female,...,3,Stayed,No,0,38,5545,,,21,68701


In [6]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(final_telco_df.loc[:, final_telco_df.columns != 'Churn Label']
                                                , final_telco_df[['Churn Label']], test_size=0.1, random_state=42)

In [7]:
Ytrain.columns

Index(['Churn Label'], dtype='object')

# Feature engineering and processing on Train data

In [8]:
# Since Count, Country ,State and Quarter are constant, we are dropping them from further analysis. 
def dropfeatures(df):
    '''
    Dropping constant columns
    '''
    df = df.drop(['Count','Country','State','Quarter'],axis=1)
    return df

In [9]:
def regionType(df):
    '''
    Categorizing different cities into 4 major sub groups based on population
    '''
    firstq = df[['Population']].describe().loc['25%'].values[0]
    meanpop = df[['Population']].describe().loc['mean'].values[0]
    thirdq = df[['Population']].describe().loc['75%'].values[0]
    df['RegionType']= df['Population'].apply(lambda x: 'Small Urban' if x < firstq
                          else ('Medium Sized Urban' if (x >= firstq and x  < meanpop)else('Metropolitan' if 
                          (x >= meanpop and x  < thirdq) else 'Large Metropolitan')))                                                                                                                             
    return df

# Part Worths of Charges for each utility

In [10]:
def partworthutility(df):
    '''
    Calculating percentage contributions by each subscription serivce type to the total Monthly charges
    '''
    # Total Charges here refer to Monthly Charge * Tenure in Months
    df['Total_Charges_Allservices']  = (df['Monthly Charge']*df['Tenure in Months'])+df['Total Long Distance Charges']+df['Total Extra Data Charges']
    df['Perc_Services'] = df['Total Charges']/df['Total_Charges_Allservices']
    df['Perc_LongDistance'] =df['Total Long Distance Charges']/df['Total_Charges_Allservices']
    df['Perc_AdditionalData'] =df['Total Extra Data Charges']/df['Total_Charges_Allservices']
    return df

# Engagement Features

In [11]:
names = ['Phone Service','Online Security', 'Online Backup', 'Device Protection Plan','Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data','Internet Service','Multiple Lines','Premium Tech Support','Paperless Billing',
         'Under 30','Senior Citizen', 'Married', 'Dependents','Referred a Friend']  
# ConvertValues(names,Xtrain)

In [12]:
def ConvertValues(names,df):
    for i in names:
        df[i] = np.where(df[i] =='Yes',1,0)
    return df

In [20]:
def engagement(df):
    '''
    Function to calculate the number of services under each category a customer has subscribed to
    '''
    # Total Charges here refer to Monthly Charge * Tenure in Months
    df = ConvertValues(names,df)
    df['No_of_Phone_Services'] = df['Phone Service']+df['Multiple Lines']
    df['No_of_Security_Services'] = df['Online Security']+df['Online Backup']+df['Device Protection Plan']+df['Premium Tech Support']
    df['No_of_Media_Services'] = df['Streaming TV']+df['Streaming Movies'] + df['Streaming Music']
    df['No_of_Internet_Services'] = df['Unlimited Data']+df['Internet Service']
    return df

# Binning Age Variable

Creating Additional Age bins

Under 30 - Already exists

30 to 45 - Young family

45 to 60 - Grown family

more than 60 - Senior Citizen - Already exists

In [14]:
def agebuckets(df):
    df['Young_Family'] = np.where((df['Age'] >= 30) & (df['Age'] <45),1,0)
    df['Grown_Family'] = np.where((df['Age'] >= 45) & (df['Age'] <60),1,0)    
    return df

In [15]:
train_columns = ['Gender','Under 30','Senior Citizen', 'Married', 'Dependents','Referred a Friend','Tenure in Months', 'Offer',
                'Avg Monthly GB Download','Internet Type','Contract','Paperless Billing','Payment Method','RegionType','Perc_Services', 
                 'Perc_LongDistance', 'Perc_AdditionalData','No_of_Phone_Services', 'No_of_Security_Services','No_of_Media_Services', 
                 'No_of_Internet_Services', 'Young_Family','Grown_Family','Satisfaction Score','Total Revenue']

In [17]:
def dummies_cat(df):
    df_cat = df[['Gender','Internet Type','Contract','Payment Method','RegionType','Offer']]
    df_cat = pd.get_dummies(df_cat)
    df1 = pd.concat([df.loc[:, ~df.columns.isin(['Gender','Internet Type','Contract','Payment Method','RegionType','Offer'])], df_cat],axis=1)
    return df1

In [21]:
def featureengg(df):
    df = dropfeatures(df)
    df = regionType(df)
    df = partworthutility(df)
    df = engagement(df)
    df = agebuckets(df)
    df = df[train_columns]
    df = dummies_cat(df)
    return df  

In [22]:
Xtrain1 = Xtrain.copy()
Xtrain = featureengg(Xtrain)

In [23]:
Xtrain.head()

Unnamed: 0,Under 30,Senior Citizen,Married,Dependents,Referred a Friend,Tenure in Months,Avg Monthly GB Download,Paperless Billing,Perc_Services,Perc_LongDistance,...,RegionType_Large Metropolitan,RegionType_Medium Sized Urban,RegionType_Metropolitan,RegionType_Small Urban,Offer_None,Offer_Offer A,Offer_Offer B,Offer_Offer C,Offer_Offer D,Offer_Offer E
1554,1,0,1,0,1,39,19,1,0.661743,0.318056,...,1,0,0,0,0,0,0,1,0,0
4817,1,0,1,1,1,41,0,0,0.329803,0.669659,...,0,0,1,0,0,0,1,0,0,0
5957,1,0,0,0,0,3,59,0,0.900768,0.0,...,0,0,1,0,1,0,0,0,0,0
5123,0,0,0,0,0,1,28,1,0.943112,0.056888,...,0,0,0,1,1,0,0,0,0,0
3217,0,0,0,0,0,1,22,0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,1


In [24]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6338 entries, 1554 to 860
Data columns (total 41 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Under 30                        6338 non-null   int32  
 1   Senior Citizen                  6338 non-null   int32  
 2   Married                         6338 non-null   int32  
 3   Dependents                      6338 non-null   int32  
 4   Referred a Friend               6338 non-null   int32  
 5   Tenure in Months                6338 non-null   int64  
 6   Avg Monthly GB Download         6338 non-null   int64  
 7   Paperless Billing               6338 non-null   int32  
 8   Perc_Services                   6338 non-null   float64
 9   Perc_LongDistance               6338 non-null   float64
 10  Perc_AdditionalData             6338 non-null   float64
 11  No_of_Phone_Services            6338 non-null   int32  
 12  No_of_Security_Services         

# Model Building

In [25]:
# Baseline Model - Logistic Regression
model = LogisticRegression()
print(model.fit(Xtrain, Ytrain))
print(model.score(Xtrain, Ytrain))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
0.9529820132533923


In [27]:
# performing KFold cross validation with 10 splits
scores = cross_val_score(model, Xtrain, Ytrain, cv=10)
print(scores.mean())
print(scores.std())

0.9514041094183723
0.006764941698987443


In [28]:
model1 = RandomForestClassifier()
print(model1.fit(Xtrain, Ytrain))
print(model1.score(Xtrain, Ytrain))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.995582202587567


In [33]:
# Naive-Bayes Classifier
model2 = GaussianNB()
print(model2.fit(Xtrain, Ytrain))
print(model2.score(Xtrain, Ytrain))

GaussianNB(priors=None, var_smoothing=1e-09)
0.8505837803723572


# Hyper-paramter tuning for Random Forest Model

In [30]:
from sklearn.model_selection import GridSearchCV
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperRF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(model1, hyperRF, cv = 5, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(Xtrain, Ytrain)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 46.6min finished


Best Model

In [31]:
bestF.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
bestF.best_score_

0.9607131587251498

# Running on model Hold out test data

In [36]:
Xtest=featureengg(Xtest)
ypred = bestF.predict(Xtest)
print(classification_report(Ytest,ypred))

              precision    recall  f1-score   support

          No       0.94      0.98      0.96       534
         Yes       0.93      0.81      0.87       171

    accuracy                           0.94       705
   macro avg       0.94      0.89      0.91       705
weighted avg       0.94      0.94      0.94       705

