# Finding the Best Balancing Technique by Fitting a Classifier on the Telecom Churn Dataset

<b> Implement all the initial steps </b>

In [1]:
# import packages
import pandas as pd
import numpy as np
import smote_variants as sv
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/churn.csv'

In [3]:
df = pd.read_csv(url_path)
df.head()

Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,No,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,No,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,No,137,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,No,84,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,No,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


<b> Normalize the numerical raw data using the MinMaxScaler() function </b>

In [4]:
scaler = MinMaxScaler()

In [5]:
df['alScaled'] = scaler.fit_transform(df['accountlength'].values.reshape(-1,1))
df['nvmmScaled'] = scaler.fit_transform(df['numbervmailmessages'].values.reshape(-1,1))
df['tdmScaled'] = scaler.fit_transform(df['totaldayminutes'].values.reshape(-1,1))
df['tdcScaled'] = scaler.fit_transform(df['totaldaycalls'].values.reshape(-1,1))
df['tdchScaled'] = scaler.fit_transform(df['totaldaycharge'].values.reshape(-1,1))
df['temScaled'] = scaler.fit_transform(df['totaleveminutes'].values.reshape(-1,1))
df['tecScaled'] = scaler.fit_transform(df['totalevecalls'].values.reshape(-1,1))
df['techScaled'] = scaler.fit_transform(df['totalevecharge'].values.reshape(-1,1))
df['tnmScaled'] = scaler.fit_transform(df['totalnightminutes'].values.reshape(-1,1))
df['tncScaled'] = scaler.fit_transform(df['totalnightcalls'].values.reshape(-1,1))
df['tnchScaled'] = scaler.fit_transform(df['totalnightcharge'].values.reshape(-1,1))
df['timScaled'] = scaler.fit_transform(df['totalintlminutes'].values.reshape(-1,1))
df['ticScaled'] = scaler.fit_transform(df['totalintlcalls'].values.reshape(-1,1))
df['tichScaled'] = scaler.fit_transform(df['totalintlcharge'].values.reshape(-1,1))
df['ncscScaled'] = scaler.fit_transform(df['numbercustomerservicecalls'].values.reshape(-1,1))

In [6]:
df.drop(['accountlength','numbervmailmessages','totaldayminutes','totaldaycalls',\
                'totaldaycharge','totaleveminutes','totalevecalls','totalevecharge',\
                'totalnightminutes','totalnightcalls','totalnightcharge','totalintlminutes',\
                'totalintlcalls','totalintlcharge','numbercustomerservicecalls'], axis=1, inplace=True)

<b> Create dummy data for the categorical variables </b>

In [7]:
dfCat = pd.get_dummies(df[['internationalplan','voicemailplan']])

<b> Separate the numerical data from the original data frame </b>

In [8]:
dfNum = df[['alScaled','nvmmScaled','tdmScaled','tdcScaled',\
                'tdchScaled','temScaled','tecScaled','techScaled',\
                'tnmScaled','tncScaled','tnchScaled','timScaled',\
                'ticScaled','tichScaled','ncscScaled']]

<b> Concatenate numerical data and dummy categorical data </b>

In [9]:
X = pd.concat([dfCat, dfNum], axis=1)
Y = df['churn']

<b> Split the earlier dataset into train and test sets </b>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [11]:
trainData = pd.concat([X_train, y_train], axis=1)
trainData.head()

Unnamed: 0,internationalplan_no,internationalplan_yes,voicemailplan_no,voicemailplan_yes,alScaled,nvmmScaled,tdmScaled,tdcScaled,tdchScaled,temScaled,tecScaled,techScaled,tnmScaled,tncScaled,tnchScaled,timScaled,ticScaled,tichScaled,ncscScaled,churn
4036,1,0,0,1,0.256198,0.5,0.609388,0.484848,0.60927,0.695628,0.894118,0.695891,0.395949,0.622857,0.396173,0.515,0.1,0.514815,0.222222,No
2883,1,0,1,0,0.504132,0.0,0.595733,0.29697,0.595716,0.652736,0.688235,0.652863,0.60557,0.56,0.605515,0.49,0.55,0.490741,0.111111,No
4162,0,1,1,0,0.012397,0.0,0.482788,0.581818,0.482764,0.362387,0.552941,0.362342,0.620506,0.491429,0.620709,0.71,0.2,0.709259,0.0,Yes
4640,1,0,1,0,0.450413,0.0,0.714936,0.551515,0.714859,0.5697,0.558824,0.569719,0.63038,0.4,0.630838,0.645,0.1,0.644444,0.111111,Yes
2430,1,0,0,1,0.491736,0.769231,0.364438,0.6,0.364458,0.681056,0.458824,0.681009,0.50557,0.691429,0.505909,0.78,0.15,0.77963,0.0,No


<b> For the undersampling method, find the index of the minority class and separate the minority class. After that, sample the majority class and make the majority dataset equal to the minority class. Concatenate both the minority and under-sampled majority class to form a new dataset. Shuffle the dataset and separate the X and Y variables </b>

In [12]:
ind = trainData[trainData['churn'] == 'Yes'].index
print(len(ind))

490


In [13]:
minData = trainData.loc[ind]
minData.shape

(490, 20)

In [14]:
ind1 = trainData[trainData['churn'] == 'No'].index
majData = trainData.loc[ind1]
majData.shape

(3010, 20)

In [15]:
majSample = majData.sample(n=len(ind), random_state=123)

In [16]:
balData = pd.concat([minData, majSample], axis=0)
balData.shape

(980, 20)

In [17]:
balData = shuffle(balData)

In [18]:
X_trainNew = balData.iloc[:, 0:19]
y_trainNew = balData['churn']

print(X_trainNew.shape)
print(y_trainNew.shape)

(980, 19)
(980,)


<b> Fit a logistic regression model on the under-sampled dataset </b>

In [19]:
churnModel1 = LogisticRegression()
churnModel1.fit(X_trainNew, y_trainNew)

LogisticRegression()

<b> For the SMOTE method, create the oversamplers </b>

In [20]:
oversampler = sv.SMOTE()

In [21]:
X_train_smote, y_train_smote = oversampler.sample(np.array(X_train), np.array(y_train))

2021-03-06 00:33:43,340:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


<b> Fit a logistic regression model using SMOTE </b>

In [22]:
churnModel2 = LogisticRegression()
churnModel2.fit(X_train_smote, y_train_smote)

LogisticRegression()

<b> Import the smote-variant library and instantiate the MSMOTE algorithm </b>

In [23]:
oversampler = sv.MSMOTE()

<b> Create the oversampled data </b>

In [24]:
X_train_msmote, y_train_msmote = oversampler.sample(np.array(X_train), np.array(y_train))

2021-03-06 00:33:43,730:INFO:MSMOTE: Running sampling via ('MSMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


<b> Fit the logistic regression model using the MSMOTE dataset </b>

In [25]:
churnModel3 = LogisticRegression()
churnModel3.fit(X_train_msmote, y_train_msmote)

LogisticRegression()

<b> Generate the three separate predictions for each model </b>

In [26]:
pred_us = churnModel1.predict(X_test)
pred_smote = churnModel2.predict(X_test)
pred_msmote = churnModel3.predict(X_test)

<b> Generate separate accuracy metrics, classification reports, and confusion matrices </b>

In [27]:
print(f'Accuracy with undersampling: {churnModel1.score(X_test, y_test)}')
print(f'Accuracy with SMOTE: {churnModel2.score(X_test, y_test)}')
print(f'Accuracy with MSMOTE: {churnModel3.score(X_test, y_test)}')

Accuracy with undersampling: 0.7946666666666666
Accuracy with SMOTE: 0.7826666666666666
Accuracy with MSMOTE: 0.798


Metrics for Random Undersampling

In [28]:
print(confusion_matrix(y_test, pred_us))

[[1032  251]
 [  57  160]]


In [29]:
print(classification_report(y_test, pred_us))

              precision    recall  f1-score   support

          No       0.95      0.80      0.87      1283
         Yes       0.39      0.74      0.51       217

    accuracy                           0.79      1500
   macro avg       0.67      0.77      0.69      1500
weighted avg       0.87      0.79      0.82      1500



Metrics with SMOTE

In [30]:
print(confusion_matrix(y_test, pred_smote))

[[1009  274]
 [  52  165]]


In [31]:
print(classification_report(y_test, pred_smote))

              precision    recall  f1-score   support

          No       0.95      0.79      0.86      1283
         Yes       0.38      0.76      0.50       217

    accuracy                           0.78      1500
   macro avg       0.66      0.77      0.68      1500
weighted avg       0.87      0.78      0.81      1500



Metrics with MSMOTE

In [32]:
print(confusion_matrix(y_test, pred_msmote))

[[1034  249]
 [  54  163]]


In [33]:
print(classification_report(y_test, pred_msmote))

              precision    recall  f1-score   support

          No       0.95      0.81      0.87      1283
         Yes       0.40      0.75      0.52       217

    accuracy                           0.80      1500
   macro avg       0.67      0.78      0.70      1500
weighted avg       0.87      0.80      0.82      1500



<b> Analyze the results and select the best method </b>

From the recall values, we see that SMOTE has the largest value of 76%. This means that 76% of customers who are likely to churn have been correctly identified by the model. Random undersampling and MSMOTE have lower recall values of 73% and 75%, respectively. We now have a situation where MSMOTE has the highest accuracy but a slightly lower recall value and SMOTE has the lowest accuracy measure but the highest recall value. In such a situation, we have to look at the f1 scores, which is a weighted score between precision and recall. From all the f1 scores, we see that MSMOTE has the highest f1 score of 52%, with SMOTE and random undersampling scoring 50% each. Therefore, we can select MSMOTE as the best technique for balancing for this context.