### Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from scipy import stats

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [2]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
churnData[["TotalCharges"]] = churnData[["TotalCharges"]].apply(pd.to_numeric, errors='coerce')
churnData[["TotalCharges"]].dtypes

TotalCharges    float64
dtype: object

In [5]:
churnData.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [6]:
round(churnData.isnull().mean()*100,2).sort_values(ascending=False)

TotalCharges        0.16
gender              0.00
SeniorCitizen       0.00
Partner             0.00
Dependents          0.00
tenure              0.00
PhoneService        0.00
OnlineSecurity      0.00
OnlineBackup        0.00
DeviceProtection    0.00
TechSupport         0.00
StreamingTV         0.00
StreamingMovies     0.00
Contract            0.00
MonthlyCharges      0.00
Churn               0.00
dtype: float64

In [7]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean())

In [8]:
churnData['TotalCharges'].isnull().sum()

0

In [9]:
churnData.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [10]:
X = churnData[['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

In [11]:
transformer = Normalizer() 
transformer.fit(X)
x_normalized = transformer.transform(X)
x_normalized.shape

(7043, 4)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [13]:
model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1025
         Yes       0.64      0.46      0.54       384

    accuracy                           0.78      1409
   macro avg       0.73      0.68      0.70      1409
weighted avg       0.77      0.78      0.77      1409



In [14]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [15]:
#upsampling with smote
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=100)

model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.74      0.74      0.74      1052
         Yes       0.73      0.73      0.73      1018

    accuracy                           0.73      2070
   macro avg       0.73      0.73      0.73      2070
weighted avg       0.73      0.73      0.73      2070



In [17]:
#downsampling with Tomeklinks
tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X, y)
y_tl.value_counts()

No     4620
Yes    1869
Name: Churn, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.2, random_state=100)

model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.83      0.90      0.86       891
         Yes       0.73      0.59      0.65       407

    accuracy                           0.80      1298
   macro avg       0.78      0.74      0.76      1298
weighted avg       0.80      0.80      0.80      1298



In [19]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [20]:
#downsampling
category_no = churnData[churnData['Churn'] == 'No']
category_yes = churnData[churnData['Churn'] == 'Yes']

In [21]:
category_no = category_no.sample(len(category_yes))
print(category_no.shape)
print(category_yes.shape)

(1869, 16)
(1869, 16)


In [22]:
df_down = pd.concat([category_no, category_yes], axis=0)
df_down = df_down.sample(frac=1) #shuffle
df_down['Churn'].value_counts()

No     1869
Yes    1869
Name: Churn, dtype: int64

In [23]:
X = df_down[['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']]
y = df_down['Churn']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.77      0.70      0.73       377
         Yes       0.72      0.78      0.75       371

    accuracy                           0.74       748
   macro avg       0.74      0.74      0.74       748
weighted avg       0.74      0.74      0.74       748



In [25]:
#upsampling
category_no = churnData[churnData['Churn'] == 'No']
category_yes = churnData[churnData['Churn'] == 'Yes']
category_yes = category_yes.sample(len(category_no), replace=True)
print(category_no.shape)
print(category_yes.shape)

(5174, 16)
(5174, 16)


In [26]:
df_up = pd.concat([category_yes, category_no], axis=0)
df_up = df_up.sample(frac=1) #shuffle
df_up['Churn'].value_counts()

Yes    5174
No     5174
Name: Churn, dtype: int64

In [27]:
X = df_up[['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']]
y = df_up['Churn']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.73      0.67      0.70      1019
         Yes       0.70      0.77      0.73      1051

    accuracy                           0.72      2070
   macro avg       0.72      0.72      0.72      2070
weighted avg       0.72      0.72      0.72      2070



Conclusion: 
Tomeklinks seems to perform slighly better and is almost the same as the model without any sampling is applied, while other techniques are almost the same.