In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, OneSidedSelection

In [2]:
# loading the data
churn_data = pd.read_csv("Customer-Churn.csv")
churn_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [4]:
churn_data.TotalCharges.value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: TotalCharges, Length: 6531, dtype: int64

In [5]:
churn_data["TotalCharges"] = pd.to_numeric(churn_data.TotalCharges, errors ='coerce')

In [6]:
churn_data.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
churn_data['TotalCharges'] = churn_data['TotalCharges'].replace(np.nan, churn_data['TotalCharges'].mean())

In [8]:
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']

X = pd.DataFrame(data=churn_data, columns=features)
y = pd.DataFrame(data=churn_data, columns=['Churn'])

In [9]:
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.15, random_state = 1)

In [11]:
def model(scaler, model, X_train, X_test, y_train, y_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model.fit(X_train,y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    print("Accuracy on train set: ", round(accuracy_score(y_train, pred_train),4))
    print("Accuracy on test set: ", round(accuracy_score(y_test, pred_test),4))
    print("F1 on train set: ", round(f1_score(y_train, pred_train),4))
    print("F1 on test set: ", round(f1_score(y_test, pred_test),4))

In [12]:
LogReg = LogisticRegression()
scaler = StandardScaler()

model(scaler, LogReg, X_train, X_test, y_train, y_test)

Accuracy on train set:  0.7872
Accuracy on test set:  0.8004
F1 on train set:  0.5354
F1 on test set:  0.5403


### Note: So far we have not balanced the data

#### Oversampling using SMOTE and ADASYN

In [13]:
churn_data['Churn'].value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [14]:
# this means that we have almost 4 times more "no" than "yes", so the data is imbalanced

In [15]:
smote = SMOTE(k_neighbors = 5)
X_train_SMOTE,y_train_SMOTE = smote.fit_resample(X_train,y_train)


adasyn = ADASYN(n_neighbors = 5)
X_train_ADASYN,y_train_ADASYN = adasyn.fit_resample(X_train,y_train)

In [16]:
model(scaler, LogReg, X_train_SMOTE, X_test, y_train_SMOTE, y_test)

Accuracy on train set:  0.7333
Accuracy on test set:  0.7171
F1 on train set:  0.7356
F1 on test set:  0.5635


In [17]:
model(scaler, LogReg, X_train_ADASYN, X_test, y_train_ADASYN, y_test)

Accuracy on train set:  0.7095
Accuracy on test set:  0.6821
F1 on train set:  0.7231
F1 on test set:  0.5544


In [18]:
# Both models gave have accuracy scores worse than the imbalanced model

#### Undersampling using TomekLinks and One Sided Selection

In [23]:
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train,y_train)

oss = OneSidedSelection(random_state=0)
X_train_oss, y_train_oss = oss.fit_resample(X_train,y_train)

In [24]:
model(scaler, LogReg, X_train_tomek, X_test, y_train_tomek, y_test)

Accuracy on train set:  0.7907
Accuracy on test set:  0.7786
F1 on train set:  0.6008
F1 on test set:  0.55


In [25]:
model(scaler, LogReg, X_train_oss, X_test, y_train_oss, y_test)

Accuracy on train set:  0.7903
Accuracy on test set:  0.7786
F1 on train set:  0.6008
F1 on test set:  0.55


In [None]:
# Both models perform a little better than the imbalanced model. 