In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

Load the dataset and explore the variables.

In [2]:
# Load the customer churn dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='customer_churn.csv')

In [3]:
# Show the loaded DataFrame as the cell output (pandas elides the middle
# rows/columns of a large frame in the rendered table).
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


Using the Data

We will try to predict the variable `Churn` using logistic regression on the variables `tenure`, `SeniorCitizen`, and `MonthlyCharges`.
Extract the target variable.

In [4]:
# Keep only the three predictors used by the model plus the target column.
df = df.loc[:, ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'Churn']]

In [5]:
# Separate the feature matrix (everything except the target) from the target.
X = df.drop(columns=['Churn'])
y = df['Churn']



In [6]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
#
# The mapping is built from df['Churn'] (not from the current `y`) so the cell
# gives the same result however many times it is run. An explicit map + cast is
# used instead of chained Series.replace calls because replace only produced an
# integer column through pandas' implicit object-dtype downcasting, which is
# deprecated. Any label other than 'Yes'/'No' becomes NaN under map, and the
# integer cast then raises, so unexpected values fail loudly here rather than
# reaching the model.
y = df['Churn'].map({'Yes': 1, 'No': 0}).astype('int64')
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

Extract the independent variables and scale them.

In [7]:
# Hold out a test set using scikit-learn's default split proportion; the fixed
# seed keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [8]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
transformer = StandardScaler()
transformer.fit(X_train)

# Re-wrap the transformed values in DataFrames with the original feature names.
# The new frames get a fresh 0..n-1 index, not the original row labels.
X_train_scaled = pd.DataFrame(data=transformer.transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(data=transformer.transform(X_test), columns=X.columns)

X_train_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-0.340191,-0.435476,-0.514314
1,0.88321,2.296336,0.01784
2,-1.196572,-0.435476,-0.819594
3,1.16867,-0.435476,-1.483535
4,-0.829552,-0.435476,0.658427


In [9]:
# Give both target Series a fresh 0..n-1 index so they line up positionally
# with the scaled feature frames.
y_train, y_test = (
    labels.reset_index(drop=True) for labels in (y_train, y_test)
)

In [10]:
# Balance the training split with SMOTE, using 50 nearest neighbours when
# building synthetic samples. Only the training data is resampled; the test
# split is left untouched.
sm = SMOTE(k_neighbors=50, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_scaled, y_train)

Build the logistic regression model.

In [11]:
# Fit a logistic regression on the SMOTE-balanced training data, then score
# its predictions on the scaled test split.
LR = LogisticRegression(solver='saga', random_state=0).fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

print("precision: ", precision_score(y_true=y_test, y_pred=pred))
print("recall: ", recall_score(y_true=y_test, y_pred=pred))
print("f1: ", f1_score(y_true=y_test, y_pred=pred))

precision:  0.48175182481751827
recall:  0.712742980561555
f1:  0.5749128919860627


In [12]:
# Confusion matrix for the SMOTE-trained model
# (scikit-learn convention: rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=pred)

array([[943, 355],
       [133, 330]], dtype=int64)

In [13]:
# Re-attach the labels to the scaled features so rows can be filtered by class.
# Column-wise concat aligns on the index; both pieces carry a 0..n-1 index here.
train = pd.concat((X_train_scaled, y_train), axis="columns")
train.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,-0.340191,-0.435476,-0.514314,0
1,0.88321,2.296336,0.01784,0
2,-1.196572,-0.435476,-0.819594,1
3,1.16867,-0.435476,-1.483535,0
4,-0.829552,-0.435476,0.658427,1


In [22]:
# Split the training frame by class and show each subset's (rows, columns).
no_Churn = train.loc[train['Churn'].eq(0)]
yes_Churn = train.loc[train['Churn'].eq(1)]
display(no_Churn.shape, yes_Churn.shape)

(3876, 4)

(1406, 4)

Oversample

In [20]:
# Sample the churn rows with replacement until there are as many of them as
# there are non-churn rows.
yes_Churn_oversampled = resample(
    yes_Churn,
    replace=True,
    n_samples=no_Churn.shape[0],
    random_state=0,
)

In [23]:
# Stack the untouched non-churn rows on top of the oversampled churn rows.
train_oversampled = pd.concat((no_Churn, yes_Churn_oversampled), axis="index")
train_oversampled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,-0.340191,-0.435476,-0.514314,0
1,0.88321,2.296336,0.01784,0
3,1.16867,-0.435476,-1.483535,0
5,1.04633,-0.435476,0.503285,0
6,-0.258631,-0.435476,0.7018,0


In [24]:
# Split the oversampled training frame back into features and target.
X_train_over = train_oversampled.drop(columns='Churn').copy()
y_train_over = train_oversampled['Churn']

In [25]:
# Fit a logistic regression on the randomly oversampled training data, then
# score its predictions on the scaled test split.
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_over, y_train_over)
pred = LR.predict(X_test_scaled)

print("precision: ", precision_score(y_true=y_test, y_pred=pred))
print("recall: ", recall_score(y_true=y_test, y_pred=pred))
print("f1: ", f1_score(y_true=y_test, y_pred=pred))

precision:  0.4790764790764791
recall:  0.7170626349892009
f1:  0.57439446366782


In [26]:
# Confusion matrix for the model trained on oversampled data
# (scikit-learn convention: rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=pred)

array([[937, 361],
       [131, 332]], dtype=int64)

Undersample

In [27]:
# Draw a subset of the non-churn rows, without replacement, that is the same
# size as the churn class.
no_Churn_undersampled = resample(
    no_Churn,
    replace=False,
    n_samples=yes_Churn.shape[0],
    random_state=0,
)

In [28]:
# Stack all churn rows on top of the down-sampled non-churn rows.
train_undersampled = pd.concat((yes_Churn, no_Churn_undersampled), axis="index")
train_undersampled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
2,-1.196572,-0.435476,-0.819594,1
4,-0.829552,-0.435476,0.658427,1
12,-1.115012,-0.435476,0.630068,1
13,-1.237352,-0.435476,0.199673,1
15,-1.155792,-0.435476,0.698464,1


In [29]:
# Split the undersampled training frame back into features and target.
X_train_under = train_undersampled.drop(columns='Churn').copy()
y_train_under = train_undersampled['Churn']

In [30]:
# Fit a logistic regression on the undersampled training data, then score its
# predictions on the scaled test split.
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_under, y_train_under)
pred = LR.predict(X_test_scaled)

print("precision: ", precision_score(y_true=y_test, y_pred=pred))
print("recall: ", recall_score(y_true=y_test, y_pred=pred))
print("f1: ", f1_score(y_true=y_test, y_pred=pred))

precision:  0.47701149425287354
recall:  0.7170626349892009
f1:  0.5729076790336497


In [31]:
# Confusion matrix for the model trained on undersampled data
# (scikit-learn convention: rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=pred)

array([[934, 364],
       [131, 332]], dtype=int64)