In [8]:
# Importing the required libraries 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import plot_confusion_matrix

In [2]:
#Reading that data into Python and calling the dataframe
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
#Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type. thus, we have to Convert this column into numeric type using pd.to_numeric function

churnData.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [4]:
 
# Convert 'TotalCharges' column into numeric type using pd.to_numeric function.

churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors = 'coerce')

# converting churn to 0 and 1, logistic regression takes only numerics
churnData['Churn'] = churnData['Churn'].replace({'No': 0, 'Yes': 1})

In [23]:
#Checking for null values in the dataframe and Replacing the null values
churnData.isnull().sum()/len(data)  


gender              0.0
SeniorCitizen       0.0
Partner             0.0
Dependents          0.0
tenure              0.0
PhoneService        0.0
OnlineSecurity      0.0
OnlineBackup        0.0
DeviceProtection    0.0
TechSupport         0.0
StreamingTV         0.0
StreamingMovies     0.0
Contract            0.0
MonthlyCharges      0.0
TotalCharges        0.0
Churn               0.0
dtype: float64

In [5]:
# Check for null values in the dataframe. Replace the null values.
# churnData.isna().sum() # only total charges has null values
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean())
# Replaced null values with mean

In [9]:
# Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:
numerical = churnData._get_numeric_data()
X = numerical.drop(labels='Churn', axis=1)
y = numerical['Churn']


# Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Scale the features either by using normalizer or a standard scaler.
transformer = MinMaxScaler().fit(X_train)
X_normalized_tr = transformer.transform(X_train)
X_train_normalized = pd.DataFrame(X_normalized_tr, columns=X_train.columns)

transformer = MinMaxScaler().fit(X_test)
X_normalized_te = transformer.transform(X_test)
X_test_normalized = pd.DataFrame(X_normalized_te, columns=X_test.columns)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# Fit a logistic regression model on the training data.
model_a = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train_normalized, y_train)


# Check the accuracy on the test data.
pred = model_a.predict(X_test_normalized)
print("accuracy: ",model_a.score(X_test_normalized, y_test))

accuracy:  0.8041163946061036


# Managing imbalance in the dataset

In [11]:
# Check for the imbalance in the dataset.
# Checking confusion matrix
display(confusion_matrix(y_test, pred))
# accuraccy is 80% but we have a lot of imbalances in the data, to imporve that i am usi

array([[960,  76],
       [200, 173]], dtype=int64)

# Oversampling

In [12]:
train = pd.concat([X_train_normalized, y_train],axis=1)
train.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.291667,0.464375,0.152089,0
1,0.0,0.75,0.786746,0.589736,0
2,0.0,0.013889,0.051819,0.000537,1
3,0.0,0.055556,0.517688,0.025288,1
4,0.0,0.0,0.434978,0.261309,0


In [13]:
from sklearn.utils import resample

churn_0 = train[train['Churn'] == 0]
churn_1 = train[train['Churn'] == 1]

In [14]:
churn_1_oversampled = resample(churn_1,replace=True,n_samples = len(churn_0))
print(churn_0.shape)
print(churn_1_oversampled.shape)

(4138, 5)
(4138, 5)


In [16]:
data_upsampled = pd.concat([churn_0, churn_1_oversampled], axis=0)
data_upsampled['Churn'].value_counts()


y_train_over = data_upsampled['Churn']
X_train_over = data_upsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [17]:
LR_over = LogisticRegression(max_iter=1000)
LR_over.fit(X_train_over, y_train_over)
pred_over = LR_over.predict(X_test_normalized)
print("accuracy: ",LR_over.score(X_test_normalized, y_test))
print("precision: ",precision_score(y_test,pred_over))
print("recall: ",recall_score(y_test,pred_over))
print("f1: ",f1_score(y_test,pred_over))

confusion_matrix(y_test,pred_over)

accuracy:  0.3179559971611072
precision:  0.23321234119782214
recall:  0.6890080428954424
f1:  0.34847457627118644


array([[191, 845],
       [116, 257]], dtype=int64)

# Under sampling

In [18]:
churn_0_undersampled = resample(churn_0, replace=False,n_samples = len(churn_1))
print(churn_0_undersampled.shape)
print(churn_1.shape)

(1496, 5)
(1496, 5)


In [20]:
data_downsampled = pd.concat([churn_0_undersampled, churn_1], axis=0)
data_downsampled['Churn'].value_counts()

y_train_under = data_downsampled['Churn']
X_train_under= data_downsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [21]:
from sklearn.linear_model import LogisticRegression

LR2 = LogisticRegression(max_iter=1000)
LR2.fit(X_train_under, y_train_under)
pred2 = LR2.predict(X_test_normalized)
print("accuracy: ",LR2.score(X_test_normalized, y_test))
print("precision: ",precision_score(y_test,pred2))
print("recall: ",recall_score(y_test,pred2))
print("f1: ",f1_score(y_test,pred2))
confusion_matrix(y_test,pred2)

accuracy:  0.3747338537970192
precision:  0.24701195219123506
recall:  0.6648793565683646
f1:  0.36020334059549747


array([[280, 756],
       [125, 248]], dtype=int64)

# SMOTE

In [22]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=100,k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_normalized, y_train)
X_train_SMOTE.shape

from sklearn.linear_model import LogisticRegression

LR4 = LogisticRegression(max_iter=1000)
LR4.fit(X_train_SMOTE, y_train_SMOTE)
pred3 = LR4.predict(X_test_normalized)
print("accuracy: ",LR4.score(X_test_normalized, y_test))
print("precision: ",precision_score(y_test,pred3))
print("recall: ",recall_score(y_test,pred3))
print("f1: ",f1_score(y_test,pred3))

confusion_matrix(y_test,pred3)

accuracy:  0.7452093683463449
precision:  0.512589928057554
recall:  0.7640750670241286
f1:  0.6135629709364908


array([[765, 271],
       [ 88, 285]], dtype=int64)