In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by the libraries
# used below; consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

In [2]:
# Importing data
# Prefer the copy of the CSV that sits next to the notebook so the cell also
# runs on other machines; fall back to the original absolute location when
# that relative file is not found (keeps the author's setup working).
from pathlib import Path

relative_csv = Path("files_for_lab") / "customer_churn.csv"
original_csv = Path(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-imbalanced-data\files_for_lab\customer_churn.csv")
df_churn = pd.read_csv(relative_csv if relative_csv.is_file() else original_csv)
df_churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
# Remove every row that contains a missing value, then renumber the index from 0
df_churn = df_churn.dropna(axis = 0, how = "any")
df_churn = df_churn.reset_index(drop = True)
df_churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
# Encode the churn target as integers: "Yes" -> 1 and "No" -> 0
# A single vectorized map avoids writing ints into a text column through .loc,
# which only works while the column has object dtype. The 1/0 keys keep the
# cell safe to re-run once the column is already encoded; any other label
# becomes NaN and makes the int cast below fail loudly.
churn_codes = {"Yes": 1, "No": 0, 1: 1, 0: 0}
df_churn["Churn"] = df_churn["Churn"].map(churn_codes).astype("int")
df_churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,0
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,0
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,1


In [5]:
# Keep only the three predictors used by the model plus the target column
df_churn_model = df_churn.loc[:, ["tenure", "SeniorCitizen", "MonthlyCharges", "Churn"]]
df_churn_model

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,0
1,34,0,56.95,0
2,2,0,53.85,1
3,45,0,42.30,0
4,2,0,70.70,1
...,...,...,...,...
7038,24,0,84.80,0
7039,72,0,103.20,0
7040,11,0,29.60,0
7041,4,1,74.40,1


In [6]:
# Class balance: number of rows for each Churn value (0 = "No", 1 = "Yes")
df_proportion = df_churn_model.groupby(by = "Churn").count()
df_proportion

Unnamed: 0_level_0,tenure,SeniorCitizen,MonthlyCharges
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5174,5174,5174
1,1869,1869,1869


In [7]:
# Separate the feature matrix (X) from the target (y)
X = df_churn_model.drop(columns = ["Churn"])
y = df_churn_model["Churn"]
X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


In [8]:
# Rescale every feature with min-max scaling and keep the column names
# NOTE(review): the scaler is fit on every row of X, so any hold-out set carved
# from this frame has contributed to the min/max used for the scaling.
scaler = MinMaxScaler()
scaler.fit(X)
normalized_X = pd.DataFrame(scaler.transform(X), columns = X.columns)
normalized_X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,0.013889,0.0,0.115423
1,0.472222,0.0,0.385075
2,0.027778,0.0,0.354229
3,0.625000,0.0,0.239303
4,0.027778,0.0,0.521891
...,...,...,...
7038,0.333333,0.0,0.662189
7039,1.000000,0.0,0.845274
7040,0.152778,0.0,0.112935
7041,0.055556,1.0,0.558706


In [9]:
# Train-test split
# Split the raw features first and fit the scaler on the training rows only, so
# the min/max used for scaling are not informed by the test set (no leakage).
# The row partition depends only on the number of rows and random_state, so the
# same rows end up in the test set as when splitting an already-scaled frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)
train_scaler = MinMaxScaler().fit(X_train_raw)
X_train = pd.DataFrame(train_scaler.transform(X_train_raw), columns = X.columns, index = X_train_raw.index)
X_test = pd.DataFrame(train_scaler.transform(X_test_raw), columns = X.columns, index = X_test_raw.index)

# Creating the model
LR = LogisticRegression()
LR.fit(X_train, y_train)

# Computing indicators on the held-out test set
pred = LR.predict(X_test)
print("Accuracy:", LR.score(X_test, y_test))
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1:", f1_score(y_test, pred))

Accuracy: 0.7849538679914834
Precision: 0.6190476190476191
Recall: 0.4592391304347826
F1: 0.5273010920436817


In [10]:
# Per-class precision, recall and F1 of the current predictions on the test set
print(classification_report(y_true = y_test, y_pred = pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1041
           1       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



In [11]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class)
confusion_matrix(y_true = y_test, y_pred = pred)

array([[937, 104],
       [199, 169]], dtype=int64)

In [12]:
# Even a simple model will give us more than 70% accuracy. Why?
# Because the data is imbalanced: more than 70% of the rows are "No" (class 0).
count_no = df_proportion.iloc[0, 0]
count_yes = df_proportion.iloc[1, 0]
proportion = count_no / (count_no + count_yes)
proportion

0.7346301292063041

In [13]:
# Oversample the training set with SMOTE (sampling_strategy = 1 asks for a
# 1:1 minority/majority ratio); the test set is not resampled
sm = SMOTE(sampling_strategy = 1, random_state = 0)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

# Put the resampled features and target side by side for inspection
train_smote = pd.concat(objs = [X_train_SMOTE, y_train_SMOTE], axis = "columns")
train_smote

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1.000000,0.0,0.665174,0
1,0.194444,1.0,0.279602,0
2,0.986111,0.0,0.064179,0
3,0.458333,0.0,0.553731,1
4,0.652778,0.0,0.800995,1
...,...,...,...,...
8261,0.295692,1.0,0.521256,1
8262,0.472222,0.0,0.443004,1
8263,0.287489,1.0,0.483523,1
8264,0.155424,1.0,0.808766,1


In [14]:
# Refit a fresh logistic regression on the SMOTE-resampled training set
LR = LogisticRegression().fit(X_train_SMOTE, y_train_SMOTE)

# Score it on the same test set used for the first model
pred = LR.predict(X_test)
print("Accuracy:", LR.score(X_test, y_test))
for label, metric in (("Precision:", precision_score), ("Recall:", recall_score), ("F1:", f1_score)):
    print(label, metric(y_test, pred))

Accuracy: 0.7203690560681334
Precision: 0.47644927536231885
Recall: 0.7146739130434783
F1: 0.5717391304347826


In [15]:
# Per-class precision, recall and F1 of the model trained on the resampled data
print(classification_report(y_true = y_test, y_pred = pred))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79      1041
           1       0.48      0.71      0.57       368

    accuracy                           0.72      1409
   macro avg       0.68      0.72      0.68      1409
weighted avg       0.77      0.72      0.73      1409



In [16]:
# Confusion matrix of the new test predictions (rows: actual class, columns: predicted class)
confusion_matrix(y_true = y_test, y_pred = pred)

array([[752, 289],
       [105, 263]], dtype=int64)

In [17]:
# Is there any improvement?
# Out of all the actual Yes's, this model identified more of them than before (Recall increased).
for finding in ("Accuracy decreased", "Precision decreased", "Recall increased", "F1 increased"):
    print(finding)

Accuracy decreased
Precision decreased
Recall increased
F1 increased
