In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

### Load the dataset and explore the variables

In [13]:
df = pd.read_csv (r"C:\Users\fabi_\OneDrive\Estudos e Cursos\Data analytics\Ironhack.Labs\lab-imbalanced-data\files_for_lab\customer_churn.csv")
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [14]:
df.isna().sum().sum()

0

In [15]:
df.duplicated().sum()

0

In [16]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [17]:
df.columns = df.columns.str.lower()
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

## Predict variable Churn 

Use a logistic regression on variables Tenure, SeniorCitizen, MonthlyCharges

#### Extract the target variable

In [18]:
df.groupby('churn').count()
# Definition: Churn is a measurement of the percentage of accounts that cancel or choose not to renew their subscriptions. 

Unnamed: 0_level_0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
No,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174
Yes,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869


#### Extract the independent variables

In [20]:
churn = df[['churn', 'tenure', 'seniorcitizen','monthlycharges']]
churn

Unnamed: 0,churn,tenure,seniorcitizen,monthlycharges
0,No,1,0,29.85
1,No,34,0,56.95
2,Yes,2,0,53.85
3,No,45,0,42.30
4,Yes,2,0,70.70
...,...,...,...,...
7038,No,24,0,84.80
7039,No,72,0,103.20
7040,No,11,0,29.60
7041,Yes,4,1,74.40


#### X and y split

In [21]:
X = churn.drop('churn',axis = 1)
y = churn['churn']

## Build the logistic regression model

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)

LR = LogisticRegression()
LR.fit(X_train, y_train)

LR.score(X_test, y_test)

0.8055358410220014

## Evaluate the model

In [29]:
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred, pos_label='Yes'))
print("recall: ",recall_score(y_test,pred, pos_label='Yes'))
print("f1: ",f1_score(y_test,pred, pos_label='Yes'))


precision:  0.6911196911196911
recall:  0.47989276139410186
f1:  0.5664556962025317


In [30]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          No       0.83      0.92      0.87      1036
         Yes       0.69      0.48      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.81      0.79      1409



Out of all customers predicted 'yes' to churn, 69% of the predictions were correct.

Out of all customers that actually left (yes), only 48% were caught.

In [31]:
confusion_matrix(y_test,pred)

array([[956,  80],
       [194, 179]], dtype=int64)

#### Even a simple model will give us more than 70% accuracy. Why?

Because the number of customers who don't leave is much higher. Since the data is imbalanced, the model can't clearly predict wich customer will leave.

## SMOTE

In [33]:
sm = SMOTE(random_state =0,sampling_strategy=1) # one means that I want a ratio of 1 between majority and abudance classe. # 0.5 Means my minority class will be half has big as my majority class
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

In [34]:
train_smote = pd.concat([X_train_SMOTE,y_train_SMOTE], axis = 1)
train_smote
train_smote.sum()

tenure                                                       231063
seniorcitizen                                                  1179
monthlycharges                                        562821.518704
churn             NoNoYesYesNoNoNoNoYesNoYesNoNoNoNoNoNoNoNoNoNo...
dtype: object

In [36]:
LR = LogisticRegression()
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred, pos_label='Yes'))
print("recall: ",recall_score(y_test,pred, pos_label='Yes'))
print("f1: ",f1_score(y_test,pred, pos_label='Yes'))

print(classification_report(y_test, pred))

precision:  0.5126811594202898
recall:  0.7587131367292225
f1:  0.6118918918918917
              precision    recall  f1-score   support

          No       0.89      0.74      0.81      1036
         Yes       0.51      0.76      0.61       373

    accuracy                           0.75      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.75      0.76      1409



Out of all customers that actually left (yes), now 76% were caught!!!

In [37]:
confusion_matrix(y_test,pred)

array([[767, 269],
       [ 90, 283]], dtype=int64)

767 customers were predicted to no_churn, and were really negative.

269 customers were predicted to yes_churn, but were actually negative.

The predictions above are not as precise as before. But this won't affect the business.

Now,

293 customers were predicted to yes_churn, and were really positive. This has improved considerably!

Only 90 customers were predicted to no_churn, but were actually positive. This has also improved considerably!