In [16]:
import imblearn
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report,accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

### 1 Load the dataset and explore the variables.

In [2]:
# Raw customer-churn table. The location below is specific to the author's
# machine and has to be adapted when the notebook is run elsewhere.
CSV_PATH = r"C:\Users\Quaresma\Documents\IRONHACK\Labs\10 lab-imbalanced-data\files_for_lab\customer_churn.csv"
data = pd.read_csv(CSV_PATH)
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
# Normalise the column labels: lower-case, with spaces replaced by underscores.
col = [label.lower().replace(" ", "_") for label in data.columns]
data.columns = col
data

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


### 2 We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [4]:
# Keep only the three predictors and the target.
# The explicit .copy() makes `data` an independent frame rather than a slice of
# the full table, so later cells can modify it without running into pandas'
# chained-assignment (SettingWithCopy) ambiguity.
data = data[['tenure', 'seniorcitizen', 'monthlycharges', 'churn']].copy()
data

Unnamed: 0,tenure,seniorcitizen,monthlycharges,churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.30,No
4,2,0,70.70,Yes
...,...,...,...,...
7038,24,0,84.80,No
7039,72,0,103.20,No
7040,11,0,29.60,No
7041,4,1,74.40,Yes


In [5]:
# Encode the target as integers: 1 = churned ('Yes'), 0 = everything else.
# Matching the raw label 'Yes' as well as an already encoded 1 keeps this cell
# idempotent: re-running it no longer turns every label into 0.
# assign() builds a new frame instead of writing into a possible slice.
data = data.assign(churn=data['churn'].isin(['Yes', 1]).astype(int))

In [6]:
# Number of rows per target class, to see how imbalanced the data is.
class_counts = data.groupby('churn').count()
class_counts

Unnamed: 0_level_0,tenure,seniorcitizen,monthlycharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5174,5174,5174
1,1869,1869,1869


### 3 Extract the target variable.

In [7]:
# Target vector: the encoded churn column.
data_t = data.loc[:, 'churn']

### 4 Extract the independent variables and scale them.

In [8]:
# Independent variables: every column except the target.
data_i = data.drop(columns='churn')

# Min-max scale each feature.
# NOTE: the scaler is fitted on all rows, i.e. before the train/test split that
# happens further down, so test rows contribute to the min/max used for scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_i)

# fit_transform returns a bare NumPy array. Rebuild the frame with the original
# column names AND the original index, so that a later column-wise concat with
# the target aligns row by row even if rows were filtered out beforehand.
normalized_data = pd.DataFrame(scaled_values, columns=data_i.columns, index=data_i.index)
normalized_data

Unnamed: 0,tenure,seniorcitizen,monthlycharges
0,0.013889,0.0,0.115423
1,0.472222,0.0,0.385075
2,0.027778,0.0,0.354229
3,0.625000,0.0,0.239303
4,0.027778,0.0,0.521891
...,...,...,...
7038,0.333333,0.0,0.662189
7039,1.000000,0.0,0.845274
7040,0.152778,0.0,0.112935
7041,0.055556,1.0,0.558706


### 5 Build the logistic regression model.

In [9]:
# Put the scaled features and the target back together, column-wise.
data_all = pd.concat(objs=[normalized_data, data_t], axis='columns')
data_all

Unnamed: 0,tenure,seniorcitizen,monthlycharges,churn
0,0.013889,0.0,0.115423,0
1,0.472222,0.0,0.385075,0
2,0.027778,0.0,0.354229,1
3,0.625000,0.0,0.239303,0
4,0.027778,0.0,0.521891,1
...,...,...,...,...
7038,0.333333,0.0,0.662189,0
7039,1.000000,0.0,0.845274,0
7040,0.152778,0.0,0.112935,0
7041,0.055556,1.0,0.558706,1


In [22]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
X = data_all.drop(columns=['churn'])
y = data_all.loc[:, 'churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Baseline model: logistic regression fitted on the original (imbalanced)
# training split, with the iteration cap set to 1000.
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train, y_train)

# Class predictions for the held-out test split. The former bare
# LR.score(X_test, y_test) call was dropped: its result was neither displayed
# nor stored, and accuracy is computed from `pred` afterwards.
pred = LR.predict(X_test)



### 6 Evaluate the model.

In [17]:
# Headline metrics for the positive (churn = 1) class, printed in a fixed order.
headline_metrics = (
    ("Accuracy is:", accuracy_score),
    ("Precision is:", precision_score),
    ("Recall is:", recall_score),
    ("F1 is:", f1_score),
)
for caption, metric in headline_metrics:
    print(caption, metric(y_test, pred))

# Per-class breakdown.
print("Classification report:")
print(classification_report(y_test, pred))


# Rows are the true classes, columns the predicted classes.
print("Confusion Matrix:")
confusion_matrix(y_test, pred)

Accuracy is: 0.8041163946061036
Precision is: 0.6932270916334662
Recall is: 0.46648793565683644
F1 is: 0.5576923076923076
Classification report:
              precision    recall  f1-score   support

           0       0.83      0.93      0.87      1036
           1       0.69      0.47      0.56       373

    accuracy                           0.80      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Confusion Matrix:


array([[959,  77],
       [199, 174]], dtype=int64)

### 7 Even a simple model will give us more than 70% accuracy. Why?

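It gives us more than 70% accuracy because we are dealing with imbalanced data: about 73% of the customers (5174 of 7043) did not churn, so even a model that always predicts "No" would reach roughly 73% accuracy. The model is therefore biased towards the majority class, which shows in the low recall (0.47) for the churn class.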

### 8 Synthetic Minority Oversampling Technique (SMOTE) is an oversampling technique based on nearest neighbors that adds new points between existing points. Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic regression model. Is there any improvement?

In [18]:
# By default the SMOUT method use the 5 Neareste neighbours
sm= SMOTE(random_state=0, sampling_strategy= 1.0 ) # we open SMOTE MODEL 
X_train_SMOTE, y_train_SMOTE= sm.fit_resample(X_train, y_train)

#Now we need to create the model 
train_smote= pd.concat([X_train_SMOTE,y_train_SMOTE], axis= 1)

LR= LogisticRegression(max_iter = 1000)
LR.fit(X_train_SMOTE,y_train_SMOTE)
LR.score(X_train_SMOTE,y_train_SMOTE)

pred= LR.predict(X_test) 

print("Accuracy is:", accuracy_score(y_test,pred))
print("Precision is:", precision_score(y_test,pred))
print("Recall is:", recall_score(y_test,pred))
print("F1 is:", f1_score(y_test,pred))
print("Classification report:")
print(classification_report(y_test,pred))


print("Confusion Matrix:")
confusion_matrix(y_test,pred)

Accuracy is: 0.7437899219304471
Precision is: 0.5107142857142857
Recall is: 0.7667560321715817
F1 is: 0.6130760986066451
Classification report:
              precision    recall  f1-score   support

           0       0.90      0.74      0.81      1036
           1       0.51      0.77      0.61       373

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.76      1409

Confusion Matrix:


array([[762, 274],
       [ 87, 286]], dtype=int64)

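 Compared with the baseline, the model trained on SMOTE-resampled data has lower accuracy (0.74 vs 0.80) and precision (0.51 vs 0.69), but much higher recall for the churn class (0.77 vs 0.47) and a better F1 score (0.61 vs 0.56).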

In [19]:
# Class balance of the SMOTE-resampled training set (both classes are expected
# to have the same number of rows after resampling with sampling_strategy=1.0).
train_smote.groupby('churn').count()

Unnamed: 0_level_0,tenure,seniorcitizen,monthlycharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4138,4138,4138
1,4138,4138,4138
