In [19]:
#importing libraries
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Load the customer churn dataset and preview its first rows.
# NOTE(review): this is an absolute path on one specific machine; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
churn_csv_path = r"C:\Users\david\OneDrive\Ambiente de Trabalho\customer_churn_certoooo.csv"
churn = pd.read_csv(churn_csv_path)
churn.head(5)







Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [20]:
# We will try to predict the variable Churn using a logistic regression on the
# variables tenure, SeniorCitizen and MonthlyCharges.
# .copy() makes the subset an independent frame, so the column assignments made in
# later cells modify this frame directly rather than a slice of the loaded data
# (the situation that triggers pandas' SettingWithCopyWarning).
churn = churn[['Churn', 'tenure', 'SeniorCitizen', 'MonthlyCharges']].copy()
churn.head(3)

Unnamed: 0,Churn,tenure,SeniorCitizen,MonthlyCharges
0,No,1,0,29.85
1,No,34,0,56.95
2,Yes,2,0,53.85


In [21]:
#function to standardize headers

def standardize_headers(df):
    """Lower-case every column label and replace spaces with underscores.

    The labels are reassigned on ``df`` itself (the frame is modified in place)
    and that same frame is returned, so the call can be used either way.
    """
    df.columns = [label.lower().replace(' ', '_') for label in df.columns]
    return df



In [22]:
# Normalise the column labels (lower case, underscores instead of spaces).
churn = standardize_headers(churn)
churn.head(3)



Unnamed: 0,churn,tenure,seniorcitizen,monthlycharges
0,No,1,0,29.85
1,No,34,0,56.95
2,Yes,2,0,53.85


In [23]:
# One-hot encode churn and keep the "Yes" indicator as the 0/1 target.
# dtype=int pins the dummy columns to integers: pandas 2.0+ returns boolean dummies
# by default, which would turn the target into True/False instead of 0/1.

churn['churn'] = pd.get_dummies(churn['churn'], dtype=int)['Yes']
churn.head(3)



Unnamed: 0,churn,tenure,seniorcitizen,monthlycharges
0,0,1,0,29.85
1,0,34,0,56.95
2,1,2,0,53.85


In [24]:
#checking NA's
def check_NA(df):
    """Print the total number of missing cells, then the share of missing rows per column."""
    missing_per_column = df.isna().sum()
    print(missing_per_column.sum())
    print("\n", missing_per_column / len(df))

In [25]:
# No NA's: the check prints the total count of missing cells first, then the
# fraction of missing rows for each column.
check_NA(churn)



0

 churn             0.0
tenure            0.0
seniorcitizen     0.0
monthlycharges    0.0
dtype: float64


In [26]:
# Class balance: number of rows for each churn value (0 = stayed, 1 = churned).
churn.groupby('churn')['churn'].count()

churn
0    5174
1    1869
Name: churn, dtype: int64

In [27]:
# Baseline model: logistic regression on the original (unbalanced, unscaled) features.
X = churn.drop('churn', axis=1)
y = churn['churn']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# train_test_split is already imported in the import cell at the top of the notebook,
# so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)

# Keep the headline metrics in named variables so the models can be compared later.
accuracy_model1 = round(LR.score(X_test, y_test), 3)
precision_model1 = round(precision_score(y_test, pred), 3)
recall_model1 = round(recall_score(y_test, pred), 3)
print("Accuracy is:", accuracy_model1)
print("Precision is: ", precision_model1)
print("Recall is: ", recall_model1)
print("F1 is: ", round(f1_score(y_test, pred), 3))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Accuracy is: 0.806
Precision is:  0.691
Recall is:  0.48
F1 is:  0.566
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1036
           1       0.69      0.48      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.81      0.79      1409

[[956  80]
 [194 179]]


In [28]:
# Balance the training split with SMOTE (the test split is left untouched).
sm = SMOTE(sampling_strategy=1, random_state=0)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

# Features and target side by side, for inspecting the resampled training set.
train_smote = pd.concat((X_train_SMOTE, y_train_SMOTE), axis='columns')



In [29]:
# Check that both churn classes now have the same number of training rows.
train_smote.groupby('churn')['churn'].count()



churn
0    4138
1    4138
Name: churn, dtype: int64

In [30]:
# Refit the logistic regression on the SMOTE-balanced training data and evaluate it
# on the same held-out test split as the baseline.
LR = LogisticRegression().fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

accuracy_model2 = round(LR.score(X_test, y_test), 3)
precision_model2 = round(precision_score(y_test, pred), 3)
recall_model2 = round(recall_score(y_test, pred), 3)

print("Accuracy is:", accuracy_model2)
print("Precision is: ", precision_model2)
print("Recall is: ", recall_model2)
print("F1 is: ", round(f1_score(y_test, pred), 3))
print(classification_report(y_test, pred), confusion_matrix(y_test, pred), sep="\n")

Accuracy is: 0.745
Precision is:  0.513
Recall is:  0.759
F1 is:  0.612
              precision    recall  f1-score   support

           0       0.89      0.74      0.81      1036
           1       0.51      0.76      0.61       373

    accuracy                           0.75      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.75      0.76      1409

[[767 269]
 [ 90 283]]


In [31]:
# Normalization: rescale every predictor column to the [0, 1] range.
# Only the predictors go through the scaler. The 0/1 `churn` target is carried over
# unchanged, so it keeps its integer dtype and `scaler` is fitted on the feature
# columns alone (it can then be reused on feature-only data).
# MinMaxScaler is imported in the import cell at the top of the notebook.

feature_columns = churn.columns.drop('churn')
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(churn[feature_columns]),
    columns=feature_columns,
    index=churn.index,
)
# Re-attach the untouched target and restore the original column order.
normalized_churn = pd.concat([churn[['churn']], scaled_features], axis=1)[churn.columns]
normalized_churn.head(3)



Unnamed: 0,churn,tenure,seniorcitizen,monthlycharges
0,0.0,0.013889,0.0,0.115423
1,0.0,0.472222,0.0,0.385075
2,1.0,0.027778,0.0,0.354229


In [None]:
# Building and evaluating the logistic regression model on min-max scaled features.
# The data is split BEFORE scaling and the scaler is fitted on the training rows only,
# so the value range of the test rows cannot leak into the preprocessing step.
X = churn.drop('churn', axis=1)
y = churn['churn']  # integer 0/1 target, never passed through the scaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Learn min/max on the training split, then apply the same transformation to both splits.
# Test values outside the training range simply fall slightly outside [0, 1].
feature_scaler = MinMaxScaler()
X_train = pd.DataFrame(
    feature_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)

accuracy_model3 = round(LR.score(X_test, y_test), 3)
precision_model3 = round(precision_score(y_test, pred), 3)
recall_model3 = round(recall_score(y_test, pred), 3)
print("Accuracy is:", accuracy_model3)
print("Precision is: ", precision_model3)
print("Recall is: ", recall_model3)
print("F1 is: ", round(f1_score(y_test, pred), 3))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))