In [1]:
# Installing all the necessary packages

In [2]:
!pip install imblearn



In [3]:
import imblearn
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Importing the file I'm going to work with
data = pd.read_csv('files_for_lab/customer_churn.csv')
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [5]:
# Making column names lower case
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Exploring the data

In [7]:
# First, checking the types of the columns - we are going to predict the Churn from the only 3 numerical columns:
# - SeniorCitizen
# - tenure
# - MonthlyCharges
data.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [8]:
# Checking for NaNs - lucikly none!
data.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [9]:
# Creating new dataframe using only the variables and target suggested
data1 = data[['seniorcitizen', 'tenure', 'monthlycharges', 'churn']]
data1

Unnamed: 0,seniorcitizen,tenure,monthlycharges,churn
0,0,1,29.85,No
1,0,34,56.95,No
2,0,2,53.85,Yes
3,0,45,42.30,No
4,0,2,70.70,Yes
...,...,...,...,...
7038,0,24,84.80,No
7039,0,72,103.20,No
7040,0,11,29.60,No
7041,1,4,74.40,Yes


In [10]:
for col in data1.columns:
    print(data1[col].value_counts())
    
# print(data1[col].value_counts()) for col in data1.columns     gives error

0    5901
1    1142
Name: seniorcitizen, dtype: int64
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64
20.05     61
19.85     45
19.95     44
19.90     44
20.00     43
          ..
23.65      1
114.70     1
43.65      1
87.80      1
78.70      1
Name: monthlycharges, Length: 1585, dtype: int64
No     5174
Yes    1869
Name: churn, dtype: int64


In [11]:
# Extract the target variable and the independent variables
X = pd.get_dummies(data1.drop('churn', axis=1))
y = data1['churn']


In [12]:
# Scale independent variables
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

scaled_X = scale.fit_transform(X)

In [13]:
# Build Logistic Regression model after train/test split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

model = LogisticRegression(random_state=40, max_iter=10000)
model.fit(X_train, y_train)

# Checking its score
# The score is already over 70% because of the imbalanced data. If a dataset is not balanced, even if predicting only 
# the majority value, it will have a good score.
predictions = model.predict(X_test)
model.score(X_test, y_test)

0.7785663591199432

In [17]:
# SMOTE
# Because the data is imbalanced, we're trying to make it more even with this method, by populating the dataset
# with data from the minority class.

from imblearn.over_sampling import SMOTE
smote = SMOTE()

X_train, y_train = smote.fit_resample(X_train, y_train)

y_train.value_counts()
# y_test.value_counts() # Remember: we DO NOT oversample the test data

No     4140
Yes    4140
Name: churn, dtype: int64

In [18]:
model = LogisticRegression(random_state = 40, max_iter = 1000)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
model.score(X_test, y_test)

# The model was actually better before, with 7% better score

0.7083037615330021

In [24]:
# TOMEK LINKS

from imblearn.under_sampling import TomekLinks
tomek = TomekLinks()

X = pd.get_dummies(data1.drop('churn', axis=1))
y = data1['churn']

X_tl, y_tl = tomek.fit_resample(X, y)

y_tl.value_counts()

No     4712
Yes    1869
Name: churn, dtype: int64

In [25]:
model = LogisticRegression(random_state = 40, max_iter = 1000)
model.fit(X_tl, y_tl)

predictions = model.predict(X_tl)
model.score(X_tl, y_tl)

# Nice! 2% better score than the first model

0.791369092843033