In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# 1. Load the dataset and explore the variables.

In [2]:
# Read the raw churn export; the path is relative to the notebook's working directory.
c_churn = pd.read_csv("customer_churn.csv")
# Bare last expression so the notebook renders the frame (the display is truncated for long/wide frames).
c_churn


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
# Column labels as a plain Python list, for a quick look at the schema.
c_churn.columns.tolist()

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [4]:
# Normalise the headers: lower-case them and turn spaces into underscores,
# so every later cell can rely on a single spelling.
c_churn.columns = c_churn.columns.str.lower().str.replace(' ', '_', regex=False)
c_churn.columns


Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [5]:
# Inspect the column dtypes. Note that totalcharges is reported as object, not as a numeric dtype.
c_churn.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

# Split the Dataset into X ('tenure', 'SeniorCitizen', 'MonthlyCharges') and y ('Churn')

In [6]:
# Feature matrix: the three numeric predictors named in the task, selected
# explicitly instead of dropping every other column by name. An explicit
# selection cannot silently pick up a column that was not on the drop list.
# The order (seniorcitizen, tenure, monthlycharges) matches the original column
# order, which matters because later cells address the scaled columns by position.
feature_columns = ['seniorcitizen', 'tenure', 'monthlycharges']
X = c_churn[feature_columns]
# Target: the churn label ('Yes' / 'No').
y = c_churn['churn']

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1337,
)
# Keep only the numeric feature columns, which are the ones that get scaled.
X_train_num = X_train.select_dtypes(include="number")


In [8]:
# Scaling data
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum from the training features only.
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_normalized = transformer.transform(X_train_num)
# Wrap the scaled array in a DataFrame; no labels are passed, so the columns
# get the default positional labels 0, 1, 2 and the rows a fresh 0..n-1 index.
X_train_norm = pd.DataFrame(data=X_train_normalized)

X_train_norm

Unnamed: 0,0,1,2
0,1.0,0.125000,0.658209
1,0.0,0.333333,0.014428
2,0.0,0.888889,0.624876
3,0.0,0.527778,0.019403
4,0.0,0.222222,0.014428
...,...,...,...
5629,0.0,0.222222,0.310448
5630,0.0,1.000000,0.673134
5631,0.0,0.361111,0.015423
5632,0.0,0.555556,0.381592


In [9]:
# The previous `X_train_norm.columns = X_train_norm.columns` assigned the labels
# to themselves and changed nothing, so it has been removed. The scaled frame keeps
# its positional labels 0, 1, 2, which follow the column order of X_train_num.
# NOTE(review): if named columns were intended, X_train_num.columns would have to be
# applied to the scaled train AND test frames together so the two stay consistent.
X_train_norm

Unnamed: 0,0,1,2
0,1.0,0.125000,0.658209
1,0.0,0.333333,0.014428
2,0.0,0.888889,0.624876
3,0.0,0.527778,0.019403
4,0.0,0.222222,0.014428
...,...,...,...
5629,0.0,0.222222,0.310448
5630,0.0,1.000000,0.673134
5631,0.0,0.361111,0.015423
5632,0.0,0.555556,0.381592


In [10]:
# Second name for the same scaled training frame (an alias, not a copy).
# NOTE(review): this name is not referenced by any later cell visible here — confirm it is still needed.
X_train_transformed = X_train_norm

# Build the logistic regression model.

In [11]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier fitted on the scaled training features.
# `multi_class='auto'` is no longer passed: 'auto' was the default value, and the
# parameter is deprecated in recent scikit-learn releases, so omitting it keeps
# the same model while avoiding a deprecation warning (or a future error).
classification = LogisticRegression(solver='lbfgs').fit(X_train_norm, y_train)

classification

LogisticRegression()

In [12]:
# Scale the test features with the scaler fitted on the training data (no refit,
# so no information from the test set leaks into the scaling).
# Select exactly the columns the scaler was fitted on, in the same order, to
# mirror the numeric-column selection that was applied to the training side.
X_test_normalized = transformer.transform(X_test[X_train_num.columns])
# As on the training side, the columns get positional labels 0, 1, 2.
X_test_norm = pd.DataFrame(X_test_normalized)

In [13]:
# The previous `X_test_norm.columns = X_test_norm.columns` assigned the labels
# to themselves and changed nothing, so it has been removed. The scaled test frame
# keeps its positional labels 0, 1, 2.
X_test_norm

Unnamed: 0,0,1,2
0,0.0,0.013889,0.259701
1,0.0,0.138889,0.474129
2,0.0,0.125000,0.020896
3,0.0,0.986111,0.063682
4,0.0,0.208333,0.204478
...,...,...,...
1404,0.0,0.250000,0.018408
1405,0.0,0.694444,0.850249
1406,0.0,0.708333,0.066667
1407,0.0,0.027778,0.022886


In [14]:
# Sanity check: the scaled train and test frames expose the same column labels in the same order.
X_train_norm.columns.tolist() == X_test_norm.columns.tolist()


True

# Evaluate the model.

In [15]:
# Predict churn labels for the held-out test rows with the baseline model.
predictions = classification.predict(X_test_norm)
# Score of the baseline model on the test set (displayed as the cell output).
classification.score(X_test_norm, y_test)

0.7863733144073811

In [16]:
# How many test rows the baseline model assigns to each label.
pd.Series(predictions).value_counts()

No     1142
Yes     267
dtype: int64

In [17]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the baseline model on the test set.
# Rows are the true labels and columns the predicted labels, in the order ('No', 'Yes').
confusion_matrix(y_test, predictions)

array([[937,  96],
       [205, 171]])

In [18]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Baseline predictions on the test set.
pred = classification.predict(X_test_norm)

# Report each metric with 'Yes' (the customer churned) as the positive class.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label="Yes"))

precision:  0.6404494382022472
recall:  0.45478723404255317
f1:  0.5318818040435459


# Even a simple model will give us more than 70% accuracy. Why?


In [39]:
# Because the dataset is imbalanced: about 73% of the customers are "No churn", so a model that predicts "No" for every customer would already score about 73% accuracy.

# SMOTE

In [19]:
from imblearn.over_sampling import SMOTE

In [20]:
# Balance the training set by synthesising extra minority-class rows.
# Only the training data is resampled; the test set is left as it is.
# NOTE(review): column 0 holds only 0/1 values before resampling, and SMOTE may
# create in-between values for it — confirm this is acceptable.
sm = SMOTE(k_neighbors=3, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_norm, y_train)

In [21]:
# Size of the SMOTE-balanced training features (rows, columns).
X_train_SMOTE.shape

(8282, 3)

In [22]:
from sklearn.linear_model import LogisticRegression

# Fit a fresh logistic regression on the SMOTE-balanced training data and
# predict on the untouched test set.
LR = LogisticRegression(solver='lbfgs', random_state=0)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_norm)

# Report each metric with 'Yes' (the customer churned) as the positive class.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label="Yes"))

precision:  0.49818840579710144
recall:  0.7313829787234043
f1:  0.5926724137931035


In [23]:
# Confusion matrix of the SMOTE-trained model on the test set (rows: true label, columns: predicted label).
confusion_matrix(y_test,pred)

array([[756, 277],
       [101, 275]])

In [24]:
# Accuracy of the SMOTE-trained model on the test set. It is computed from the
# model itself rather than from counts copied by hand out of the confusion
# matrix, so it stays correct if the data, the split, or the model changes.
LR.score(X_test_norm, y_test)

0.7317246273953159

# Oversampling

In [25]:
from sklearn.utils import resample

In [26]:
# The scaled feature frames were rebuilt from arrays and therefore carry a fresh
# 0..n-1 row index, while the label Series come straight from the split. Reset
# the labels to a 0..n-1 index as well so that the column-wise concat in the
# next cell lines the rows up.
y_train = y_train.reset_index(drop=True)

y_test = y_test.reset_index(drop=True)

In [27]:
# Put the scaled features and their labels side by side in a single frame,
# so that rows can be filtered and resampled per class.
train = pd.concat(objs=[X_train_norm, y_train], axis="columns")
train.head()

Unnamed: 0,0,1,2,churn
0,1.0,0.125,0.658209,Yes
1,0.0,0.333333,0.014428,No
2,0.0,0.888889,0.624876,No
3,0.0,0.527778,0.019403,No
4,0.0,0.222222,0.014428,No


In [28]:
# Missing values per column of the combined training frame.
train.isna().sum()

0        0
1        0
2        0
churn    0
dtype: int64

In [29]:
# separate majority/minority classes
no_churn = train[train['churn']=='No']
yes_churn = train[train['churn']=='Yes']

In [30]:
# Missing values per column among the 'Yes' rows.
yes_churn.isna().sum()

0        0
1        0
2        0
churn    0
dtype: int64

In [31]:
# Show the size of each class (rows, columns): 'No' first, then 'Yes'.
display(no_churn.shape, yes_churn.shape)

(4141, 4)

(1493, 4)

In [32]:
# Upsample the 'Yes' rows: draw with replacement (there are fewer of them than
# 'No' rows) until both classes have the same number of rows. The fixed seed
# makes the draw repeatable.
yes_churn_oversampled = resample(
    yes_churn,
    n_samples=len(no_churn),
    replace=True,
    random_state=0,
)

In [33]:
# The 'Yes' frame now has as many rows as the 'No' frame.
display(no_churn.shape, yes_churn_oversampled.shape)
# Look at the first 20 resampled rows.
yes_churn_oversampled.head(20)

(4141, 4)

(4141, 4)

Unnamed: 0,0,1,2,churn
2650,0.0,0.902778,0.868159,Yes
2208,0.0,0.125,0.265672,Yes
4666,0.0,0.583333,0.755721,Yes
3197,1.0,0.527778,0.387065,Yes
2933,0.0,0.041667,0.697512,Yes
5247,1.0,0.027778,0.510945,Yes
3978,0.0,0.25,0.774129,Yes
1083,1.0,0.236111,0.579104,Yes
2364,1.0,0.055556,0.558706,Yes
4236,0.0,0.75,0.867164,Yes


In [34]:
# Stack the 'No' rows and the upsampled 'Yes' rows into one balanced training frame.
train_oversampled = pd.concat([no_churn, yes_churn_oversampled], axis=0)
# Check that the combined frame has no missing values. (A `.head()` call that
# used to sit in the middle of this cell was never displayed and has been removed.)
train_oversampled.isna().sum()

0        0
1        0
2        0
churn    0
dtype: int64

In [35]:
# Split the balanced frame back into features and target; the copies keep
# both independent of train_oversampled.
X_train_over = train_oversampled.drop(columns='churn').copy()
y_train_over = train_oversampled['churn'].copy()
# Check that the test labels contain no missing values.
y_test.isna().sum()

0

In [38]:
# Refit the logistic regression on the randomly oversampled training set and
# evaluate it on the untouched test set. Balancing the classes is meant to help
# the model detect customers who churn ("Yes"), i.e. to raise recall compared
# with the baseline, usually at some cost in precision.
# NOTE(review): the metrics saved under this cell are digit-for-digit identical
# to the SMOTE run, and this cell's execution count is out of order, so the
# saved output may be stale. Restart the kernel and run all cells to confirm.

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_over, y_train_over)
pred = LR.predict(X_test_norm)

print("precision: ",precision_score(y_test,pred, pos_label="Yes"))
print("recall: ",recall_score(y_test,pred, average="binary", pos_label="Yes"))
print("f1: ",f1_score(y_test,pred, pos_label="Yes"))

precision:  0.49818840579710144
recall:  0.7313829787234043
f1:  0.5926724137931035


In [37]:

# Confusion matrix for the current `pred` on the test set (rows: true label, columns: predicted label).
# When the cells run in order, `pred` comes from the model trained on the oversampled data.
confusion_matrix(y_test,pred)

array([[756, 277],
       [101, 275]])

In [None]:

# The labels are the strings 'No' / 'Yes', so the positive class must be named
# explicitly: the default pos_label=1 is not one of the labels and makes these
# metric functions raise a ValueError. 'Yes' (the customer churned) is the
# positive class, as in the earlier metric cells.
print("precision: ",precision_score(y_test,pred, pos_label="Yes"))
print("recall: ",recall_score(y_test,pred, pos_label="Yes"))
print("f1: ",f1_score(y_test,pred, pos_label="Yes"))
