In [25]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [26]:
# Location of the churn CSV. Set the CHURN_DATA_PATH environment variable to run
# the notebook on another machine; the original absolute path is kept as the
# fallback so existing setups keep working unchanged.
CHURN_DATA_PATH = os.environ.get(
    'CHURN_DATA_PATH',
    '/Users/kb/Ironhack/Lesson/data_3.09_activities/customer_churn.csv',
)
churnData = pd.read_csv(CHURN_DATA_PATH)
# Preview the first five rows.
churnData.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [27]:
# Class distribution of the target column 'Churn'.
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

As you can see, there is a clear imbalance between the two categories: roughly 73% of the customers are 'No' and only about 27% are 'Yes'.

In [28]:
# Baseline: logistic regression on three numeric features of the original
# (imbalanced) data.
numericData = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
# Target as a one-column DataFrame (kept for later use; not needed below).
Y = pd.DataFrame(data=churnData, columns=['Churn'])

# Standardize the features; fit and transform use the same selection.
transformer = StandardScaler().fit(numericData)
scaled_x = transformer.transform(numericData)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default is equivalent.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(
    scaled_x, churnData['Churn'])

# Accuracy on the same rows the model was fitted on (there is no hold-out set).
classification.score(scaled_x, churnData['Churn'])

0.7911401391452506

With this imbalance, when we tried to fit the model (without any data cleaning) it still gave us an accuracy of nearly 80%

Note: Even if we blindly predicted 'No' for every customer, we would still get an accuracy of about 73%:

In [29]:
# Accuracy of always predicting the majority class ('No'), computed from the
# data rather than from hard-coded class counts.
churnData['Churn'].value_counts(normalize=True)['No']

0.7346301292063041

In [30]:
# Let's increase the imbalance and see how the model behaves

In [31]:
# Split the customers by class.
yes = churnData[churnData['Churn']=='Yes']
no = churnData[churnData['Churn']=='No']
# Keep only 500 churners to exaggerate the imbalance. The fixed seed makes the
# same 500 rows come out on every run.
yes = yes.sample(500, random_state=0)

In [34]:
# Manual under-sampling: keep every churner and draw an equally sized random
# subset of the non-churners.
yes = churnData[churnData['Churn']=='Yes']
data1=yes
# Size the draw from the minority subset itself instead of `value_counts()[1]`,
# which looked the count up by position on a label-indexed Series and depended
# on the sort order of the counts. The seed makes the draw reproducible.
data2=no.sample(len(data1), random_state=0)

In [35]:
# (rows, columns) of the minority-class subset.
data1.shape

(1869, 21)

In [36]:
# (rows, columns) of the down-sampled majority-class subset.
data2.shape

(1869, 21)

In [37]:
# Stack the two equally sized class subsets row-wise into one frame.
data_final = pd.concat(objs=[data1, data2], axis='index')

In [38]:
# (rows, columns) of the combined frame.
data_final.shape

(3738, 21)

In [39]:
# Rebuild the full per-class subsets from the original data.
churn_label = churnData['Churn']
yes = churnData.loc[churn_label == 'Yes']
no = churnData.loc[churn_label == 'No']

In [40]:
# Over-sample the churners (with replacement) up to the size of the 'No' class.
# The target size is read from the data instead of the hard-coded 5174, and the
# seed makes the draw reproducible.
n_majority = int((churnData['Churn'] == 'No').sum())
yesinc = yes.sample(n_majority, replace=True, random_state=0)

In [11]:
# Build the exaggerated-imbalance dataset: 500 churners against all non-churners.
# The 500-row sample is drawn here, with a fixed seed, so the cell gives the same
# class counts no matter how many rows an earlier cell left in `yes`.
data = pd.concat([yes.sample(500, random_state=0), no], axis=0)
print(data['Churn'].value_counts())
data.head()

No     5174
Yes     500
Name: Churn, dtype: int64


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
973,8919-FYFQZ,Male,1,Yes,No,44,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,30.35,1359.7,Yes
4230,5108-ADXWO,Male,0,No,No,11,Yes,No,Fiber optic,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,73.5,791.75,Yes
6496,2474-BRUCM,Male,1,Yes,No,40,Yes,Yes,Fiber optic,Yes,...,No,No,Yes,Yes,Month-to-month,No,Electronic check,101.85,4086.3,Yes
4480,7908-QCBCA,Female,0,Yes,No,1,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,69.4,69.4,Yes
680,1448-PWKYE,Male,0,Yes,Yes,1,Yes,No,Fiber optic,No,...,No,No,No,Yes,Month-to-month,No,Electronic check,80.0,80.0,Yes


In [12]:
# Shuffle the rows so the two classes are interleaved; the fixed seed keeps the
# row order reproducible across runs.
data = data.sample(frac=1, random_state=0)
# Shuffling must not change the class counts.
data['Churn'].value_counts()

No     5174
Yes     500
Name: Churn, dtype: int64

In [13]:
# Same model as before, now fitted on the more strongly imbalanced data.
numericData = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]

# Standardize the features; fit and transform use the same selection.
transformer = StandardScaler().fit(numericData)
scaled_x = transformer.transform(numericData)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default is equivalent.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(
    scaled_x, data['Churn'])

# Accuracy on the training rows; compare it with the share of 'No' rows in `data`.
classification.score(scaled_x, data['Churn'])

0.912407472682411

In [14]:
# Manual over-sampling: draw churners with replacement until there are as many
# as non-churners, then combine and shuffle.
counts = churnData['Churn'].value_counts()
# Look the majority count up by label; `counts[0]` was a positional lookup on a
# label-indexed Series and depended on the sort order of the counts.
yes = churnData[churnData['Churn']=='Yes'].sample(counts['No'], replace=True, random_state=0)
no = churnData[churnData['Churn']=='No']
data = pd.concat([yes,no], axis=0)
# Seeded shuffle so the row order is reproducible.
data = data.sample(frac=1, random_state=0)
data['Churn'].value_counts()

Yes    5174
No     5174
Name: Churn, dtype: int64

In [15]:
# Class counts in the original data, for comparison with the resampled frame.
counts = churnData['Churn'].value_counts()
counts

No     5174
Yes    1869
Name: Churn, dtype: int64

In [16]:
# Fit the same model on the manually over-sampled (balanced) data.
features = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(features)
X = transformer.transform(features)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default is equivalent.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(
    X, data['Churn'])

# Accuracy on the training rows of the balanced data.
classification.score(X, data['Churn'])

0.7361809045226131

In [17]:
# %pip install imbalanced-learn   (this is the distribution that provides the `imblearn` package)

In [18]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Random under-sampling with imbalanced-learn; seeded so the selected rows are
# the same on every run.
rus = RandomUnderSampler(random_state=0)
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_rus, y_rus = rus.fit_resample(X, y)

ModuleNotFoundError: No module named 'imblearn'

In [19]:
# Class counts of the original target, before resampling.
y.value_counts()

NameError: name 'y' is not defined

In [42]:
# Class counts after random under-sampling.
y_rus.value_counts()

No     1869
Yes    1869
Name: Churn, dtype: int64

In [43]:
# Fit the model on the randomly under-sampled data. The scaler is re-fitted on
# the resampled rows.
transformer = StandardScaler().fit(X_rus)
X = transformer.transform(X_rus)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default is equivalent.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y_rus)

# Accuracy on the training rows of the under-sampled data.
classification.score(X, y_rus)

0.7335473515248796

In [44]:
# Random over-sampling with imbalanced-learn; seeded so the duplicated rows are
# the same on every run.
ros = RandomOverSampler(random_state=0)
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_ros, y_ros = ros.fit_resample(X, y)

In [45]:
# Class counts of the original target, before over-sampling.
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [47]:
# Class counts after random over-sampling.
y_ros.value_counts()

Yes    5174
No     5174
Name: Churn, dtype: int64

In [48]:
# Fit the model on the randomly over-sampled data. The scaler is re-fitted on
# the resampled rows.
transformer = StandardScaler().fit(X_ros)
X = transformer.transform(X_ros)

# `multi_class='ovr'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default is equivalent.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y_ros)

# Accuracy on the training rows of the over-sampled data.
classification.score(X, y_ros)

0.7342481638964051

### Synthetic Minority Oversampling Technique (SMOTE)

In [14]:
from imblearn.over_sampling import SMOTE

# SMOTE over-sampling on the standardized features; seeded so the synthetic
# samples are reproducible.
smote = SMOTE(random_state=0)
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

### UnderSampling using TomekLinks 

Tomek links are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process.

In [15]:
from imblearn.under_sampling import TomekLinks

# Pass the strategy by keyword: current imbalanced-learn releases do not accept
# it as a positional argument.
tl = TomekLinks(sampling_strategy='majority')
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No     4694
Yes    1869
Name: Churn, dtype: int64

In [16]:
# Apply Tomek-link removal a second time, to the already-cleaned data.
# `fit_sample` has been removed from imbalanced-learn; use `fit_resample`.
X_tl2, y_tl2 = tl.fit_resample(X_tl, y_tl)
y_tl2.value_counts()

No     4537
Yes    1869
Name: Churn, dtype: int64

In [55]:
# It does not make the two classes equal; it only removes the points from the majority
# class that are close to points in the minority class

In [61]:
# It works in a similar way for multi-class classification models (more than two labels),
# but it can be a little tricky

In [56]:
# Rebuild the standardized feature matrix and the target for the next example.
feature_cols = ['tenure', 'SeniorCitizen', 'MonthlyCharges']
transformer = StandardScaler()
transformer.fit(churnData[feature_cols])
X = transformer.transform(churnData[feature_cols])
y = churnData['Churn']

In [58]:
# Relabel 1,000 rows to create an artificial third class.
# Work on a copy: `y` was taken straight from churnData, so assigning into it
# can write through to churnData['Churn'], depending on the pandas version's
# copy semantics. `.iloc` makes the slice explicitly positional.
y = y.copy()
y.iloc[6000:7000] = 'Hello'
y.value_counts()

No       4445
Yes      1598
Hello    1000
Name: Churn, dtype: int64

In [59]:
# Tomek-link under-sampling on the three-class target.
# Pass the strategy by keyword: current imbalanced-learn releases do not accept
# it as a positional argument.
tl = TomekLinks(sampling_strategy='majority')
# `fit_sample` has been removed from imbalanced-learn; use `fit_resample`.
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No       3747
Yes      1598
Hello    1000
Name: Churn, dtype: int64

In [60]:
# SMOTE on the three-class target.
# `fit_sample` has been removed from imbalanced-learn; use `fit_resample`.
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

Yes      4445
Hello    4445
No       4445
Name: Churn, dtype: int64