# Lab Handling Data Imbalance Classification

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## Reading file and selecting data

In [2]:
churnData = pd.read_csv('./files_for_lab/Customer-Churn.csv')
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
display(churnData[churnData['TotalCharges'] == '0']['TotalCharges'])
display(churnData[churnData['TotalCharges'] == ' ']['TotalCharges'])

Series([], Name: TotalCharges, dtype: object)

488      
753      
936      
1082     
1340     
3331     
3826     
4380     
5218     
6670     
6754     
Name: TotalCharges, dtype: object

In [6]:
# The spaces should be zeros

In [7]:
for i in range(len(churnData['TotalCharges'])):
    if churnData['TotalCharges'][i] == ' ':
        churnData['TotalCharges'][i] = '0'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  churnData['TotalCharges'][i] = '0'


In [8]:
display(churnData[churnData['TotalCharges'] == '0']['TotalCharges'])
display(churnData[churnData['TotalCharges'] == ' ']['TotalCharges'])

488     0
753     0
936     0
1082    0
1340    0
3331    0
3826    0
4380    0
5218    0
6670    0
6754    0
Name: TotalCharges, dtype: object

Series([], Name: TotalCharges, dtype: object)

In [9]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'])

In [10]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [11]:
churnData['Churn_num'] = 0
for i in range(len(churnData['Churn_num'])):
    if churnData['Churn'][i] == 'Yes':
        churnData['Churn_num'][i] = churnData['Churn_num'][i] + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  churnData['Churn_num'][i] = churnData['Churn_num'][i] + 1


In [12]:
display(churnData['Churn_num'].value_counts())
display(churnData['Churn'].value_counts())

0    5174
1    1869
Name: Churn_num, dtype: int64

No     5174
Yes    1869
Name: Churn, dtype: int64

#### Selecting data

In [13]:
features_to_use = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
features_to_use

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.50
2,2,0,53.85,108.15
3,45,0,42.30,1840.75
4,2,0,70.70,151.65
...,...,...,...,...
7038,24,0,84.80,1990.50
7039,72,0,103.20,7362.90
7040,11,0,29.60,346.45
7041,4,1,74.40,306.60


#### Scaling

In [14]:
scaled = StandardScaler().fit_transform(features_to_use)

In [15]:
scaled_features = pd.DataFrame(scaled,columns=features_to_use.columns)
scaled_features

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-1.277445,-0.439916,-1.160323,-0.992611
1,0.066327,-0.439916,-0.259629,-0.172165
2,-1.236724,-0.439916,-0.362660,-0.958066
3,0.514251,-0.439916,-0.746535,-0.193672
4,-1.236724,-0.439916,0.197365,-0.938874
...,...,...,...,...
7038,-0.340876,-0.439916,0.665992,-0.127605
7039,1.613701,-0.439916,1.277533,2.242606
7040,-0.870241,-0.439916,-1.168632,-0.852932
7041,-1.155283,2.273159,0.320338,-0.870513


In [16]:
final_scaled = pd.concat([scaled_features, churnData['Churn_num']], axis =1)
final_scaled

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn_num
0,-1.277445,-0.439916,-1.160323,-0.992611,0
1,0.066327,-0.439916,-0.259629,-0.172165,0
2,-1.236724,-0.439916,-0.362660,-0.958066,1
3,0.514251,-0.439916,-0.746535,-0.193672,0
4,-1.236724,-0.439916,0.197365,-0.938874,1
...,...,...,...,...,...
7038,-0.340876,-0.439916,0.665992,-0.127605,0
7039,1.613701,-0.439916,1.277533,2.242606,0
7040,-0.870241,-0.439916,-1.168632,-0.852932,0
7041,-1.155283,2.273159,0.320338,-0.870513,1


#### Train test

In [17]:
X = final_scaled.drop(['Churn_num'], axis = 1)
y = final_scaled['Churn_num']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
logistic_model = LogisticRegression()
model = logistic_model.fit(X_train,y_train)

In [20]:
logistic_model.score(X_test, y_test)

0.8062455642299503

## Checking the imbalance

In [21]:
final_scaled['Churn_num'].value_counts()

0    5174
1    1869
Name: Churn_num, dtype: int64

In [22]:
category_0 = final_scaled[final_scaled['Churn_num'] == 0]
category_1 = final_scaled[final_scaled['Churn_num'] == 1]

### Downsampling

In [23]:
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))

In [24]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


In [25]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [26]:
data_downsampled['Churn_num'].value_counts()

0    1869
1    1869
Name: Churn_num, dtype: int64

In [27]:
X = data_downsampled.drop(['Churn_num'], axis = 1)
y = data_downsampled['Churn_num']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
logistic_model.fit(X_train, y_train)
pred = logistic_model.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.7479674796747967
recall:  0.7479674796747967
f1:  0.7479674796747967


### Upsampling

In [30]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [31]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5174, 5)
(5174, 5)


In [32]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [33]:
data_upsampled['Churn_num'].value_counts()

0    5174
1    5174
Name: Churn_num, dtype: int64

In [34]:
X = data_upsampled.drop(['Churn_num'], axis = 1)
y = data_upsampled['Churn_num']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
logistic_model.fit(X_train, y_train)
pred = logistic_model.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.7265846736045412
recall:  0.7485380116959064
f1:  0.7373979836773883


### Smote

In [37]:
X = final_scaled.drop(['Churn_num'], axis = 1)
y = final_scaled['Churn_num']

In [38]:
sm = SMOTE(random_state=100,k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X,y)

In [39]:
X_train_SMOTE.shape

(10348, 4)

In [40]:
logistic_model.fit(X_train_SMOTE, y_train_SMOTE)
pred = logistic_model.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.7274453941120608
recall:  0.746588693957115
f1:  0.7368927368927369
