# Treating imbalanced Class

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
from collections import Counter
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For modeling building and tunning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [3]:
# for evaluation

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import classification_report,roc_auc_score, roc_curve
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

Using TensorFlow backend.


In [4]:
# for class imbalnce

from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

Data loaded from previous work. 

Case of end customers' travelling behavior from online system 

In [6]:
df = pd.read_csv('selected_feature.csv')
df.shape

(45805, 515)

In [7]:
df.head()

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,469.781624,2,3.0,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1498.817537,1,3.0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2921.339028,4,6.0,22,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- event_type is our target variable.
- 1 means booking activity
- 0 means searching activity

In [8]:
df.event_type.value_counts()

0    43997
1     1808
Name: event_type, dtype: int64

In [9]:
df.event_type.value_counts(normalize=True)

0    0.960528
1    0.039472
Name: event_type, dtype: float64

We have our target variable with only 3% of class 1 i.e class of our interest.

### 3)-Model Building

### 3.1)- Separate X and y

In [10]:
target=df["event_type"]
features=df.drop(['event_type'], axis=1)

In [11]:
print(target.shape)
print(features.shape)

(45805,)
(45805, 514)


### 3.2)- Normalize data

This will take care of those spike values in variables like distance

In [12]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(features)

### 3.3)-train_test_split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [14]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

(32063, 514)
(13742, 514)
(32063,)
(13742,)


### 3.4)-Applying Logistic Classifier

In [15]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [16]:
print(accuracy_score(y_test,predictions_LR))

0.9598311745015282


In [17]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     13217
           1       0.03      0.00      0.00       525

    accuracy                           0.96     13742
   macro avg       0.50      0.50      0.49     13742
weighted avg       0.93      0.96      0.94     13742



Accuracy score is good but, is it enough? Looking at recall, precision does not seem to be case where search class is doing very well but, booking event type class is poor. This is where we say that accuracy isn't a good matrics to evaluate our model.

There are two problems here.

- 1- Imbalnced class 
- 2- Selecting correct evaluation matric

In [18]:
#let's review event_type again

booking = df[df['event_type']==1]

search = df[df['event_type']==0]

In [19]:
print(booking.shape)
print(search.shape)

(1808, 515)
(43997, 515)


Book class has only 3.9 % of values i.e 1808. This is why our model did well for accuracy. As accuracy is overall picture how model predicts correct values. Unfortunately those correct values are search class samples. We are interested in onversion likelihood i.e search to booking cases. 

Luckily, there is solution to this 

# 4)- Solving imbalnced Class Problem



In [20]:
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [21]:
def print_results(headline, true_value, pred):
    print(headline)
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))
    
    

In [22]:
classifier = LogisticRegression

**We shall use make_pipe**

In [23]:
# build normal model (we already have done that yet doing it again)
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

In [24]:
# build model with undersampling
nearmiss_pipeline = make_pipeline_imb(NearMiss(), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

In [25]:
# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

### 4.1.Evaluation

In [26]:
# print information about three models
print()
print("normal data distribution: {}".format(Counter(target)))
X_smote, y_smote = SMOTE().fit_sample(X, target)
print("SMOTE data distribution: {}".format(Counter(y_smote)))
X_nearmiss, y_nearmiss = NearMiss().fit_sample(X, target)
print("NearMiss data distribution: {}".format(Counter(y_nearmiss)))


normal data distribution: Counter({0: 43997, 1: 1808})
SMOTE data distribution: Counter({0: 43997, 1: 43997})
NearMiss data distribution: Counter({0: 1808, 1: 1808})


### 4.2.classification report

In [27]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     13217
           1       0.00      0.00      0.00       525

    accuracy                           0.96     13742
   macro avg       0.48      0.50      0.49     13742
weighted avg       0.93      0.96      0.94     13742



In [28]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, smote_prediction))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.55      0.58      0.70      0.57      0.32     13217
          1       0.05      0.58      0.55      0.09      0.57      0.32       525

avg / total       0.94      0.55      0.58      0.68      0.57      0.32     13742



### 4.3.accuracy sore

In [29]:
print()
print('normal Pipeline Score {}'.format(pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
print('NearMiss Pipeline Score {}'.format(nearmiss_pipeline.score(X_test, y_test)))


normal Pipeline Score 0.961650414786785
SMOTE Pipeline Score 0.5510114975986028
NearMiss Pipeline Score 0.22165623635569787


### 4.4.precision, recall & f1_score

In [30]:
print()
print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)


normal classification
precision: 0.0
recall: 0.0
f1: 0.0
f1: 0.0

SMOTE classification
precision: 0.04890522614671568
recall: 0.5828571428571429
f1: 0.09023886759068121
f1: 0.09023886759068121

NearMiss classification
precision: 0.03552835875422413
recall: 0.7409523809523809
f1: 0.06780547324385568
f1: 0.06780547324385568


# K-Fold

In [31]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train = X[train_index]
    y_train = target[train_index]  # Based on your code, you might need a ravel call here, but I would look into how you're generating your y
    X_test = X[test_index]
    y_test = target[test_index]  # See comment on ravel and  y_train
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
    model = LogisticRegression()  # Choose a model here
    model.fit(X_train_oversampled, y_train_oversampled )  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)}')
    print(f'f-score: {f1_score(y_test, y_pred)}')


For fold 1:
Accuracy: 0.5895644580285995
f-score: 0.2407108239095315
For fold 2:
Accuracy: 0.5776661936469818
f-score: 0.02617669267556003
For fold 3:
Accuracy: 0.5786486191463814
f-score: 0.0
For fold 4:
Accuracy: 0.5788669359240257
f-score: 0.0005181347150259067
For fold 5:
Accuracy: 0.5894552996397773
f-score: 0.0010624169986719787


### cross validation done wrong

- Thanks to https://www.youtube.com/watch?v=DQC_YE3I5ig tutorial, I have found what is right and what is wrong way to apply K-fold.

kf = KFold(n_splits=5, random_state=42) <br>
accuracy = [] <br>
precision = [] <br>
recall = [] <br>
f1 = [] <br>
auc = [] <br>
X, y = SMOTE().fit_sample(X_train, y_train) <br>
for train, test in kf.split(X, y): <br>
    pipeline = make_pipeline(classifier(random_state=42)) <br>
    model = pipeline.fit(X[train], y[train]) <br>
    prediction = model.predict(X[test]) <br>
    accuracy.append(pipeline.score(X[test], y[test])) <br>
    precision.append(precision_score(y[test], prediction)) <br>
    recall.append(recall_score(y[test], prediction)) <br>
    f1.append(f1_score(y[test], prediction)) <br>

print("done wrong mean of scores 5-fold:") <br>
print("accuracy: {}".format(np.mean(accuracy))) <br>
print("precision: {}".format(np.mean(precision))) <br>
print("recall: {}".format(np.mean(recall))) <br>
print("f1: {}".format(np.mean(f1))) <br>

**END OF NOTEBOOK**