# Playground for Class Imbalance Slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2020-07-09 08:22:25.350088


In [2]:
import pandas as pd
pd.show_versions(as_json=False)

import sklearn
sklearn.__version__


INSTALLED VERSIONS
------------------
commit           : None
python           : 3.6.10.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
machine          : AMD64
processor        : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : None.None

pandas           : 1.0.3
numpy            : 1.18.1
pytz             : 2019.3
dateutil         : 2.8.1
pip              : 20.0.2
setuptools       : 46.1.3.post20200325
Cython           : None
pytest           : 5.4.1
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.5.0
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.13.0
pandas_datareader: None
bs4              : 4.9.0
bottleneck       : None
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.5.0


'0.21.3'

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


import scipy

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load Dataset

The following dataset is a well-known imbalanced dataset from Kaggle:
    https://www.kaggle.com/mlg-ulb/creditcardfraud/home?select=creditcard.csv

In [4]:
df = pd.read_csv('data/creditcard.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Take a random sample, and then create X, y variables, and then split into training and testing.

In [5]:
from sklearn.model_selection import train_test_split

# This dataset is huge, so let's take a sample to speed things up
# df = df.sample(frac=0.5, replace=False, random_state=1, axis=0)
X = df.drop(['Class'], axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=44)

In [6]:
y_train.value_counts()
y_test.value_counts()

0    227451
1       394
Name: Class, dtype: int64

0    56864
1       98
Name: Class, dtype: int64

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

# Helper function
def quick_evaluate_with_dt(X_train, X_test, y_train, y_test, name, balance_weights=False):
    
    cw = None
    if balance_weights == True:
        cw = 'balanced'
        
    clf = DecisionTreeClassifier(random_state=0, class_weight=cw)
    clf = RandomForestClassifier(random_state=0, n_estimators=100, class_weight=cw)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy       = accuracy_score(y_test, y_pred)
    f1             = f1_score(y_test, y_pred)
    recall         = recall_score(y_test, y_pred)
    precision      = precision_score(y_test, y_pred)
    roc_auc        = roc_auc_score(y_test, y_pred)
    
    df = pd.DataFrame({"Method"    : [name],
                       "Neg"       : [tn + fn],
                       "True Neg"  : [tn],
                       "False Neg" : [fn],
                       "Pos"       : [tp + fp],
                       "TP"        : [tp],
                       "FP"        : [fp],
                       "Accuracy"  : [accuracy],
                       "Recall"    : [recall],
                       "Precision" : [precision],
                       "F1"        : [f1],
                       "AUC"       : [roc_auc],
                      })
    
    print(df)
    return df

In [8]:
evals = list()

In [9]:
X_train.shape
y_train.shape
np.bincount(y_train)

evals.append(quick_evaluate_with_dt(X_train, X_test, y_train, y_test, 'None'))

(227845, 30)

(227845,)

array([227451,    394], dtype=int64)

  Method    Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0   None  56878     56859         19   84  79   5  0.999579  0.806122   

   Precision        F1       AUC  
0   0.940476  0.868132  0.903017  


In [10]:
X_train.shape
y_train.shape
np.bincount(y_train)

evals.append(quick_evaluate_with_dt(X_train, X_test, y_train, y_test, 'Class Weights', balance_weights=True))

(227845, 30)

(227845,)

array([227451,    394], dtype=int64)

          Method    Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Class Weights  56878     56858         20   84  78   6  0.999544  0.795918   

   Precision        F1       AUC  
0   0.928571  0.857143  0.897906  


In [11]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test, 'Over Random'))

(454902, 30)

(454902,)

array([227451, 227451], dtype=int64)

        Method    Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Over Random  56876     56858         18   86  80   6  0.999579  0.816327   

   Precision        F1       AUC  
0   0.930233  0.869565  0.908111  


In [12]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test, 'Over SMOTE'))

(454902, 30)

(454902,)

array([227451, 227451], dtype=int64)

       Method    Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Over SMOTE  56868     56852         16   94  82  12  0.999508  0.836735   

   Precision        F1       AUC  
0    0.87234  0.854167  0.918262  


In [13]:
from imblearn.over_sampling import ADASYN

X_resampled, y_resampled = ADASYN(random_state=0).fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test,  'Over ADASYN'))

(454953, 30)

(454953,)

array([227451, 227502], dtype=int64)

        Method    Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Over ADASYN  56869     56852         17   93  81  12  0.999491  0.826531   

   Precision        F1      AUC  
0   0.870968  0.848168  0.91316  


In [14]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test,  'Under Sample'))

(788, 30)

(788,)

array([394, 394], dtype=int64)

         Method    Neg  True Neg  False Neg   Pos  TP    FP  Accuracy  \
0  Under Sample  55248     55237         11  1714  87  1627  0.971244   

     Recall  Precision        F1       AUC  
0  0.887755   0.050758  0.096026  0.929571  


In [15]:
# Recall the actual distrition of the truth labels of the testing set.
y_test.value_counts()

evals_all = pd.concat([m for m in evals], axis = 0).reset_index()

evals_all = evals_all.drop(columns = "index",axis =1)
evals_all.sort_values(by=['F1'], ascending=False)

0    56864
1       98
Name: Class, dtype: int64

Unnamed: 0,Method,Neg,True Neg,False Neg,Pos,TP,FP,Accuracy,Recall,Precision,F1,AUC
2,Over Random,56876,56858,18,86,80,6,0.999579,0.816327,0.930233,0.869565,0.908111
0,,56878,56859,19,84,79,5,0.999579,0.806122,0.940476,0.868132,0.903017
1,Class Weights,56878,56858,20,84,78,6,0.999544,0.795918,0.928571,0.857143,0.897906
3,Over SMOTE,56868,56852,16,94,82,12,0.999508,0.836735,0.87234,0.854167,0.918262
4,Over ADASYN,56869,56852,17,93,81,12,0.999491,0.826531,0.870968,0.848168,0.91316
5,Under Sample,55248,55237,11,1714,87,1627,0.971244,0.887755,0.050758,0.096026,0.929571


# Another Dataset

The dataset is also imbalanced, although less so. It comes from:

https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [16]:
df_d = pd.read_csv('data/diabetes_orig.csv')
df_d.info()
df_d.head()

X = df_d.drop(['Id', 'diabetes'], axis=1)
y = df_d['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=44)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  768 non-null    int64  
 1   num_times_pregnant  768 non-null    int64  
 2   plasma_glucose      768 non-null    int64  
 3   DBP                 768 non-null    int64  
 4   triceps_skin        768 non-null    int64  
 5   serum_insulin       768 non-null    int64  
 6   BMI                 768 non-null    float64
 7   pedigree            768 non-null    float64
 8   age                 768 non-null    int64  
 9   diabetes            768 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 60.1 KB


Unnamed: 0,Id,num_times_pregnant,plasma_glucose,DBP,triceps_skin,serum_insulin,BMI,pedigree,age,diabetes
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [17]:
y_train.value_counts()
y_test.value_counts()

0    400
1    214
Name: diabetes, dtype: int64

0    100
1     54
Name: diabetes, dtype: int64

In [18]:
evals_d = list()

In [19]:
X_train.shape
y_train.shape
np.bincount(y_train)

evals_d.append(quick_evaluate_with_dt(X_train, X_test, y_train, y_test, 'None'))

(614, 8)

(614,)

array([400, 214], dtype=int64)

  Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0   None  116        88         28   38  26  12   0.74026  0.481481   

   Precision        F1       AUC  
0   0.684211  0.565217  0.680741  


In [20]:
X_train.shape
y_train.shape
np.bincount(y_train)

evals_d.append(quick_evaluate_with_dt(X_train, X_test, y_train, y_test, 'Class Weights', balance_weights=True))

(614, 8)

(614,)

array([400, 214], dtype=int64)

          Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Class Weights  117        92         25   37  29   8  0.785714  0.537037   

   Precision        F1       AUC  
0   0.783784  0.637363  0.728519  


In [21]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals_d.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test, 'Over Random'))

(800, 8)

(800,)

array([400, 400], dtype=int64)

        Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Over Random  110        86         24   44  30  14  0.753247  0.555556   

   Precision        F1       AUC  
0   0.681818  0.612245  0.707778  


In [22]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals_d.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test, 'Over SMOTE'))

(800, 8)

(800,)

array([400, 400], dtype=int64)

       Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Over SMOTE  105        84         21   49  33  16   0.75974  0.611111   

   Precision        F1       AUC  
0   0.673469  0.640777  0.725556  


In [23]:
from imblearn.over_sampling import ADASYN

X_resampled, y_resampled = ADASYN(random_state=0).fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals_d.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test,  'Over ADASYN'))

(775, 8)

(775,)

array([400, 375], dtype=int64)

        Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy   Recall  \
0  Over ADASYN   99        79         20   55  34  21  0.733766  0.62963   

   Precision        F1       AUC  
0   0.618182  0.623853  0.709815  


In [24]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

X_resampled.shape
y_resampled.shape
np.bincount(y_resampled)

evals_d.append(quick_evaluate_with_dt(X_resampled, X_test, y_resampled, y_test,  'Under Sample'))

(428, 8)

(428,)

array([214, 214], dtype=int64)

         Method  Neg  True Neg  False Neg  Pos  TP  FP  Accuracy    Recall  \
0  Under Sample   92        73         19   62  35  27  0.701299  0.648148   

   Precision        F1       AUC  
0   0.564516  0.603448  0.689074  


In [25]:
# Recall the actual distrition of the truth labels of the testing set.
y_test.value_counts()

evals_d_all = pd.concat([m for m in evals_d], axis = 0).reset_index()

evals_d_all = evals_d_all.drop(columns = "index",axis =1)
evals_d_all.sort_values(by=['F1'], ascending=False)

0    100
1     54
Name: diabetes, dtype: int64

Unnamed: 0,Method,Neg,True Neg,False Neg,Pos,TP,FP,Accuracy,Recall,Precision,F1,AUC
3,Over SMOTE,105,84,21,49,33,16,0.75974,0.611111,0.673469,0.640777,0.725556
1,Class Weights,117,92,25,37,29,8,0.785714,0.537037,0.783784,0.637363,0.728519
4,Over ADASYN,99,79,20,55,34,21,0.733766,0.62963,0.618182,0.623853,0.709815
2,Over Random,110,86,24,44,30,14,0.753247,0.555556,0.681818,0.612245,0.707778
5,Under Sample,92,73,19,62,35,27,0.701299,0.648148,0.564516,0.603448,0.689074
0,,116,88,28,38,26,12,0.74026,0.481481,0.684211,0.565217,0.680741
