## Dealing With an Imbalanced Dataset

In [1]:
import modin.pandas as modin_pd
import warnings
# Install a global "ignore" filter: every Python warning raised after this line
# is suppressed, which also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the training features and labels; both CSV paths are relative, so the
# files are looked up in the current working directory.
x_train, y_train = (
    modin_pd.read_csv(csv_name) for csv_name in ("x_train.csv", "y_train.csv")
)

### Method 1: Random Oversampling

In [3]:
!pip install imblearn

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

def Random_Oversampling(x_train, y_train, *, random_state=42,
                        sampling_strategy='auto', label_col='Class'):
    """Balance a training set with ``RandomOverSampler`` and report class counts.

    Parameters
    ----------
    x_train : feature table passed unchanged to ``fit_resample``.
    y_train : label table; must support ``y_train[label_col]``.
    random_state : seed forwarded to ``RandomOverSampler`` (default 42, the
        value previously hard-coded here).
    sampling_strategy : forwarded to ``RandomOverSampler``. With the default
        ``'auto'`` a binary target ends up with a 1:1 class ratio.
    label_col : column of ``y_train`` whose values are counted for the
        before/after report (default ``'Class'``).

    Returns
    -------
    tuple
        ``(x_train_ros, y_train_ros)`` exactly as returned by ``fit_resample``.
    """
    print('Before ROS: {0}'.format(Counter(y_train[label_col])))
    ros = RandomOverSampler(random_state=random_state,
                            sampling_strategy=sampling_strategy)
    x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)
    print('After ROS: {0}'.format(Counter(y_train_ros[label_col])))
    return x_train_ros, y_train_ros

In [5]:
# Oversample the training split, report the resulting shapes, then persist
# both frames as CSV (with header row, without the index column).
x_train_ros, y_train_ros = Random_Oversampling(x_train, y_train)
print("x_train_ros shape:{}".format(x_train_ros.shape))
print("y_train_ros shape:{}".format(y_train_ros.shape))
x_train_ros.to_csv('x_train_ros.csv', index=False, header=True)
y_train_ros.to_csv('y_train_ros.csv', index=False, header=True)

Before ROS: Counter({0: 227451, 1: 394})
After ROS: Counter({0: 227451, 1: 227451})
x_train_ros shape:(454902, 15)
y_train_ros shape:(454902, 1)


### Method 2: SMOTE (Synthetic Minority Oversampling Technique)

In [6]:
from imblearn.over_sampling import SMOTE

def SMOTE_method(x_train, y_train, *, random_state=42,
                 sampling_strategy='auto', label_col='Class'):
    """Oversample a training set with ``SMOTE`` and report class counts.

    Parameters
    ----------
    x_train : feature table passed unchanged to ``fit_resample``.
    y_train : label table; must support ``y_train[label_col]``.
    random_state : seed forwarded to ``SMOTE`` (default 42, the value
        previously hard-coded here).
    sampling_strategy : forwarded to ``SMOTE``; ``'auto'`` is the library
        default, so omitting it keeps the previous behaviour.
    label_col : column of ``y_train`` whose values are counted for the
        before/after report (default ``'Class'``).

    Returns
    -------
    tuple
        ``(x_train_smote, y_train_smote)`` as returned by ``fit_resample``.
    """
    print('Before SMOTE: {0}'.format(Counter(y_train[label_col])))
    smote = SMOTE(random_state=random_state, sampling_strategy=sampling_strategy)
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
    print('After SMOTE: {0}'.format(Counter(y_train_smote[label_col])))
    return x_train_smote, y_train_smote

In [7]:
# Apply SMOTE to the training split, report the resulting shapes, then persist
# both frames as CSV (with header row, without the index column).
x_train_smote, y_train_smote = SMOTE_method(x_train, y_train)
print("x_train_smote shape:{}".format(x_train_smote.shape))
print("y_train_smote shape:{}".format(y_train_smote.shape))
x_train_smote.to_csv('x_train_smote.csv', index=False, header=True)
y_train_smote.to_csv('y_train_smote.csv', index=False, header=True)

Before SMOTE: Counter({0: 227451, 1: 394})
After SMOTE: Counter({0: 227451, 1: 227451})
x_train_smote shape:(454902, 15)
y_train_smote shape:(454902, 1)


### Method 3: Combined Sampling (SMOTETomek)

In [8]:
from imblearn.combine import SMOTETomek

def Combined_sampling(x_train, y_train, *, random_state=42,
                      sampling_strategy='auto', label_col='Class'):
    """Resample a training set with ``SMOTETomek`` and report class counts.

    Parameters
    ----------
    x_train : feature table passed unchanged to ``fit_resample``.
    y_train : label table; must support ``y_train[label_col]``.
    random_state : seed forwarded to ``SMOTETomek`` (default 42, the value
        previously hard-coded here).
    sampling_strategy : forwarded to ``SMOTETomek``; ``'auto'`` is the library
        default, so omitting it keeps the previous behaviour.
    label_col : column of ``y_train`` whose values are counted for the
        before/after report (default ``'Class'``).

    Returns
    -------
    tuple
        ``(x_train_cs, y_train_cs)`` as returned by ``fit_resample``.
    """
    print('Before CS: {0}'.format(Counter(y_train[label_col])))
    cs = SMOTETomek(random_state=random_state, sampling_strategy=sampling_strategy)
    x_train_cs, y_train_cs = cs.fit_resample(x_train, y_train)
    print('After CS: {0}'.format(Counter(y_train_cs[label_col])))
    return x_train_cs, y_train_cs


In [9]:
# Apply SMOTETomek to the training split, report the resulting shapes, then
# persist both frames as CSV (with header row, without the index column).
x_train_cs, y_train_cs = Combined_sampling(x_train, y_train)
print("x_train_cs shape: {}".format(x_train_cs.shape))
print("y_train_cs shape: {}".format(y_train_cs.shape))
x_train_cs.to_csv('x_train_cs.csv', index=False, header=True)
y_train_cs.to_csv('y_train_cs.csv', index=False, header=True)

Before CS: Counter({0: 227451, 1: 394})
After CS: Counter({0: 227451, 1: 227451})
x_train_cs shape: (454902, 15)
y_train_cs shape: (454902, 1)


### Method 4: ADASYN (Adaptive Synthetic Sampling)

In [10]:
from imblearn.over_sampling import ADASYN

def ADASYN_sampling(x_train, y_train, *, random_state=42,
                    sampling_strategy='auto', label_col='Class'):
    """Oversample a training set with ``ADASYN`` and report class counts.

    Parameters
    ----------
    x_train : feature table passed unchanged to ``fit_resample``.
    y_train : label table; must support ``y_train[label_col]``.
    random_state : seed forwarded to ``ADASYN`` (default 42, the value
        previously hard-coded here).
    sampling_strategy : forwarded to ``ADASYN``; ``'auto'`` is the library
        default, so omitting it keeps the previous behaviour.
    label_col : column of ``y_train`` whose values are counted for the
        before/after report (default ``'Class'``).

    Returns
    -------
    tuple
        ``(x_train_adasyn, y_train_adasyn)`` as returned by ``fit_resample``.
    """
    print('Before ADASYN: {0}'.format(Counter(y_train[label_col])))
    adasyn = ADASYN(random_state=random_state, sampling_strategy=sampling_strategy)
    x_train_adasyn, y_train_adasyn = adasyn.fit_resample(x_train, y_train)
    print('After ADASYN: {0}'.format(Counter(y_train_adasyn[label_col])))
    return x_train_adasyn, y_train_adasyn

# Apply ADASYN to the training split, report the resulting shapes, then
# persist both frames as CSV (with header row, without the index column).
x_train_adasyn, y_train_adasyn = ADASYN_sampling(x_train, y_train)
print("x_train_adasyn shape: {}".format(x_train_adasyn.shape))
print("y_train_adasyn shape: {}".format(y_train_adasyn.shape))
x_train_adasyn.to_csv('x_train_adasyn.csv', index=False, header=True)
y_train_adasyn.to_csv('y_train_adasyn.csv', index=False, header=True)


Before ADASYN: Counter({0: 227451, 1: 394})
After ADASYN: Counter({0: 227451, 1: 227444})
x_train_adasyn shape: (454895, 15)
y_train_adasyn shape: (454895, 1)


### Method 5: NearMiss

In [11]:
from imblearn.under_sampling import NearMiss

def NearMiss_sampling(x_train, y_train, *, version=2,
                      sampling_strategy='auto', label_col='Class'):
    """Undersample a training set with ``NearMiss`` and report class counts.

    Parameters
    ----------
    x_train : feature table passed unchanged to ``fit_resample``.
    y_train : label table; must support ``y_train[label_col]``.
    version : NearMiss variant forwarded to ``NearMiss`` (default 2, the value
        previously hard-coded here).
    sampling_strategy : forwarded to ``NearMiss`` (default ``'auto'``, as
        before).
    label_col : column of ``y_train`` whose values are counted for the
        before/after report (default ``'Class'``).

    Returns
    -------
    tuple
        ``(x_train_nearmiss, y_train_nearmiss)`` as returned by
        ``fit_resample``.
    """
    print('Before NearMiss: {0}'.format(Counter(y_train[label_col])))
    nearmiss = NearMiss(version=version, sampling_strategy=sampling_strategy)
    x_train_nearmiss, y_train_nearmiss = nearmiss.fit_resample(x_train, y_train)
    print('After NearMiss: {0}'.format(Counter(y_train_nearmiss[label_col])))
    return x_train_nearmiss, y_train_nearmiss

# Apply NearMiss to the training split, report the resulting shapes, then
# persist both frames as CSV (with header row, without the index column).
x_train_nearmiss, y_train_nearmiss = NearMiss_sampling(x_train, y_train)
print("x_train_nearmiss shape: {}".format(x_train_nearmiss.shape))
print("y_train_nearmiss shape: {}".format(y_train_nearmiss.shape))
x_train_nearmiss.to_csv('x_train_nearmiss.csv', index=False, header=True)
y_train_nearmiss.to_csv('y_train_nearmiss.csv', index=False, header=True)


Before NearMiss: Counter({0: 227451, 1: 394})
After NearMiss: Counter({0: 394, 1: 394})
x_train_nearmiss shape: (788, 15)
y_train_nearmiss shape: (788, 1)
