Imbalanced Dataset Handling:

1. Upsampling
2. Downsampling

In [27]:
import numpy as np
import pandas as pd

In [28]:
# Seed NumPy's global random generator so the synthetic data is reproducible
np.random.seed(123)

# Sizes for a two-class dataset: class 0 takes 90% of the rows, class 1 the rest
n_samples, class_0_ratio = 1000, 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [29]:
# Show the number of rows planned for each class as a (class 0, class 1) tuple
n_class_0, n_class_1

(900, 100)

In [30]:
# Build one synthetic frame per class; the label column is attached afterwards.

# Majority class (target 0): both features drawn from a normal with mean 0, std 1
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
}).assign(target=0)

# Minority class (target 1): both features drawn from a normal with mean 2, std 1
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
}).assign(target=1)

# Result: n_class_0 rows labelled 0 and n_class_1 rows labelled 1 (900 and 100 here).

In [31]:
# Display the minority-class frame (target == 1)
class_1

Unnamed: 0,feature_1,feature_2,target
0,1.699768,2.139033,1
1,1.367739,2.025577,1
2,1.795683,1.803557,1
3,2.213696,3.312255,1
4,3.033878,3.187417,1
...,...,...,...
95,1.376371,2.845701,1
96,2.239810,0.880077,1
97,1.131760,1.640703,1
98,2.902006,0.390305,1


In [32]:
# Preview the two frames stacked on top of each other; the index is not reset
# here, so the row labels of class_1 start again from 0.
pd.concat([class_0, class_1])

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
95,1.376371,2.845701,1
96,2.239810,0.880077,1
97,1.131760,1.640703,1
98,2.902006,0.390305,1


In [33]:
# Stack both classes into one frame with a fresh 0..n-1 index
df = pd.concat([class_0, class_1], ignore_index=True)

In [34]:
# First five rows of the combined frame
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [35]:
# Count rows per target label to show the class imbalance
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,100


UPSAMPLING

In [36]:
# Split the combined frame by label: target 1 is the minority, target 0 the majority
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [37]:
# First five minority rows; they keep their row labels from df
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1


In [38]:
# First five majority rows
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [39]:
# Perform Upsampling

from sklearn.utils import resample

In [40]:
# Upsample: draw minority rows with replacement until there are as many
# of them as there are majority rows. random_state fixes the draw.
df_minority_upsample = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=45,
)


In [41]:
# Shape of the upsampled minority frame as (rows, columns)
df_minority_upsample.shape

(900, 3)

In [42]:
# Shape of the original minority frame, for comparison with the upsampled one
df_minority.shape

(100, 3)

In [43]:
# First five upsampled rows; row labels are those of the sampled minority rows
df_minority_upsample.head()

Unnamed: 0,feature_1,feature_2,target
975,2.192449,3.566033,1
930,1.245992,2.946908,1
903,2.213696,3.312255,1
932,1.91547,1.652041,1
995,1.376371,2.845701,1


In [45]:
# Count how many times each distinct row occurs in the upsampled frame
# (sampling with replacement repeats rows)
df_minority_upsample.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
feature_1,feature_2,target,Unnamed: 3_level_1
1.487494,3.310876,1,16
0.342491,1.927940,1,16
3.382173,2.913005,1,15
2.181974,2.162665,1,15
3.011616,2.668087,1,15
...,...,...,...
1.367739,2.025577,1,4
4.125817,1.784590,1,4
2.909031,2.669214,1,3
2.844335,2.015572,1,3


In [47]:
# Count rows per target label in the upsampled minority frame
df_minority_upsample['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,900


In [48]:
# Stack the majority rows and the upsampled minority rows into one frame.
# Row labels are kept as-is, so labels of repeated minority rows are duplicated.
df_upsampled = pd.concat([df_majority, df_minority_upsample], axis=0)

In [49]:
# Shape of the balanced (upsampled) frame as (rows, columns)
df_upsampled.shape

(1800, 3)

In [50]:
# Count rows per target label after upsampling
df_upsampled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,900


In [51]:
# First five rows of the upsampled frame (the majority rows come first)
df_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [53]:
# Last five rows of the upsampled frame (resampled minority rows come last)
df_upsampled.tail()

Unnamed: 0,feature_1,feature_2,target
965,3.919526,1.980541,1
925,2.193094,2.19678,1
919,1.804892,2.842652,1
911,2.157959,1.613304,1
998,2.902006,0.390305,1


DOWNSAMPLING:

In [54]:
# Regenerate both class frames for the downsampling section. The draws continue
# from the current RNG state, so the values differ from the first dataset.

# Majority class (target 0): both features drawn from a normal with mean 0, std 1
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
}).assign(target=0)

# Minority class (target 1): both features drawn from a normal with mean 2, std 1
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
}).assign(target=1)

In [55]:
# Stack the regenerated classes into one frame with a fresh 0..n-1 index
df = pd.concat([class_0, class_1], ignore_index=True)

In [58]:
# Re-split the current `df` first. Without this, the cell would resample the
# `df_majority` / `df_minority` frames left over from the upsampling section
# and the regenerated `df` would never be used.
df_minority = df[df['target'] == 1]
df_majority = df[df['target'] == 0]

# Downsample: keep a random subset of the majority rows, drawn without
# replacement, sized to match the minority class. random_state fixes the draw.
df_majority_downsample = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=45,
)


In [59]:
# Shape of the downsampled majority frame as (rows, columns)
df_majority_downsample.shape

(100, 3)

In [61]:
# Stack the minority rows and the downsampled majority rows into one frame
df_downsample = pd.concat([df_minority, df_majority_downsample], axis=0)

In [62]:
# Count rows per target label after downsampling
df_downsample['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,100
0,100
