# Imbalanced Dataset Handling

### Upsampling
### Downsampling

In [2]:
import numpy as np
import pandas as pd

In [3]:
np.random.seed(123)  # fixed seed so the synthetic data below is reproducible

# Describe a two-class data set in which class 0 makes up 90% of the rows.
n_samples = 1000
class_0_ratio = 0.9
# round() instead of int(): the product is a float, and int() truncates, so a
# result that lands just below a whole number (e.g. 100 * 0.29 evaluates to
# 28.999999999999996) would silently lose one row from class 0.
n_class_0 = round(n_samples * class_0_ratio)
# Class 1 takes whatever is left, so the two sizes always sum to n_samples.
n_class_1 = n_samples - n_class_0

In [14]:
# Build one frame per class in a single pass: each class gets two features drawn
# from a normal distribution (mean 0, std 1) and a constant integer label.
# Class 0 is generated before class 1, and feature_1 before feature_2, so the
# random draws happen in a fixed order.
class_0, class_1 = (
    pd.DataFrame(
        {
            "feature_1": np.random.normal(loc=0, scale=1, size=class_size),
            "feature_2": np.random.normal(loc=0, scale=1, size=class_size),
            "target": [class_label] * class_size,
        }
    )
    for class_label, class_size in ((0, n_class_0), (1, n_class_1))
)

In [15]:
# Stack both classes into one frame; ignore_index=True gives the combined rows
# a fresh 0..n-1 index instead of keeping each class's own 0-based labels.
df = pd.concat([class_0, class_1], ignore_index=True)

In [16]:
# Count the rows per class label to see how imbalanced the data set is.
df.target.value_counts()

0    900
1    100
Name: target, dtype: int64

In [18]:
# Upsampling, step 1: split the frame by class label so that each class can be
# resampled on its own.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]
                 

In [19]:
# Inspect the minority-class rows (target == 1).
df_minority

Unnamed: 0,feature_1,feature_2,target
900,-0.650884,-1.178457,1
901,2.079573,-0.801016,1
902,1.445297,-0.730579,1
903,-0.668054,0.984417,1
904,-0.581201,-1.055938,1
...,...,...,...
995,0.317874,-1.433189,1
996,2.203088,-0.370110,1
997,0.477481,-0.284352,1
998,1.767493,0.373888,1


In [20]:
# Inspect the majority-class rows (target == 0).
df_majority

Unnamed: 0,feature_1,feature_2,target
0,-2.643425,-0.093436,0
1,-0.448991,1.650653,0
2,-0.358907,-0.845004,0
3,0.133194,0.395839,0
4,-0.644242,-1.342092,0
...,...,...,...
895,-0.619840,1.140850,0
896,0.524523,-0.571378,0
897,0.666087,0.675585,0
898,0.448143,0.244925,0


In [31]:
# Perform upsampling
from sklearn.utils import resample

In [32]:
# Grow the minority class until it has as many rows as the majority class.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],  # target size: one minority row per majority row
    replace=True,  # sample with replacement, so minority rows may repeat
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

In [33]:
# Check the size of the upsampled minority frame as (rows, columns).
df_minority_upsampled.shape

(900, 3)

In [34]:
# Count the labels in the upsampled frame; only the minority label should appear.
df_minority_upsampled.target.value_counts()

1    900
Name: target, dtype: int64

In [39]:
# Stack the untouched majority rows with the resampled minority rows to get a
# balanced frame. Sampling with replacement repeats minority rows, so their
# original index labels repeat as well; ignore_index=True assigns a fresh
# 0..n-1 index so every row in the result has a unique label.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [41]:
# Check the size of the combined, upsampled frame as (rows, columns).
df_upsampled.shape

(1800, 3)

# Down Sampling

In [42]:
# Generate a second synthetic data set for the downsampling walkthrough, using
# the same recipe: two normally distributed features (mean 0, std 1) and a
# constant integer label per class. The NumPy seed is not reset in this cell,
# so these draws continue the existing random stream. Class 0 is generated
# before class 1, and feature_1 before feature_2.
class_0, class_1 = (
    pd.DataFrame(
        {
            "feature_1": np.random.normal(loc=0, scale=1, size=class_size),
            "feature_2": np.random.normal(loc=0, scale=1, size=class_size),
            "target": [class_label] * class_size,
        }
    )
    for class_label, class_size in ((0, n_class_0), (1, n_class_1))
)

In [45]:
# Stack both classes into one frame; ignore_index=True gives the combined rows
# a fresh 0..n-1 index instead of keeping each class's own 0-based labels.
df = pd.concat([class_0, class_1], ignore_index=True)

In [46]:
# Downsampling, step 1: split the frame by class label so that the majority
# class can be resampled on its own.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]
                 

In [49]:
# Shrink the majority class until it has as many rows as the minority class.
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],  # target size: one majority row per minority row
    replace=False,  # sample WITHOUT replacement, so no majority row is picked twice
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

In [50]:
# Inspect the majority-class rows that the downsampling step kept.
df_majority_downsampled

Unnamed: 0,feature_1,feature_2,target
70,-0.123805,-0.094300,0
827,0.495958,-0.199839,0
231,1.014114,-1.668710,0
588,0.043981,-0.181916,0
39,-0.359285,-1.577710,0
...,...,...,...
398,-0.158115,-0.280113,0
76,0.014385,0.116049,0
196,-1.389215,0.520615,0
631,-0.004800,-0.324563,0


In [51]:
# Combine all minority rows with the downsampled majority rows into the
# balanced frame. The result is stored (as df_upsampled is in the upsampling
# section) so that it can be reused instead of being lost after display.
# The original row labels are kept, so each row can be traced back to df.
df_downsampled = pd.concat([df_minority, df_majority_downsampled])
df_downsampled

Unnamed: 0,feature_1,feature_2,target
900,-1.128260,0.536811,1
901,-0.290360,-1.082676,1
902,-0.186490,-0.250598,1
903,0.810140,-1.664906,1
904,-1.276904,-0.015444,1
...,...,...,...
398,-0.158115,-0.280113,0
76,0.014385,0.116049,0
196,-1.389215,0.520615,0
631,-0.004800,-0.324563,0
