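Handling Imbalanced Datasets
1. Upsampling: resample the minority class with replacement until it matches the majority class in size
2. Downsampling: sample the majority class without replacement down to the size of the minority class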

In [49]:
import numpy as np
import pandas as pd




In [50]:
## Seed NumPy's global RNG so the synthetic data drawn below is reproducible
np.random.seed(342)

## Sizes for a two-class dataset: 90% of the rows belong to class 0,
## the remainder to class 1
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [51]:
# Display the two class sizes as a (class 0, class 1) pair
(n_class_0, n_class_1)

(900, 100)

In [52]:
## Build one DataFrame per class for the imbalanced dataset.
## Both classes have unit-variance normal features; class 0 is centred at 0
## and class 1 at 2. The draw order (feature_1 then feature_2, class 0 then
## class 1) determines which random numbers each column receives.
class_0 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(loc=0, scale=1, size=n_class_0),
        feature_2=np.random.normal(loc=0, scale=1, size=n_class_0),
        target=[0] * n_class_0,
    )
)

class_1 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(loc=2, scale=1, size=n_class_1),
        feature_2=np.random.normal(loc=2, scale=1, size=n_class_1),
        target=[1] * n_class_1,
    )
)

In [53]:
# Stack class 0 on top of class 1 and renumber the rows from 0
df = pd.concat([class_0, class_1], ignore_index=True)

In [54]:
# First five rows (these come from class 0, which was concatenated first)
df.head(n=5)

Unnamed: 0,feature_1,feature_2,target
0,0.088148,-0.514405,0
1,-0.526802,0.682149,0
2,-0.190451,0.08332,0
3,0.461833,-0.114706,0
4,-1.975934,-0.241695,0


In [55]:
# Last five rows (these come from class 1, which was concatenated last)
df.tail(n=5)

Unnamed: 0,feature_1,feature_2,target
995,2.929401,-0.719177,1
996,3.323463,1.84836,1
997,3.571997,3.155143,1
998,2.262996,4.220162,1
999,1.913612,3.422359,1


In [56]:
# Rows per class label, to confirm the imbalance
df.loc[:, 'target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [57]:
## upsampling
# Split the frame by class label: target == 1 is the minority class and
# target == 0 is the majority class. Each class gets its own name so that
# neither selection overwrites the other.
df_minority=df[df['target']==1]
df_majority=df[df['target']==0]

In [58]:
!pip install scikit-learn



In [59]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [62]:
## upsampling
# Select the rows of each class with a boolean mask on the label column
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [63]:
from sklearn.utils import resample

# Draw minority rows with replacement until there are as many of them as
# there are majority rows; the fixed random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

In [64]:
# (rows, columns) of the upsampled minority frame
df_minority_upsampled.shape

(900, 3)

In [65]:
# First five resampled rows; the index shows which original rows were drawn
df_minority_upsampled.head(n=5)

Unnamed: 0,feature_1,feature_2,target
951,2.420625,2.089477,1
992,2.060364,1.64674,1
914,2.372015,1.535997,1
971,1.017583,1.831833,1
960,1.237624,1.68535,1


In [66]:
# Combine the untouched majority rows with the upsampled minority rows.
# The original index labels are kept, so labels of resampled rows can repeat.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], axis=0)

In [67]:
# Rows per class label after upsampling
df_upsampled.loc[:, 'target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

Downsampling

In [68]:
import pandas as pd

# Re-seed NumPy's global RNG so this section's data is reproducible on its own
np.random.seed(123)

# Class sizes: 1000 rows in total, 90% of them labelled 0
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

# Class 0: both features drawn from a normal distribution with mean 0, std 1
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, n_class_0),
    'feature_2': np.random.normal(0, 1, n_class_0),
}).assign(target=0)

# Class 1: both features drawn from a normal distribution with mean 2, std 1
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(2, 1, n_class_1),
    'feature_2': np.random.normal(2, 1, n_class_1),
}).assign(target=1)

# Stack the two classes and renumber the rows from 0
df = pd.concat([class_0, class_1], ignore_index=True)

# Check the class distribution
print(df['target'].value_counts())

target
0    900
1    100
Name: count, dtype: int64


In [69]:
## downsampling
# Separate the rows by class label before resampling the majority class
df_minority = df.loc[df['target'] == 1]
df_majority = df.loc[df['target'] == 0]

In [70]:
from sklearn.utils import resample

# Downsampling: keep every minority row and shrink the majority class to the
# same size. Rows are drawn WITHOUT replacement so no majority row appears
# twice; the fixed random_state makes the draw repeatable.
df_majority_downsampled=resample(df_majority,replace=False, #Sample without replacement
         n_samples=len(df_minority),
         random_state=42
        )

# Backward-compatible alias: this cell previously bound its result to
# `df_majority_upsampled`, so keep that name pointing at the downsampled frame.
df_majority_upsampled=df_majority_downsampled