<a href="https://colab.research.google.com/github/Neerajmn28/Feature-Engineering/blob/main/Handling%20Imbalanced%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

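#### What are Upsampling and Downsampling?
These are techniques used to balance the number of examples in each class of a classification problem, especially when one class heavily outnumbers another (a situation called class imbalance). Upsampling adds minority-class examples, typically by re-drawing existing rows with replacement, until the minority class is as large as the majority class; downsampling keeps only a random subset of the majority class so that it is as small as the minority class.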


#### Why Do We Need It?
When your data is imbalanced, machine learning models tend to:

- Favor the majority class
- Perform poorly on the minority class (e.g., detecting fraud or disease)

So upsampling and downsampling help the model learn equally from both classes, improving its ability to detect the rare but important cases.


#### Which One to Use?
Upsampling is good when your dataset is small and you don’t want to lose data.

Downsampling is good when your dataset is large and has enough majority samples to remove without harming the model.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Dataset configuration: 1000 rows in total, 90% of them in class 0.
n_samples = 1000
class_0_ratio = 0.9

# Per-class row counts; class 1 receives whatever class 0 leaves over.
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

# Seed NumPy's global generator so the random draws that follow are reproducible.
np.random.seed(123)

In [None]:
# Show the derived class sizes as a (class 0, class 1) pair.
(n_class_0, n_class_1)

(900, 100)

In [None]:
# Build the imbalanced dataset one class at a time.
# NOTE: the four np.random.normal calls must stay in this order, otherwise the
# seeded generator hands out different values to each column.

# Class 0 (majority): both features are drawn from a normal centred on 0.
class_0 = pd.DataFrame(dict(
    feature_1=np.random.normal(0, 1, n_class_0),
    feature_2=np.random.normal(0, 1, n_class_0),
    target=[0] * n_class_0,
))

# Class 1 (minority): both features are drawn from a normal centred on 2.
class_1 = pd.DataFrame(dict(
    feature_1=np.random.normal(2, 1, n_class_1),
    feature_2=np.random.normal(2, 1, n_class_1),
    target=[1] * n_class_1,
))

In [None]:
# Stack the class 0 rows above the class 1 rows and renumber the index from 0.
df = pd.concat([class_0, class_1], ignore_index=True)

In [None]:
# Peek at the last five rows; class 1 was concatenated last, so it appears here.
df.tail(5)

Unnamed: 0,feature_1,feature_2,target
995,1.376371,2.845701,1
996,2.23981,0.880077,1
997,1.13176,1.640703,1
998,2.902006,0.390305,1
999,2.69749,2.01357,1


In [None]:
# Count the rows per class to confirm the imbalance.
df["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,100


## Upsampling

In [None]:
# Split the frame by label so that each class can be resampled on its own.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [None]:
from sklearn.utils import resample

# Upsample the minority class: draw rows with replacement until it has as many
# rows as the majority class. random_state pins the draw so reruns match.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [None]:
# Confirm the upsampled minority frame now has len(df_majority) rows.
df_minority_upsampled.shape

(900, 3)

In [None]:
# Preview the sampled rows. They keep the index labels of the rows they were
# drawn from, and because sampling is with replacement those labels repeat.
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.19657,1.397425,1
914,1.93217,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1


In [None]:
# Combine the untouched majority rows with the upsampled minority rows.
# ignore_index=True gives the result a fresh 0..n-1 index: the upsampled rows
# were drawn with replacement and therefore carry repeated index labels, which
# would make label-based lookups (.loc) return several rows for one label.
# This also matches how the original `df` was built (index reset after concat).
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [None]:
# Both classes should now have the same number of rows.
df_upsampled["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,900


## Downsampling

In [None]:
def _make_class_frame(center, n_rows, label):
    """Return `n_rows` rows for one class.

    Both features are drawn from a normal distribution with mean `center` and
    standard deviation 1 (feature_1 first, then feature_2, using NumPy's global
    generator); every row gets `label` as its target.
    """
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows,
    })


# Re-seed so this section rebuilds the dataset from a known generator state.
np.random.seed(123)

# 1000 rows in total, 90% of them in class 0.
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

# Class 0 is generated before class 1 so the seeded draws land in the same columns.
class_0 = _make_class_frame(center=0, n_rows=n_class_0, label=0)
class_1 = _make_class_frame(center=2, n_rows=n_class_1, label=1)

# Stack the two classes and renumber the index from 0.
df = pd.concat([class_0, class_1], ignore_index=True)

# Check the class distribution
print(df['target'].value_counts())

target
0    900
1    100
Name: count, dtype: int64


In [None]:
# Split the rebuilt frame by label: class 1 is the minority, class 0 the majority.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [None]:
# Print each subset's (rows, columns) on its own line, minority first.
print(df_minority.shape, df_majority.shape, sep="\n")

(100, 3)
(900, 3)


In [None]:
from sklearn.utils import resample

# Downsample the MAJORITY class to the size of the minority class.
# The source frame must be df_majority: resampling df_minority (as this cell
# previously did) only reshuffles the minority rows and leaves the majority
# class untouched, so nothing gets downsampled.
# replace=False draws each kept row at most once; sampling with replacement is
# not needed when shrinking a class and would only introduce duplicate rows.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,  # fixed seed so the selected subset is reproducible
)

In [None]:
# Confirm the resampled frame has len(df_minority) rows.
df_majority_downsampled.shape

(100, 3)