## Handling Imbalanced Data
### 1. Up Sampling 
### 2. Down Sampling 

In [1]:
import numpy as np
import pandas as  pd

# Seed NumPy's global RNG so the generated features are reproducible
np.random.seed(123)

# Class sizes: 900 majority (class 0) rows and 100 minority (class 1) rows.
# Every size below is derived from these two values so they cannot drift apart.
n_class_0 = 900
n_class_1 = 100
n_samples = n_class_0 + n_class_1

# Majority class: two standard-normal features, all labelled 0.
# The draw order (feature_1, then feature_2) determines the generated values.
class_0 = pd.DataFrame({
    'feature_1': np.random.randn(n_class_0),
    'feature_2': np.random.randn(n_class_0),
    'target': [0] * n_class_0
})

# Minority class: two standard-normal features, all labelled 1
class_1 = pd.DataFrame({
    'feature_1': np.random.randn(n_class_1),
    'feature_2': np.random.randn(n_class_1),
    'target': [1] * n_class_1
})


In [4]:
# Stack the class 0 rows on top of the class 1 rows; ignore_index=True
# renumbers the combined frame's index from 0.
class_frames = [class_0, class_1]
dataset = pd.concat(class_frames, ignore_index=True)

In [5]:
# Tally rows per target label to verify both classes are in the combined frame.
target_counts = dataset['target'].value_counts()
print("Class counts:\n", target_counts)

Class counts:
 target
0    900
1    100
Name: count, dtype: int64


In [6]:
# Display the configured class sizes as a (class 0, class 1) pair.
(n_class_0, n_class_1)

(900, 100)

In [8]:
# Upsampling, step 1: separate the rows by class label
# (target == 1 is the minority class, target == 0 the majority).
target_labels = dataset['target']
dataset_minority = dataset.loc[target_labels == 1]
dataset_majority = dataset.loc[target_labels == 0]

In [9]:
# Report how many rows each class contributes before resampling.
for label, subset in (("Majority:", dataset_majority), ("Minority:", dataset_minority)):
    print(label, len(subset))

Majority: 900
Minority: 100


In [11]:
from sklearn.utils import resample

# Draw minority rows with replacement until there are as many of them as
# majority rows; the fixed random_state makes the draw repeatable.
majority_row_count = len(dataset_majority)
dataset_minority_upsampled = resample(
    dataset_minority,
    n_samples=majority_row_count,
    replace=True,
    random_state=123,
)


In [15]:
# Sanity check: display the (rows, columns) shape of the upsampled minority frame.
dataset_minority_upsampled.shape

(900, 3)

In [12]:
# Join the majority rows with the upsampled minority rows, shuffle every row
# (frac=1 keeps all of them; random_state fixes the order), then discard the
# old row labels in favour of a fresh index.
dataset_balanced = (
    pd.concat([dataset_majority, dataset_minority_upsampled])
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)


In [14]:
# Verify the result: per-class row counts first, then a preview of the first rows.
balanced_counts = dataset_balanced['target'].value_counts()
balanced_preview = dataset_balanced.head()
print(balanced_counts)
print(balanced_preview)

target
0    900
1    900
Name: count, dtype: int64
   feature_1  feature_2  target
0   0.691423   1.433240       0
1   1.013493   0.047240       1
2  -1.213385   0.675504       0
3  -0.119519  -0.160798       0
4  -1.825028   1.216906       1


### Down Sampling

In [16]:
import numpy as np
import pandas as pd
from sklearn.utils import resample

np.random.seed(123)  # Seed NumPy's global RNG so the generated data is reproducible

# Class sizes, named once so the frames below always agree with them
n_class_0 = 900  # majority
n_class_1 = 100  # minority

# Class 0 (majority): two standard-normal features, all labelled 0.
# The draw order (feature_1, then feature_2) determines the generated values.
class_0 = pd.DataFrame({
    'feature_1': np.random.randn(n_class_0),
    'feature_2': np.random.randn(n_class_0),
    'target': [0] * n_class_0
})

# Class 1 (minority): two standard-normal features, all labelled 1
class_1 = pd.DataFrame({
    'feature_1': np.random.randn(n_class_1),
    'feature_2': np.random.randn(n_class_1),
    'target': [1] * n_class_1
})

# Combine into a single dataset with a fresh index starting at 0
dataset = pd.concat([class_0, class_1], ignore_index=True)



In [17]:
# Partition the rows by class label: 0 is the majority class, 1 the minority.
class_labels = dataset['target']
dataset_majority = dataset[class_labels.eq(0)]
dataset_minority = dataset[class_labels.eq(1)]


In [18]:
# Draw majority rows without replacement, as many as there are minority rows;
# the fixed random_state makes the selection repeatable.
minority_row_count = len(dataset_minority)
dataset_majority_downsampled = resample(
    dataset_majority,
    n_samples=minority_row_count,
    replace=False,
    random_state=123,
)


In [19]:
# Join the downsampled majority rows with the minority rows, shuffle every row
# (frac=1 keeps all of them; random_state fixes the order), then discard the
# old row labels in favour of a fresh index.
dataset_downsampled = (
    pd.concat([dataset_majority_downsampled, dataset_minority])
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)


In [20]:
# Verify the result: per-class row counts first, then a preview of the first rows.
downsampled_counts = dataset_downsampled['target'].value_counts()
downsampled_preview = dataset_downsampled.head()
print(downsampled_counts)
print(downsampled_preview)


target
0    100
1    100
Name: count, dtype: int64
   feature_1  feature_2  target
0   0.744819   0.116802       0
1   1.382173   0.913005       1
2  -1.727669  -0.713989       0
3   0.944820  -0.722076       1
4   1.651437  -0.116932       0
