## Handling Imbalanced Dataset

1. Up Sampling
2. Down Sampling

In [20]:
import numpy as np 
import pandas as pd  

# Seed NumPy's global RNG so the np.random draws that follow are repeatable.
SEED = 1000
np.random.seed(SEED)

# Dataset size and the fraction of rows assigned to class 0.
n_samples = 1000
class_0_ratio = 0.9

# Class 0 gets the truncated share of rows; class 1 gets whatever is left,
# so the two counts always add up to n_samples.
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [21]:
# Show the two class sizes as a (class 0, class 1) tuple.
(n_class_0, n_class_1)

(900, 100)

In [22]:


def _make_class_frame(center, label, size):
    """Build one class: `size` rows, two normal features centred on `center`
    with unit scale, and a constant `target` column equal to `label`.

    feature_1 is drawn before feature_2, so the sequence of draws from the
    global NumPy RNG is fixed.
    """
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=size),
        'feature_2': np.random.normal(loc=center, scale=1, size=size),
        'target': [label] * size,
    })


# Class 0 is centred at 0 and class 1 at 2 on both features.
# Class 0 must be generated first to keep the random draws in the same order.
class_0 = _make_class_frame(center=0, label=0, size=n_class_0)
class_1 = _make_class_frame(center=2, label=1, size=n_class_1)

# Stack both classes into a single frame with a fresh 0..N-1 index.
df = pd.concat([class_0, class_1], ignore_index=True)


In [23]:
# Peek at the first five rows (all class 0, since class 0 was stacked first).
df.head(n=5)

Unnamed: 0,feature_1,feature_2,target
0,-0.804458,-0.943086,0
1,0.320932,-0.600078,0
2,-0.025483,0.819486,0
3,0.644324,0.500987,0
4,-0.300797,-0.627956,0


In [24]:
# Peek at the last five rows (all class 1, since class 1 was stacked last).
df.tail(n=5)

Unnamed: 0,feature_1,feature_2,target
995,1.359728,1.34701,1
996,1.583076,2.433163,1
997,3.048504,-0.10156,1
998,1.311551,1.712508,1
999,3.361135,0.972246,1


In [25]:
# Count rows per class label to show how imbalanced the data is.
df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [26]:
## Upsampling
# Split the rows by class label so each class can be resampled separately.
df_minority = df.loc[df['target'].eq(1)]   # label 1: the smaller class
df_majority = df.loc[df['target'].eq(0)]   # label 0: the larger class


In [27]:
from sklearn.utils import resample

# Upsample the minority class until it has as many rows as the majority class.
# NOTE(review): "unsampled" in the variable name looks like a typo for
# "upsampled"; the name is kept because later cells reference it.
n_majority = len(df_majority)
df_minority_unsampled = resample(
    df_minority,
    replace=True,          # draw with replacement so more rows than the input holds can be produced
    n_samples=n_majority,
    random_state=42,       # fixed seed so the same rows are drawn on every run
)

In [28]:
# First five resampled minority rows; the index shows the original row labels.
df_minority_unsampled.head(n=5)

Unnamed: 0,feature_1,feature_2,target
951,1.079793,1.965178,1
992,3.743739,0.843806,1
914,2.346936,2.28386,1
971,1.141542,2.095748,1
960,1.289481,1.941745,1


In [29]:
# Last five resampled minority rows.
df_minority_unsampled.tail(n=5)

Unnamed: 0,feature_1,feature_2,target
952,2.929377,2.33691,1
965,2.436201,0.754867,1
976,2.418901,1.535586,1
942,1.7034,1.703804,1
974,1.518401,2.179579,1


In [31]:
# Combine the untouched majority rows with the upsampled minority rows.
# The minority rows were drawn with replacement, so their original index labels
# repeat many times. ignore_index=True assigns a fresh 0..N-1 index so that
# label-based operations (.loc, reindex, index joins) stay unambiguous.
df_upsampled = pd.concat([df_majority, df_minority_unsampled], ignore_index=True)

In [32]:
# Check the class balance after upsampling.
df_upsampled["target"].value_counts()

target
0    900
1    900
Name: count, dtype: int64

In [None]:
# Down Sampling
# Re-derive the per-class frames from df so this section does not rely on
# variables created in the upsampling section.
df_minority = df.loc[df['target'].eq(1)]   # label 1: the smaller class
df_majority = df.loc[df['target'].eq(0)]   # label 0: the larger class


In [33]:
from sklearn.utils import resample

# Downsample the majority class to the size of the minority class.
n_minority = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,         # draw without replacement: no majority row is picked twice
    n_samples=n_minority,
    random_state=42,       # fixed seed so the same rows are drawn on every run
)

In [35]:
# Confirm the downsampled majority frame has as many rows as the minority class.
df_majority_downsampled.shape

(100, 3)

In [36]:
# Stack the full minority class on top of the downsampled majority rows.
# Original row labels are kept as the index.
balanced_parts = [df_minority, df_majority_downsampled]
df_downsampled = pd.concat(balanced_parts, axis=0)

In [37]:
# Check the class balance after downsampling.
df_downsampled["target"].value_counts()

target
1    100
0    100
Name: count, dtype: int64