# Handling imbalanced datasets

### 1. Upsampling
### 2. Downsampling

In [18]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data can be reproduced
np.random.seed(123)

# Total number of rows and the fraction of them assigned to class 0
n_samples = 1000
class_0_ratio = 0.9

# Class 0 takes its share of the rows; class 1 takes whatever is left
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [2]:
# Inspect the per-class row counts as a (class 0, class 1) tuple
n_class_0, n_class_1 

(900, 100)

In [3]:
# Build one frame per class; concatenated later, they form the imbalanced dataset

np.random.seed(12)

# Class 0: n_class_0 rows, two standard-normal features, label 0.
# The features are drawn in column order (feature_1, then feature_2).
class_0 = pd.DataFrame(
    {
        name: np.random.normal(loc=0, scale=1, size=n_class_0)
        for name in ("feature_1", "feature_2")
    }
).assign(target=0)

# Class 1: n_class_1 rows drawn the same way, label 1
class_1 = pd.DataFrame(
    {
        name: np.random.normal(loc=0, scale=1, size=n_class_1)
        for name in ("feature_1", "feature_2")
    }
).assign(target=1)

In [4]:
# Preview the class-0 frame
class_0

Unnamed: 0,feature_1,feature_2,target
0,0.472986,0.283943,0
1,-0.681426,0.866102,0
2,0.242439,0.968661,0
3,-1.700736,0.395251,0
4,0.753143,1.036359,0
...,...,...,...
895,0.561457,-1.474740,0
896,0.511248,1.036611,0
897,-1.315289,2.092834,0
898,1.095720,-0.436764,0


In [5]:
# Preview the class-1 frame
class_1

Unnamed: 0,feature_1,feature_2,target
0,1.957996,-1.184722,1
1,-2.087900,1.799535,1
2,-0.722651,0.175456,1
3,-1.298794,0.361398,1
4,-0.702603,1.440687,1
...,...,...,...
95,-0.542259,-0.903506,1
96,-1.580046,-0.323730,1
97,0.929778,-2.539636,1
98,1.094100,0.645792,1


In [6]:
# Stack class 0 on top of class 1 and renumber the rows from 0
df = pd.concat([class_0, class_1], ignore_index=True)

In [7]:
# Preview the combined frame (class_0 rows followed by class_1 rows)
df

Unnamed: 0,feature_1,feature_2,target
0,0.472986,0.283943,0
1,-0.681426,0.866102,0
2,0.242439,0.968661,0
3,-1.700736,0.395251,0
4,0.753143,1.036359,0
...,...,...,...
995,-0.542259,-0.903506,1
996,-1.580046,-0.323730,1
997,0.929778,-2.539636,1
998,1.094100,0.645792,1


In [8]:
# First rows of the combined frame
df.head()

Unnamed: 0,feature_1,feature_2,target
0,0.472986,0.283943,0
1,-0.681426,0.866102,0
2,0.242439,0.968661,0
3,-1.700736,0.395251,0
4,0.753143,1.036359,0


In [9]:
# Last rows of the combined frame
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,-0.542259,-0.903506,1
996,-1.580046,-0.32373,1
997,0.929778,-2.539636,1
998,1.0941,0.645792,1
999,1.368448,-0.677329,1


In [10]:
# Count rows per target label to see the class imbalance
df["target"].value_counts()

0    900
1    100
Name: target, dtype: int64

## Upsampling

In [11]:
# Upsampling: first separate the rows by label (target 1 vs. target 0)
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [12]:
from sklearn.utils import resample
# Sample minority rows with replacement until there are as many as majority rows;
# the fixed random_state keeps the draw repeatable
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [13]:
# Check the (rows, columns) of the upsampled minority frame
df_minority_upsampled.shape

(900, 3)

In [14]:
# Peek at the resampled rows; the index keeps the original row labels from df
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,0.053792,1.045723,1
992,0.526108,-0.390821,1
914,0.93073,0.180955,1
971,-0.056518,0.353833,1
960,0.74841,0.286741,1


In [15]:
# Combine the untouched majority rows with the upsampled minority rows
df_upsampled = pd.concat(objs=[df_majority, df_minority_upsampled], axis=0)

In [16]:
# Count rows per label in the upsampled frame to check the class balance
df_upsampled["target"].value_counts()

0    900
1    900
Name: target, dtype: int64

## Downsampling

In [23]:
import numpy as np
import pandas as pd

# Re-seed the global RNG before drawing this section's data
np.random.seed(123)

# Total number of rows and the fraction of them assigned to class 0
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

# Class 0: n_class_0 rows, two standard-normal features, label 0.
# The features are drawn in column order (feature1, then feature2).
class_0 = pd.DataFrame(
    {
        name: np.random.normal(loc=0, scale=1, size=n_class_0)
        for name in ("feature1", "feature2")
    }
).assign(target=0)

# Class 1: n_class_1 rows drawn the same way, label 1
class_1 = pd.DataFrame(
    {
        name: np.random.normal(loc=0, scale=1, size=n_class_1)
        for name in ("feature1", "feature2")
    }
).assign(target=1)



In [24]:
# Preview the class-0 frame
class_0

Unnamed: 0,feature1,feature2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


In [25]:
# Preview the class-1 frame
class_1

Unnamed: 0,feature1,feature2,target
0,-0.300232,0.139033,1
1,-0.632261,0.025577,1
2,-0.204317,-0.196443,1
3,0.213696,1.312255,1
4,1.033878,1.187417,1
...,...,...,...
95,-0.623629,0.845701,1
96,0.239810,-1.119923,1
97,-0.868240,-0.359297,1
98,0.902006,-1.609695,1


In [36]:
# Stack class 0 on top of class 1 and renumber the rows from 0
df = pd.concat([class_0, class_1], ignore_index=True)

In [37]:
# Preview the combined frame (class_0 rows followed by class_1 rows)
df

Unnamed: 0,feature1,feature2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
995,-0.623629,0.845701,1
996,0.239810,-1.119923,1
997,-0.868240,-0.359297,1
998,0.902006,-1.609695,1


In [38]:
# Separate the rows by label (target 1 vs. target 0)
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [39]:
from sklearn.utils import resample

# Sample majority rows without replacement, keeping only as many as there are
# minority rows; the fixed random_state keeps the draw repeatable
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],
    replace=False,
    random_state=42,
)

In [40]:
# Check the (rows, columns) of the downsampled majority frame
df_majority_downsampled.shape

(100, 3)

In [41]:
# Combine the untouched minority rows with the downsampled majority rows
df_downsampled = pd.concat(objs=[df_minority, df_majority_downsampled], axis=0)

In [42]:
# Count rows per label in the downsampled frame to check the class balance
df_downsampled.target.value_counts()

1    100
0    100
Name: target, dtype: int64

In [43]:
# Re-draw the upsampled minority rows from the CURRENT df_minority before
# combining. The df_minority_upsampled left over from the upsampling section was
# sampled from a different dataset (columns feature_1/feature_2), so
# concatenating it with this section's df_majority (columns feature1/feature2)
# would mix two unrelated datasets and pad the mismatched columns with NaN.
df_minority_upsampled = resample(
    df_minority, replace=True, n_samples=len(df_majority), random_state=42
)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Guard against stale state: both inputs must share one schema
assert list(df_upsampled.columns) == list(df_majority.columns)

In [44]:
# Count rows per label in the upsampled frame to check the class balance
df_upsampled.target.value_counts()

0    900
1    900
Name: target, dtype: int64

## Downsampling is bad because we lose data points (the discarded majority-class rows)