In [1]:
import numpy as np
import pandas as pd

In [2]:
## set the random seed for reproducibility
## (seeds NumPy's global random state, which the np.random.normal calls below draw from)
np.random.seed(123)

In [3]:
## sizes for a two-class dataset: class 0 is the majority, class 1 the minority
n_samples = 1000
class_0_ratio = 0.9
## round() rather than int(): int() truncates, so a float product that lands just
## below a whole number (e.g. 100 * 0.29 -> 28.999999999999996) would lose a sample
n_class_0 = round(n_samples * class_0_ratio)
## the minority class takes whatever remains, so the two counts always sum to n_samples
n_class_1 = n_samples - n_class_0

In [4]:
## show the resulting class sizes as a (class 0, class 1) tuple
n_class_0, n_class_1

(900, 100)

In [5]:
## build one frame per class for the imbalanced dataset
## both classes use the same feature distributions; only the row count and label differ
def _make_class_frame(label, n_rows):
    """Return n_rows rows with feature_1 ~ N(2, 1), feature_2 ~ N(0, 1) and a constant target."""
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=2, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=0, scale=1, size=n_rows),
        'target': [label] * n_rows,
    })


## class 0 is generated before class 1 so the seeded random draws happen in the same order
class_0 = _make_class_frame(0, n_class_0)
class_1 = _make_class_frame(1, n_class_1)

In [6]:
## display both per-class frames together as a tuple (class 0 first, then class 1)
class_0, class_1

(     feature_1  feature_2  target
 0     0.914369   0.551302       0
 1     2.997345   0.419589       0
 2     2.282978   1.815652       0
 3     0.493705  -0.252750       0
 4     1.421400  -0.292004       0
 ..         ...        ...     ...
 895   2.238761  -0.003155       0
 896   0.893614  -0.430660       0
 897   2.366732  -0.146416       0
 898   3.023906   1.160176       0
 899   1.789944  -0.641512       0
 
 [900 rows x 3 columns],
     feature_1  feature_2  target
 0    1.699768   0.139033       1
 1    1.367739   0.025577       1
 2    1.795683  -0.196443       1
 3    2.213696   1.312255       1
 4    3.033878   1.187417       1
 ..        ...        ...     ...
 95   1.376371   0.845701       1
 96   2.239810  -1.119923       1
 97   1.131760  -0.359297       1
 98   2.902006  -1.609695       1
 99   2.697490   0.013570       1
 
 [100 rows x 3 columns])

In [7]:
## stack the two class frames (class 0 rows first) and renumber the rows from 0
df = pd.concat([class_0, class_1], ignore_index=True)
df.head()

Unnamed: 0,feature_1,feature_2,target
0,0.914369,0.551302,0
1,2.997345,0.419589,0
2,2.282978,1.815652,0
3,0.493705,-0.25275,0
4,1.4214,-0.292004,0


In [8]:
## count rows per target label to confirm the class imbalance
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

## Up Sampling

-> Generally preferable to down sampling because no data points are discarded; the minority class is enlarged by repeating its existing rows.

In [9]:
## split df by label: rows with target == 1 form the minority class, target == 0 the majority
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [10]:
from sklearn.utils import resample

## upsample the minority class: draw rows from df_minority with replacement until
## there are as many of them as majority rows, so existing rows get repeated
df_minority_upsampled = resample(
    df_minority,                  # frame to sample rows from
    replace=True,                 # sample with replacement (n_samples exceeds len(df_minority))
    n_samples=len(df_majority),   # target row count = size of the majority class
    random_state=42,              # fixed seed so the same rows are drawn on every run
)

In [11]:
## preview the first rows of the upsampled minority frame
## (rows keep their original labels from df, as the output below shows)
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,1.125854,-0.156083,1
992,2.19657,-0.602575,1
914,1.93217,0.998053,1
971,2.272825,1.034197,1
960,2.870056,-0.449515,1


In [12]:
## combine the untouched majority rows with the upsampled minority rows
## -> balanced dataset with 900 samples per category; this is the main dataset from here on
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,0.914369,0.551302,0
1,2.997345,0.419589,0
2,2.282978,1.815652,0
3,0.493705,-0.252750,0
4,1.421400,-0.292004,0
...,...,...,...
1795,1.188902,0.189189,1
1796,3.919526,-0.019459,1
1797,2.810326,1.604614,1
1798,3.621531,0.168229,1


## Down Sampling

-> The drawback is that we lose data points: most of the majority-class rows are discarded.

In [13]:
from sklearn.utils import resample

## downsample the majority class: keep a random subset of df_majority that is
## the same size as the minority class
df_majority_downsampled = resample(
    df_majority,                  # frame to sample rows from
    replace=False,                # sample without replacement, so no majority row is picked twice
    n_samples=len(df_minority),   # target row count = size of the minority class
    random_state=42,              # fixed seed so the same rows are drawn on every run
)

In [14]:
## display the downsampled majority frame
## (rows keep their original labels from df, as the output below shows)
df_majority_downsampled

Unnamed: 0,feature_1,feature_2,target
70,2.468439,1.720920,0
827,3.089165,-0.464899,0
231,2.753869,-0.969798,0
588,2.588686,-0.704720,0
39,2.283627,1.012868,0
...,...,...,...
398,1.831574,0.553775,0
76,1.596634,0.081491,0
196,1.730707,0.611238,0
631,1.704171,0.671673,0


In [15]:
## combine the untouched minority rows with the downsampled majority rows
## -> balanced dataset with 100 samples per category; this is the main dataset from here on
df_downsampled = pd.concat([df_minority, df_majority_downsampled], ignore_index=True)
df_downsampled

Unnamed: 0,feature_1,feature_2,target
0,1.699768,0.139033,1
1,1.367739,0.025577,1
2,1.795683,-0.196443,1
3,2.213696,1.312255,1
4,3.033878,1.187417,1
...,...,...,...
195,1.831574,0.553775,0
196,1.596634,0.081491,0
197,1.730707,0.611238,0
198,1.704171,0.671673,0


## Balanced Dataset

In [16]:
## balanced dataset produced by upsampling the minority class
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,0.914369,0.551302,0
1,2.997345,0.419589,0
2,2.282978,1.815652,0
3,0.493705,-0.252750,0
4,1.421400,-0.292004,0
...,...,...,...
1795,1.188902,0.189189,1
1796,3.919526,-0.019459,1
1797,2.810326,1.604614,1
1798,3.621531,0.168229,1


In [17]:
## balanced dataset produced by downsampling the majority class
df_downsampled

Unnamed: 0,feature_1,feature_2,target
0,1.699768,0.139033,1
1,1.367739,0.025577,1
2,1.795683,-0.196443,1
3,2.213696,1.312255,1
4,3.033878,1.187417,1
...,...,...,...
195,1.831574,0.553775,0
196,1.596634,0.081491,0
197,1.730707,0.611238,0
198,1.704171,0.671673,0
