# **Handling Imbalanced Dataset**

An Imbalanced Dataset are dataset in which the number of samples belonging to each class or category is not equal. Some classes have larger number of samples than the others.
This problem can lead to false prediction of the model as the model is baised to those categories which have the large number of samples.
To solve these problems there are two techniques:
1. Up sampling
2. Down sampling

In [1]:
import numpy as np
import pandas as pd

np.random.seed(321)

n_sample = 1000
class0_ratio = 0.9
n_class0 = int(n_sample * class0_ratio)     #900 data points
n_class1 = n_sample - n_class0              #100 data points

n_class0,n_class1


(900, 100)

In [2]:
# Create the dataframe with imbalance dataset

class_0 = pd.DataFrame({
    'Feature_1':np.random.normal(loc=0,scale=1,size=n_class0),
    'Feature_2':np.random.normal(loc=0,scale=1,size=n_class0),
    'Target': [0] * n_class0
})

class_1 = pd.DataFrame({
    'Feature_1':np.random.normal(loc=0,scale=1,size=n_class1),
    'Feature_2':np.random.normal(loc=0,scale=1,size=n_class1),
    'Target': [1] * n_class1
})


In [3]:
df = pd.concat([class_0,class_1]).reset_index(drop=True)

In [5]:
df

Unnamed: 0,Feature_1,Feature_2,Target
0,0.172519,0.297040,0
1,1.635483,2.015559,0
2,0.037336,1.116748,0
3,-0.884150,1.416398,0
4,-1.143192,0.870548,0
...,...,...,...
995,-0.202530,0.420751,1
996,0.593783,0.285815,1
997,-2.378084,0.040056,1
998,0.852989,2.365325,1


In [13]:
df['Target'].value_counts()

0    900
1    100
Name: Target, dtype: int64

# UpSampling

In [7]:
df_minority = df[df['Target'] == 1]
df_majority = df[df['Target'] == 0]

In [9]:
df_minority.shape

(100, 3)

In [8]:
from sklearn.utils import resample
df_minority_updated = resample(df_minority,replace = True,n_samples=len(df_majority),random_state=42)
# df_minority --> the sample which we want to resample
# replace --> Whenever we are resampling it will be done by replacement.
# n_sample --> size of the sample
# random state --> constant value so that the seed is fixed

In [8]:
df_minority_updated.shape

(900, 3)

In [12]:
df_minority_updated

Unnamed: 0,Feature_1,Feature_2,Target
951,-0.452024,0.742970,1
992,-1.186699,0.991730,1
914,0.335986,1.021995,1
971,-0.800629,-0.128517,1
960,0.541342,0.461438,1
...,...,...,...
952,-0.141840,-0.861340,1
965,-0.642254,-0.685350,1
976,-0.296097,0.300597,1
942,-0.237574,0.383892,1


In [10]:
df_updated = pd.concat([df_majority,df_minority_updated])

In [11]:
df_updated['Target'].value_counts()      #Re-sampled

0    900
1    900
Name: Target, dtype: int64

In [52]:
df_updated.tail()

Unnamed: 0,Feature_1,Feature_2,Target
952,-0.14184,-0.86134,1
965,-0.642254,-0.68535,1
976,-0.296097,0.300597,1
942,-0.237574,0.383892,1
974,0.808526,0.044897,1


# Down Sampling

In [14]:
np.random.seed(159)

sample = 5000
class_1_ratio = 0.2
class_1 = int(class_1_ratio * sample)
class_2 = sample - class_1

In [15]:
class_1,class_2

(1000, 4000)

In [16]:
# Creating the dataset

class1 = pd.DataFrame({
    'A':np.random.normal(loc=0,scale=1,size=class_1),
    'B':np.random.normal(loc=0,scale=1,size=class_1),
    'Target':[0] * class_1
})
class2 = pd.DataFrame({
    'A':np.random.normal(loc=1,scale=1,size=class_2),
    'B':np.random.normal(loc=1,scale=1,size=class_2),
    'Target':[1] * class_2
})

In [17]:
class1.shape,class2.shape

((1000, 3), (4000, 3))

In [18]:
df1 = pd.concat([class1,class2]).reset_index(drop = True)

In [19]:
df1.tail()

Unnamed: 0,A,B,Target
4995,0.568829,1.815757,1
4996,0.629208,1.070227,1
4997,2.057189,0.759124,1
4998,1.847973,1.391399,1
4999,0.861621,1.718557,1


In [20]:
df1['Target'].value_counts()

1    4000
0    1000
Name: Target, dtype: int64

In [27]:
df_maj = df1[df1['Target'] == 1]
df_min = df1[df1['Target'] == 0]

In [33]:
from sklearn.utils import resample
df_majority_updated = resample(df_maj,replace = False,n_samples=len(df_min),random_state=42)
# replace --> It wiil be false as we are reducing the data point to solve the imbalance


In [29]:
df_majority_updated.shape,df_min.shape

((1000, 3), (1000, 3))

In [30]:
df_downsampled = pd.concat([df_majority_updated,df_min])

In [31]:
df_downsampled.Target.value_counts()

1    1000
0    1000
Name: Target, dtype: int64

In [42]:
df_downsampled.head()

Unnamed: 0,A,B,Target
1555,1.343993,0.453085,1
4491,1.936511,1.737668,1
1527,-0.227424,1.684011,1
4925,1.181088,1.800313,1
3989,0.957344,2.18931,1
