# Imbalanced Dataset 

## Technique 1: Upsampling


In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy.stats as stat 

In [4]:
np.random.seed(123)  # fix NumPy's global seed so the random values below are the same on every run


In [6]:
# Total number of rows to generate and the share of them assigned to class 0.
n_samples = 1000
class_ratio = 0.9
# round() instead of int(): the float product can land just below the intended
# integer (e.g. 100 * 0.29 == 28.999999999999996), and int() would truncate it
# and silently drop a row from class 0.
n_class_0 = round(n_samples * class_ratio)
# Whatever is left goes to class 1, so the two counts always sum to n_samples.
n_class_1 = n_samples - n_class_0


In [7]:
# Display the two per-class row counts as a tuple.
n_class_0,n_class_1

(900, 100)

In [8]:
# Class-0 rows: n_class_0 samples with two standard-normal features and a
# constant target of 0. feature_1 is drawn before feature_2, so the random
# stream is consumed in the same order.
class_0 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_0),
        feature_2=np.random.normal(0, 1, n_class_0),
        target=[0 for _ in range(n_class_0)],
    )
)

In [11]:
# Sanity check: count the rows per label in class_0.
class_0['target'].value_counts()

0    900
Name: target, dtype: int64

In [13]:
# Class-1 rows: n_class_1 samples with two standard-normal features and a
# constant target of 1. feature_1 is drawn before feature_2, so the random
# stream is consumed in the same order.
class_1 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_1),
        feature_2=np.random.normal(0, 1, n_class_1),
        target=[1 for _ in range(n_class_1)],
    )
)

In [14]:
# Preview the first rows of the class-1 frame.
class_1.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.774224,-0.58856,1
1,-1.201377,-0.049089,1
2,1.096257,-0.3417,1
3,0.861037,0.504139,1
4,-1.520367,1.567392,1


In [15]:
# Sanity check: count the rows per label in class_1.
class_1['target'].value_counts()

1    100
Name: target, dtype: int64

In [16]:
# (rows, columns) of the class-1 frame.
class_1.shape

(100, 3)

In [17]:
# (rows, columns) of the class-0 frame.
class_0.shape

(900, 3)

In [26]:
# Stack class 0 on top of class 1 into one frame; ignore_index=True gives the
# result a fresh 0..n-1 index, the same as concat followed by reset_index(drop=True).
df = pd.concat([class_0, class_1], ignore_index=True)

In [27]:
# First rows of the combined frame (class 0 was concatenated first).
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [29]:
# Last five rows of the combined frame (class 1 was concatenated last).
df.tail(5)

Unnamed: 0,feature_1,feature_2,target
995,1.754312,0.519677,1
996,-0.717891,-1.576624,1
997,-0.823192,0.398328,1
998,1.535918,-0.11141,1
999,1.30774,0.060793,1


In [30]:
# (rows, columns) of the combined frame.
df.shape

(1000, 3)

In [33]:
# Class distribution before any resampling.
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [34]:
# Split the combined frame by label so each class can be resampled on its own.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [36]:
from sklearn.utils import resample   # library for resampling 

In [37]:
# Upsample the minority class: draw rows with replacement until there are as
# many minority rows as majority rows.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,  # fixed so the same rows are drawn on every run
)

In [41]:
# Last rows of the upsampled minority frame; the rows still carry their index labels from df.
df_minority_upsampled.tail()

Unnamed: 0,feature_1,feature_2,target
952,0.068213,1.524047,1
965,-0.945837,0.477732,1
976,0.16448,-1.270221,1
942,-3.066988,-0.60806,1
974,-0.095412,-0.720391,1


In [44]:
# Combine the untouched majority rows with the upsampled minority rows;
# ignore_index=True renumbers the result 0..n-1, the same as reset_index(drop=True).
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [46]:
# First rows of the upsampled frame (majority rows were concatenated first).
df_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [47]:
# Last rows of the upsampled frame (upsampled minority rows were concatenated last).
df_upsampled.tail()

Unnamed: 0,feature_1,feature_2,target
1795,0.068213,1.524047,1
1796,-0.945837,0.477732,1
1797,0.16448,-1.270221,1
1798,-3.066988,-0.60806,1
1799,-0.095412,-0.720391,1


In [45]:
# Class distribution after upsampling.
df_upsampled['target'].value_counts()

1    900
0    900
Name: target, dtype: int64

### Note: Upsampling increases the data volume

## Technique 2: Downsampling

In [48]:
# Downsample the majority class: draw rows without replacement until only as
# many majority rows remain as there are minority rows.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=42,  # fixed so the same rows are drawn on every run
)

In [49]:
# (rows, columns) of the downsampled majority frame.
df_majority_downsampled.shape

(100, 3)

In [51]:
# Combine all minority rows with the downsampled majority rows;
# ignore_index=True renumbers the result 0..n-1, the same as reset_index(drop=True).
df_downsampled = pd.concat([df_minority, df_majority_downsampled], ignore_index=True)
# Show (rows, columns) of the combined frame.
df_downsampled.shape

(200, 3)

In [52]:
# First rows of the downsampled frame (minority rows were concatenated first).
df_downsampled.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.774224,-0.58856,1
1,-1.201377,-0.049089,1
2,1.096257,-0.3417,1
3,0.861037,0.504139,1
4,-1.520367,1.567392,1


In [53]:
# Last rows of the downsampled frame (downsampled majority rows were concatenated last).
df_downsampled.tail()

Unnamed: 0,feature_1,feature_2,target
195,-0.168426,0.553775,0
196,-0.403366,0.081491,0
197,-0.269293,0.611238,0
198,-0.295829,0.671673,0
199,-0.553404,0.804438,0


In [54]:
# Class distribution after downsampling.
df_downsampled['target'].value_counts()

1    100
0    100
Name: target, dtype: int64

### Note: Downsampling decreases the data volume