# Imbalanced Dataset
### A dataset with an unequal class distribution, i.e. one class has far more samples than the other

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the credit-card data CSV into a DataFrame, then preview the first five rows.
credit_card_df = pd.read_csv(filepath_or_buffer='credit_data.csv')
credit_card_df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
credit_card_df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# Number of rows per class label (0 = legitimate, 1 = fraudulent).
credit_card_df["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

The dataset is highly imbalanced: 284,315 legitimate transactions versus only 492 fraudulent ones (about 0.17% of all rows).

In [6]:
# Split the full frame by class label: 0 -> legitimate rows, 1 -> fraudulent rows.
legit_transactions = credit_card_df.loc[credit_card_df["Class"].eq(0)]
fraudulent_transactions = credit_card_df.loc[credit_card_df["Class"].eq(1)]

In [7]:
# (rows, columns) of each per-class frame, legitimate first and fraudulent second.
tuple(frame.shape for frame in (legit_transactions, fraudulent_transactions))

((284315, 31), (492, 31))

### Undersampling
Build a sample dataset containing an equal number of legitimate and fraudulent transactions by randomly down-sampling the majority (legitimate) class to the size of the fraud class.

In [8]:
# Down-sample the legitimate class to the size of the fraud class so the two
# classes end up balanced. The sample size is derived from the data instead of
# being hard-coded, and random_state is fixed so that a fresh
# "Restart Kernel -> Run All" always draws the same rows.
legit_sample = legit_transactions.sample(
    n=len(fraudulent_transactions),
    random_state=42,
)
legit_sample.shape

(492, 31)

Concatenate the two DataFrames (the legitimate sample and all fraudulent transactions) into a single balanced dataset.

In [9]:
# Stack the sampled legitimate rows on top of the fraud rows (row-wise);
# ignore_index is not set, so the original row labels are kept.
new_df = pd.concat(
    objs=[legit_sample, fraudulent_transactions],
    axis="index",
)

In [10]:
# First five rows of the combined frame (these come from the legitimate sample).
new_df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
28088,34851.0,-1.134569,1.039712,1.664143,-0.161296,0.297783,-0.70664,1.299433,-0.464844,0.132868,...,-0.142123,0.0027,-0.242997,0.377487,0.363658,0.359169,-0.252751,-0.241202,22.0,0
15463,26849.0,-2.69342,2.337264,-0.303612,2.352796,0.091536,-0.544235,1.281668,-0.459579,0.769469,...,-0.403418,0.522401,-0.136038,0.515889,-0.678903,0.004672,0.717001,0.307317,56.27,0
170826,120372.0,0.107479,1.272812,-1.934159,-0.466879,1.230858,-1.197636,1.165312,0.044588,-0.578391,...,0.177879,0.492414,-0.099265,0.440287,-0.32921,0.527895,-0.09179,0.037436,13.56,0
42244,41012.0,-2.241933,-5.287394,-2.147628,2.58382,-1.728497,-0.530352,2.993815,-0.849004,-0.547925,...,1.110897,-0.865194,-1.755071,0.509848,0.195926,-0.491947,-0.349761,0.328911,1823.2,0
114652,73564.0,-0.775932,1.270756,1.020605,0.144648,-0.293701,-0.765563,0.267848,0.493036,-0.532631,...,-0.170273,-0.559881,0.131636,0.336963,-0.262931,0.08462,0.132704,0.031115,10.99,0


In [11]:
# Last five rows of the combined frame (these come from the fraudulent rows).
new_df.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [12]:
# Confirm that the undersampled frame has the same number of rows in each class.
new_df["Class"].value_counts()

0    492
1    492
Name: Class, dtype: int64