In [237]:
import pandas as pd                  # A fundamental package for linear algebra and multidimensional arrays
import numpy as np                   # Data analysis and data manipulating tool
import random                        # Library to generate random numbers
from collections import Counter      # Collection is a Python module that implements specialized container datatypes providing 
                                     # alternatives to Python’s general purpose built-in containers, dict, list, set, and tuple.
                                     # Counter is a dict subclass for counting hashable objects
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# To ignore warnings in the notebook
import warnings
warnings.filterwarnings("ignore")

In [62]:
fraud_data = pd.read_csv(r'D:\hakk study\Data Science\Imbalanced dataset\Imbalanced_classes-master\fraud_data.csv')

In [65]:
fraud_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2994681,0,242834,25.0,H,9803,583.0,150.0,visa,226.0,...,firefox 56.0,24.0,1920x1080,match_status:2,T,F,T,T,desktop,rv:56.0
1,3557242,0,15123000,117.0,W,7919,194.0,150.0,mastercard,166.0,...,,,,,,,,,,
2,3327470,0,8378575,73.773,C,12778,500.0,185.0,mastercard,224.0,...,,,,,,,,,,
3,3118781,0,2607840,400.0,R,12316,548.0,150.0,visa,195.0,...,mobile safari generic,32.0,1136x640,match_status:2,T,F,T,F,mobile,iOS Device
4,3459772,0,12226544,31.95,W,9002,453.0,150.0,visa,226.0,...,,,,,,,,,,


In [None]:
fraud_data.info()

In [None]:
fraud_data.isFraud.value_counts().head(20)

In [None]:
fraud_data.isFraud.value_counts()/ len(fraud_data) * 100

In [None]:
sns.countplot(fraud_data.isFraud)

In [88]:
#Missing value in percentage
fraud_data.isnull().sum()/ len(fraud_data) * 100

TransactionID      0.000000
isFraud            0.000000
TransactionDT      0.000000
TransactionAmt     0.000000
ProductCD          0.000000
                    ...    
id_36             75.945745
id_37             75.945745
id_38             75.945745
DeviceType        75.979612
DeviceInfo        79.813391
Length: 434, dtype: float64

In [99]:
#Eliminating column which has more than 20% missing value

fraud_data = fraud_data[fraud_data.columns[fraud_data.isnull().mean() < 0.2]]

In [101]:
fraud_data.isnull().sum()/ len(fraud_data) * 100

TransactionID     0.00000
isFraud           0.00000
TransactionDT     0.00000
TransactionAmt    0.00000
ProductCD         0.00000
                   ...   
V317              0.00508
V318              0.00508
V319              0.00508
V320              0.00508
V321              0.00508
Length: 182, dtype: float64

In [115]:
num_cols = fraud_data.select_dtypes(include=np.number).columns      # getting all the numerical columns
fraud_data.select_dtypes

<bound method DataFrame.select_dtypes of        TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0            2994681        0         242834          25.000         H   9803   
1            3557242        0       15123000         117.000         W   7919   
2            3327470        0        8378575          73.773         C  12778   
3            3118781        0        2607840         400.000         R  12316   
4            3459772        0       12226544          31.950         W   9002   
...              ...      ...            ...             ...       ...    ...   
59049        3549150        0       14898396         335.000         W   2616   
59050        3497600        0       13371647          20.950         W   3277   
59051        3575755        0       15775109          59.000         W   9486   
59052        3521246        0       14071016          25.950         W   2392   
59053        3483662        0       13012549          59.000        

In [106]:
num_cols

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1',
       'card2', 'card3', 'card5', 'addr1', 'addr2',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=178)

In [135]:
fraud_data[num_cols] = fraud_data[num_cols].fillna(fraud_data[num_cols].mean().iloc[0]) # fills the missing values with mean


In [136]:
#Getting all Categorical column

cat_cols = fraud_data.select_dtypes(include ='object').columns
cat_cols

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain'], dtype='object')

In [165]:
fraud_data.isnull().sum()/ len(fraud_data) * 100


TransactionID                 0.0
isFraud                       0.0
TransactionDT                 0.0
TransactionAmt                0.0
card1                         0.0
                             ... 
P_emaildomain_yahoo.com.mx    0.0
P_emaildomain_yahoo.de        0.0
P_emaildomain_yahoo.es        0.0
P_emaildomain_yahoo.fr        0.0
P_emaildomain_ymail.com       0.0
Length: 250, dtype: float64

In [166]:
#get_dummies()
fraud_data = pd.get_dummies(fraud_data, columns=cat_cols)

KeyError: "None of [Index(['ProductCD', 'card4', 'card6', 'P_emaildomain'], dtype='object')] are in the [columns]"

In [167]:
fraud_data.C1

0          1
1          1
2          1
3          1
4          3
        ... 
59049      3
59050    118
59051      1
59052      4
59053      1
Name: C1, Length: 59054, dtype: int64

In [168]:
fraud_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,P_emaildomain_web.de,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com
0,2994681,0,242834,25.0,9803,583.0,150.0,226.0,269.0,87.0,...,0,0,0,0,1,0,0,0,0,0
1,3557242,0,15123000,117.0,7919,194.0,150.0,166.0,181.0,87.0,...,0,0,0,0,0,0,0,0,0,0
2,3327470,0,8378575,73.773,12778,500.0,185.0,224.0,284.0,60.0,...,0,0,0,0,0,0,0,0,0,0
3,3118781,0,2607840,400.0,12316,548.0,150.0,195.0,441.0,87.0,...,0,0,0,0,0,0,0,0,0,0
4,3459772,0,12226544,31.95,9002,453.0,150.0,226.0,264.0,87.0,...,0,0,0,0,1,0,0,0,0,0


In [173]:
# Separate input features and output feature

X = fraud_data.drop(columns = ['isFraud'])  #input features
Y = fraud_data.isFraud  #output features

In [177]:
fraud_data.TransactionAmt.max()

5279.95

In [183]:
#Standardization-/-Normalization
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data = scaled_features)
scaled_features.columns = X.columns
scaled_features.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,P_emaildomain_web.de,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com
0,-1.688548,-1.544958,-0.468203,-0.02194,1.412632,-0.286861,0.653753,-0.225982,0.077832,-0.099186,...,-0.021387,-0.022918,-0.009202,-0.004115,2.216281,-0.053413,-0.013649,-0.014839,-0.015399,-0.06253
1,1.615662,1.681426,-0.07354,-0.406928,-1.078794,-0.286861,-0.804662,-1.144356,0.077832,-0.099186,...,-0.021387,-0.022918,-0.009202,-0.004115,-0.451206,-0.053413,-0.013649,-0.014839,-0.015399,-0.06253
2,0.266093,0.21907,-0.258976,0.585989,0.881042,2.788641,0.605139,-0.069441,-10.788933,-0.099186,...,-0.021387,-0.022918,-0.009202,-0.004115,-0.451206,-0.053413,-0.013649,-0.014839,-0.015399,-0.06253
3,-0.959645,-1.032167,1.140478,0.491581,1.188468,-0.286861,-0.099761,1.569022,0.077832,-0.099186,...,-0.021387,-0.022918,-0.009202,-0.004115,-0.451206,-0.053413,-0.013649,-0.014839,-0.015399,-0.06253
4,1.043171,1.053404,-0.438389,-0.185621,0.580022,-0.286861,0.653753,-0.278162,0.077832,-0.082944,...,-0.021387,-0.022918,-0.009202,-0.004115,2.216281,-0.053413,-0.013649,-0.014839,-0.015399,-0.06253


In [187]:
#Splitting the data 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 42)

# X_train: independent feature data for training the model
# Y_train: dependent feature data for training the model
# X_test: independent feature data for testing the model; will be used to predict the target values
# Y_test: original target values of X_test; We will compare this values with our predicted values.
 
# test_size = 0.3: 30% of the data will go for test set and 70% of the data will go for train set


In [190]:
#Dealing with Imbalanced data
#Resampling Techniques - Oversample Minority class

from sklearn.utils import resample

In [193]:
train_data = pd.concat([X_train,Y_train], axis=1)
train_data

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com,isFraud
20911,3145586,3331547,107.95,1214,523.0,150.0,226.0,123.0,87.0,1,...,0,0,0,0,0,0,0,0,0,0
4756,3552524,14999631,59.00,10112,360.0,150.0,166.0,184.0,87.0,3,...,0,0,0,0,0,0,0,0,0,0
50447,3480494,12871254,39.00,1762,161.0,150.0,226.0,441.0,87.0,1,...,0,0,0,1,0,0,0,0,0,0
13104,3232758,5853279,113.00,17188,321.0,150.0,226.0,299.0,87.0,2,...,0,0,0,1,0,0,0,0,0,0
15845,3118652,2603679,50.00,2455,321.0,150.0,226.0,299.0,87.0,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,3559497,15200698,97.00,9480,170.0,150.0,226.0,123.0,87.0,1,...,0,0,0,0,0,0,0,0,0,0
38158,3126034,2838958,49.00,16709,275.0,150.0,224.0,264.0,87.0,14,...,0,0,0,1,0,0,0,0,0,0
860,3537687,14558280,156.00,8431,269.0,150.0,224.0,315.0,87.0,2,...,0,0,0,1,0,0,0,0,0,0
15795,3212955,5342137,117.00,12598,111.0,150.0,166.0,299.0,87.0,1,...,0,0,0,0,0,0,0,0,0,0


In [203]:
not_fraud = train_data[train_data.isFraud == 0]
fraud = train_data[train_data.isFraud == 1]
fraud

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com,isFraud
20265,3217422,5453332,100.000,14259,111.0,150.0,226.0,330.000000,87.000000,5,...,0,0,0,0,0,0,0,0,0,1
55428,3491743,13212882,82.000,2695,215.0,150.0,226.0,387.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
48677,3295602,7675362,82.171,1976,545.0,185.0,102.0,290.653939,86.806616,2,...,0,0,0,0,0,0,0,0,0,1
53522,3215504,5419064,422.500,10112,360.0,150.0,166.0,325.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
8600,3443380,11664175,500.000,11303,277.0,144.0,137.0,536.000000,96.000000,2,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37065,3490037,13188536,141.000,16083,555.0,150.0,226.0,299.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
1585,2991305,164197,59.000,7861,494.0,150.0,226.0,512.000000,87.000000,4,...,0,0,0,0,0,0,0,0,0,1
2612,3544407,14751756,500.000,6816,464.0,150.0,226.0,177.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
54886,3181016,4376025,300.000,10361,514.0,150.0,159.0,325.000000,87.000000,10,...,0,0,0,0,0,0,0,0,0,1


In [213]:
fraud_upsampled = resample(fraud, replace = True, n_samples = len(not_fraud), random_state = 27 )
fraud_upsampled

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com,isFraud
42596,3197116,4847294,49.672,11201,103.0,185.0,226.0,290.653939,86.806616,211,...,0,0,0,0,0,0,0,0,0,1
5678,3252526,6420106,200.000,10229,276.0,150.0,226.0,191.000000,87.000000,10,...,0,0,0,0,0,0,0,0,0,1
12442,3314774,8054678,440.000,6019,583.0,150.0,226.0,231.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
6154,3564009,15353032,303.950,10023,111.0,150.0,226.0,325.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1
46422,3100917,2219558,125.000,1333,170.0,150.0,195.0,330.000000,87.000000,2,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8838,3544952,14763726,117.000,7585,553.0,150.0,226.0,269.000000,87.000000,8,...,0,0,0,1,0,0,0,0,0,1
36290,3150068,3451657,10.494,9633,296.0,185.0,138.0,290.653939,86.806616,1,...,0,0,0,0,0,0,0,0,0,1
615,3454386,12052318,30.950,6598,111.0,150.0,195.0,264.000000,87.000000,162,...,0,0,0,0,0,0,0,0,0,1
42777,3287891,7427495,500.000,8406,264.0,150.0,226.0,441.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,1


In [214]:
upsampled = pd.concat([not_fraud,fraud_upsampled])
upsampled.isFraud.value_counts()

1    39942
0    39942
Name: isFraud, dtype: int64

In [None]:
#Resampling Technique - UnderSample majority class

In [229]:
not_fraud_downsampled = resample(not_fraud, replace = False, n_samples = len(fraud), random_state = 27)
not_fraud_downsampled

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com,isFraud
27327,3159853,3734559,1298.000,10616,583.0,150.0,226.0,231.000000,87.000000,2,...,0,0,0,0,0,0,0,0,0,0
20134,3063438,1669402,47.950,3666,555.0,150.0,226.0,170.000000,87.000000,17,...,0,0,0,0,0,0,0,0,0,0
28303,3547755,14853697,125.000,15775,481.0,150.0,102.0,330.000000,87.000000,5,...,0,0,0,0,0,0,0,0,0,0
38758,3487311,13116975,139.950,9480,170.0,150.0,226.0,337.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,0
12011,3097527,2159784,26.950,7005,111.0,150.0,226.0,310.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28840,3151080,3471241,250.000,18084,111.0,150.0,195.0,299.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,0
15549,3519051,14002785,57.950,15651,417.0,150.0,226.0,330.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,0
57856,3129135,2924184,6.821,7794,266.0,185.0,102.0,290.653939,86.806616,1,...,0,0,0,0,0,0,0,0,0,0
2026,3089609,2057358,100.000,6521,399.0,150.0,198.0,126.000000,87.000000,1,...,0,0,0,0,0,0,0,0,0,0


In [249]:
downsampled = pd.concat([not_fraud_downsampled, fraud])
downsampled.isFraud.value_counts()

1    1395
0    1395
Name: isFraud, dtype: int64

In [278]:
pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in c:\users\administrator.laptop-obs-47\anaconda3\lib\site-packages (0.7.0)
Note: you may need to restart the kernel to use updated packages.


In [281]:
#Generate Synthetic Samples
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 25, ratio = 1.0)
    

ImportError: cannot import name '_to_object_array' from 'sklearn.utils' (C:\Users\Administrator.LAPTOP-OBS-47\anaconda3\lib\site-packages\sklearn\utils\__init__.py)

In [283]:
X_train, Y_train = sm.fit_sample(X_train, Y_train)

NameError: name 'sm' is not defined

In [282]:
np.unique(Y_train,return_counts=True)

(array([0, 1], dtype=int64), array([39942,  1395], dtype=int64))

#### Conclusion
That's it for this notebook. We learned handling missing values, one hot encoding, standardization / normalization, what is imbalanced class and three techniques to deal with imbalanced classes.