In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
credit_data = pd.read_csv("creditcard.csv")

In [3]:
fraud_data = pd.read_csv("PS_20174392719_1491204439457_log.csv")

In [4]:
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
fraud_data.groupby(["type", 'isFraud']).count()['step']

type      isFraud
CASH_IN   0          1399284
CASH_OUT  0          2233384
          1             4116
DEBIT     0            41432
PAYMENT   0          2151495
TRANSFER  0           528812
          1             4097
Name: step, dtype: int64

In [6]:
fraud_data['isTransfer'] = np.where(fraud_data['type'] == 'TRANSFER', 1, 0)
fraud_data['isCash_out'] = np.where(fraud_data['type'] == 'CASH_OUT', 1, 0)

In [7]:
fraud_data["dest_type"] = fraud_data.nameDest.astype(str).str[0]

In [9]:
fraud_data.groupby(["dest_type", 'isFraud']).agg({'amount': ['mean', 'std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,amount
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
dest_type,isFraud,Unnamed: 2_level_2,Unnamed: 3_level_2
C,0,262732.9,718535.4
C,1,1467967.0,2404253.0
M,0,13057.6,12556.45


In [10]:
fraud_data['balance_diff'] = fraud_data.newbalanceOrig - \
    fraud_data.oldbalanceOrg

In [13]:
groupby_data = fraud_data.groupby(['type', 'nameOrig'])['balance_diff'].mean()

In [12]:
fraud_data.groupby(['type', 'nameOrig'])['amount'].mean()

type      nameOrig   
CASH_IN   C1000002591    261877.19
          C1000003372     20528.65
          C1000003615     49360.77
          C1000005789     20339.32
          C1000006646    111717.99
                           ...    
TRANSFER  C999985598     231011.32
          C999986955     675316.04
          C999988902     377965.32
          C999989413      48713.45
          C999989921     901542.09
Name: amount, Length: 6359874, dtype: float64

In [15]:
fraud_data.merge(groupby_data, left_on=['type', 'nameOrig'], right_on=[
                 'type', 'nameOrig'], how='left')

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isTransfer,isCash_out,dest_type,balance_diff_x,balance_diff_y
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0,0,0,M,-9839.64,-9839.64
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0,0,0,M,-1864.28,-1864.28
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0,1,0,C,-181.00,-181.00
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0,0,1,C,-181.00,-181.00
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0,0,0,M,-11668.14,-11668.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0,0,1,C,-339682.13,-339682.13
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0,1,0,C,-6311409.28,-6311409.28
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0,0,1,C,-6311409.28,-6311409.28
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0,1,0,C,-850002.52,-850002.52
