# Practice ML, Online Payment Fraud

https://thecleverprogrammer.com/2022/02/22/online-payments-fraud-detection-with-machine-learning/#google_vignette

In [76]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [56]:
data = pd.read_csv('PS_20174392719_1491204439457_log.CSV')

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [58]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [59]:
data.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [60]:
type = data.type.value_counts()
transactions = type.index
quantity = type.values

In [61]:
figure = px.pie(data, 
            values=quantity, 
            names=transactions,hole = 0.5, 
            title="Distribution of Transaction Type")
figure.show()

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [63]:
correlation = data.corr()
correlation['isFraud'].sort_values(ascending=False)

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

In [64]:
data['type'] = data['type'].map({'CASH_OUT':1, 'PAYMENT':2, 'CASH_IN':3, 'TRANSFER':4, 'DEBIT':5})
data.type

0          2
1          2
2          4
3          1
4          2
          ..
6362615    1
6362616    4
6362617    1
6362618    4
6362619    1
Name: type, Length: 6362620, dtype: int64

In [67]:
data['isFraud'] = data.isFraud.map({0:'No Fraud', 1:'Fraud'})
data['isFraud']

0          No Fraud
1          No Fraud
2             Fraud
3             Fraud
4          No Fraud
             ...   
6362615       Fraud
6362616       Fraud
6362617       Fraud
6362618       Fraud
6362619       Fraud
Name: isFraud, Length: 6362620, dtype: object

In [70]:
x = np.array(data[['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']])
x

array([[2.00000000e+00, 9.83964000e+03, 1.70136000e+05, 1.60296360e+05],
       [2.00000000e+00, 1.86428000e+03, 2.12490000e+04, 1.93847200e+04],
       [4.00000000e+00, 1.81000000e+02, 1.81000000e+02, 0.00000000e+00],
       ...,
       [1.00000000e+00, 6.31140928e+06, 6.31140928e+06, 0.00000000e+00],
       [4.00000000e+00, 8.50002520e+05, 8.50002520e+05, 0.00000000e+00],
       [1.00000000e+00, 8.50002520e+05, 8.50002520e+05, 0.00000000e+00]])

In [75]:
y = np.array(data[['isFraud']])
y

array([['No Fraud'],
       ['No Fraud'],
       ['Fraud'],
       ...,
       ['Fraud'],
       ['Fraud'],
       ['Fraud']], dtype=object)

In [77]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=42)
xtrain, xtest, ytrain, ytest

(array([[1.00000000e+00, 1.32254090e+05, 5.98531000e+05, 4.66276910e+05],
        [3.00000000e+00, 7.11910500e+04, 1.53758019e+07, 1.54469929e+07],
        [4.00000000e+00, 8.71751200e+05, 4.77430000e+04, 0.00000000e+00],
        ...,
        [1.00000000e+00, 1.05791600e+04, 5.92790000e+04, 4.86998400e+04],
        [1.00000000e+00, 7.30207600e+04, 2.02890000e+04, 0.00000000e+00],
        [1.00000000e+00, 8.96988500e+04, 1.96071000e+05, 1.06372150e+05]]),
 array([[3.0000000e+00, 3.3021842e+05, 2.0866000e+04, 3.5108442e+05],
        [2.0000000e+00, 1.1647080e+04, 3.0370000e+04, 1.8722920e+04],
        [3.0000000e+00, 1.5226421e+05, 1.0658900e+05, 2.5885321e+05],
        ...,
        [2.0000000e+00, 1.1321010e+04, 0.0000000e+00, 0.0000000e+00],
        [2.0000000e+00, 2.7916800e+03, 5.8727000e+04, 5.5935320e+04],
        [3.0000000e+00, 4.6042024e+05, 3.2541549e+05, 7.8583573e+05]]),
 array([['No Fraud'],
        ['No Fraud'],
        ['No Fraud'],
        ...,
        ['No Fraud'],
     

In [79]:
model = DecisionTreeClassifier()
print(model.fit(xtrain, ytrain))
print(model.score(xtest,ytest))

DecisionTreeClassifier()
0.9997343861491021


In [81]:
# prediction
# features = [type, amount, oldbalanceOrg, newbalanceOrig]
features = np.array([[4, 9000.60, 9000.60, 0.0]])
print(model.predict(features))

['Fraud']
