# Fraudulent Transaction Prediction

In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Reading the dataset
df1 = pd.read_csv("C:\\Users\\Dell\\Downloads\\Fraud.csv")

# 1. Data Cleaning

In [4]:
# View the dataset
df1.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [5]:
df1.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1048570,95,CASH_OUT,132557.35,C1179511630,479803.0,347245.65,C435674507,484329.37,616886.72,0,0
1048571,95,PAYMENT,9917.36,C1956161225,90545.0,80627.64,M668364942,0.0,0.0,0,0
1048572,95,PAYMENT,14140.05,C2037964975,20545.0,6404.95,M1355182933,0.0,0.0,0,0
1048573,95,PAYMENT,10020.05,C1633237354,90605.0,80584.95,M1964992463,0.0,0.0,0,0
1048574,95,PAYMENT,11450.03,C1264356443,80584.95,69134.92,M677577406,0.0,0.0,0,0


In [6]:
# Finding the shape
df1.shape

(1048575, 11)

In [7]:
df1.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [8]:
df1.type.unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [9]:
df1.type.value_counts()

CASH_OUT    373641
PAYMENT     353873
CASH_IN     227130
TRANSFER     86753
DEBIT         7178
Name: type, dtype: int64

In [10]:
# Creating new dataframe
df2 = df1.drop(['nameOrig', 'nameDest'], axis = 'columns')
df2.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [11]:
# Checking for the NA values
df2.isnull().sum()

step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [12]:
np.min(df2['amount'])

0.1

In [13]:
np.max(df2['amount'])

10000000.0

In [14]:
# Getting Dataset Information
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1048575 non-null  int64  
 1   type            1048575 non-null  object 
 2   amount          1048575 non-null  float64
 3   oldbalanceOrg   1048575 non-null  float64
 4   newbalanceOrig  1048575 non-null  float64
 5   oldbalanceDest  1048575 non-null  float64
 6   newbalanceDest  1048575 non-null  float64
 7   isFraud         1048575 non-null  int64  
 8   isFlaggedFraud  1048575 non-null  int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 72.0+ MB


In [15]:
# Checking the no. of legit transactions and fraudulent transaction
# 0 represents the normal transactions and 1 represents the fraudulent transactions
df2['isFraud'].value_counts()

0    1047433
1       1142
Name: isFraud, dtype: int64

In [16]:
# Separating the dataset for analysis
legit = df2[df2.isFraud == 0]
fraud = df2[df2.isFraud == 1]

In [17]:
print(legit.shape)
print(fraud.shape)

(1047433, 9)
(1142, 9)


In [18]:
# Statistical measurement of the dataset
legit.amount.describe()

count    1.047433e+06
mean     1.575397e+05
std      2.541883e+05
min      1.000000e-01
25%      1.213487e+04
50%      7.621497e+04
75%      2.134928e+05
max      6.419835e+06
Name: amount, dtype: float64

In [19]:
fraud.amount.describe()

count    1.142000e+03
mean     1.192629e+06
std      2.030599e+06
min      1.190000e+02
25%      8.607017e+04
50%      3.531794e+05
75%      1.248759e+06
max      1.000000e+07
Name: amount, dtype: float64

In [20]:
# Compare the values for both the transactions
df2.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26.942944,157539.7,873633.8,894746.39508,978732.769117,1114237.0,0.0
1,48.272329,1192629.0,1218636.0,33944.321208,452866.124527,1077940.0,0.0


# Under Sampling

In [None]:
# Build a sample dataset containing similar distribution of 
# normal transaction and fraudulent transaction

In [None]:
# No. of fraudulent transaction --> 8213

In [20]:
legit_sample = legit.sample(n=8213)

In [21]:
# Concatinating two dataframes
new_dataset = pd.concat([legit_sample, fraud], axis = 0)

In [22]:
# View the new_dataset
new_dataset.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
981021,44,PAYMENT,15490.2,60011.0,44520.8,0.0,0.0,0,0
870525,42,PAYMENT,5069.31,153.0,0.0,0.0,0.0,0,0
686471,36,CASH_OUT,315764.89,7.0,0.0,0.0,315764.89,0,0
655140,35,PAYMENT,7652.42,327941.48,320289.06,0.0,0.0,0,0
494556,20,CASH_OUT,15581.74,60644.37,45062.63,204709.0,0.0,0,0


In [23]:
new_dataset.tail()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1047888,95,CASH_OUT,56745.14,56745.14,0.0,51433.88,108179.02,1,0
1048221,95,TRANSFER,33676.59,33676.59,0.0,0.0,0.0,1,0
1048222,95,CASH_OUT,33676.59,33676.59,0.0,0.0,33676.59,1,0
1048323,95,TRANSFER,87999.25,87999.25,0.0,0.0,0.0,1,0
1048324,95,CASH_OUT,87999.25,87999.25,0.0,0.0,87999.25,1,0


In [24]:
# Checking the nos. of each data points in isFraud Category
new_dataset['isFraud'].value_counts()

0    8213
1    1142
Name: isFraud, dtype: int64

In [26]:
# Grouping the new_dataset and finding its mean
new_dataset.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26.872641,153551.3,897153.3,919806.783173,941678.634259,1070674.0,0.0
1,48.272329,1192629.0,1218636.0,33944.321208,452866.124527,1077940.0,0.0


# 2. Splitting the dataset into features and target to make a machine learning model of Fraud Detection of the transactions that are available on our dataset.

In [35]:
x = new_dataset.drop(columns=['isFraud','type'], axis = 1)
y = new_dataset['isFraud']

In [36]:
print(x)

         step     amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
981021     44   15490.20       60011.00        44520.80            0.00   
870525     42    5069.31         153.00            0.00            0.00   
686471     36  315764.89           7.00            0.00            0.00   
655140     35    7652.42      327941.48       320289.06            0.00   
494556     20   15581.74       60644.37        45062.63       204709.00   
...       ...        ...            ...             ...             ...   
1047888    95   56745.14       56745.14            0.00        51433.88   
1048221    95   33676.59       33676.59            0.00            0.00   
1048222    95   33676.59       33676.59            0.00            0.00   
1048323    95   87999.25       87999.25            0.00            0.00   
1048324    95   87999.25       87999.25            0.00            0.00   

         newbalanceDest  isFlaggedFraud  
981021             0.00               0  
870525         

In [37]:
print(y)

981021     0
870525     0
686471     0
655140     0
494556     0
          ..
1047888    1
1048221    1
1048222    1
1048323    1
1048324    1
Name: isFraud, Length: 9355, dtype: int64


# 3. Selecting variables that are to be included in our model

In [53]:
# Split the data into Training Data & Testing Data

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [39]:
# Finding the shape of x, x_train and x_test
print(x.shape, x_train.shape, x_test.shape)

(9355, 7) (7484, 7) (1871, 7)


# 4. Training the model using Logistic Regression 

In [40]:
# Using Logistic Regression for Binary Classification
model = LogisticRegression()

In [41]:
# Training the Logistic Regression model with training data
model.fit(x_train, y_train)

LogisticRegression()

# 5. Model Evaluation based on the accuracy score for predicting fraudulent customer

In [48]:
# Accuracy on training data
x_train_predict = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_predict, y_train)

In [55]:
print("Accuracy score on training data: ", training_data_accuracy)

Accuracy score on training data:  0.9560395510422234


In [52]:
# Hence the accuracy score is 95.6 percent which is the very good accuracy score so 
# the prediction are good

In [53]:
# Accuracy on test data
x_test_predict = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_predict, y_test)

In [56]:
print("Accuracy score on test data : ", test_data_accuracy)

Accuracy score on test data :  0.9518973810796365


In [57]:
# Since there is no much difference between the accuracy score of test and training data
# so we can say that our model is not over fitted or under fitted

# 6. These factors do make sense because there is too less difference between the training data accuracy and test data accuracy.

# 7. The key to prevention is to detect it right at the stage of origination on a real time basis.  Machine learning (ML) algorithms offer an effective counter for fraud detection and prevention. Based on the learning from the historical patterns in data, current sets of transactions can be analysed before lending companies decide to proceed with a particular application.

# 8. As i stated earlier that by the historical patterns in data we can assume that we have to detect the problems on real time basis and for this our machine learning model is enough to determine that which type of transactions should be needed to prevent.