In [15]:
# Import dependencies
import numpy as np
import pandas as pd # DataFrames provide structured (tabular) data for analysis
from sklearn.model_selection import train_test_split # splits the data into training and test sets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # used to check the accuracy of the model
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the transaction log CSV into a pandas DataFrame
Dataset = pd.read_csv(filepath_or_buffer="PS_20174392719_1491204439457_log.csv")

In [16]:
# Encode the categorical `type` column as integer labels so it can be used as a model feature
le = LabelEncoder()
Dataset["type"] = le.fit_transform(Dataset["type"])

In [17]:
# Preview the first five rows of the dataset
Dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,3,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,3,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Preview the last five rows of the dataset
Dataset.tail(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [19]:
# Summary of the dataset: index range, column names, dtypes and memory usage
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            int64  
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 534.0+ MB


In [7]:
# Count the missing values in each column
Dataset.isna().sum()
# Every count is zero, so no imputation is needed; missing values would otherwise have to be imputed

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [8]:
# Class distribution: 0 -> legit transaction, 1 -> fraudulent transaction
Dataset["isFraud"].value_counts()
# 6354407 legit vs. 8213 fraudulent transactions: the data is heavily imbalanced, so a model
# trained on it as-is may rarely (or never) predict the fraudulent class

0    6354407
1       8213
Name: isFraud, dtype: int64

In [20]:
# Split the frame by class label so each class can be analysed on its own
legit = Dataset.loc[Dataset["isFraud"] == 0]  # legit transactions
fraud = Dataset.loc[Dataset["isFraud"] == 1]  # fraudulent transactions
# .shape is (number of rows, number of columns); one shape per line
print(legit.shape, fraud.shape, sep="\n")

(6354407, 11)
(8213, 11)


In [10]:
# Summary statistics of the transaction amount for legit transactions
legit["amount"].describe()

count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

In [11]:
# Summary statistics of the transaction amount for fraudulent transactions
fraud["amount"].describe()

count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64

In [13]:
# Compare the per-class mean of every numeric column (rows grouped by the isFraud label).
# numeric_only=True skips the object-dtype columns (nameOrig, nameDest): pandas >= 2.0 no longer
# drops them silently and raises a TypeError when asked to average them.
Dataset.groupby('isFraud').mean(numeric_only=True)

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.235663,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [21]:
# Under-sample the legit class so both classes end up equally represented.
# The sample size is read from the fraud frame rather than hard-coded (8213 in this dataset),
# and random_state pins the draw so a Restart & Run All selects the same rows every time.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [22]:
# Stack the under-sampled legit rows and the fraud rows into one frame (row-wise concatenation)
new_Dataset = pd.concat([legit_sample, fraud], axis="index")

In [23]:
# Preview the first five rows of the balanced dataset
new_Dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
570486,23,0,65254.58,C292923965,4856961.44,4922216.02,C1991949362,166176.5,100921.91,0,0
3933915,285,3,199.0,C1810471426,108172.0,107973.0,M984409622,0.0,0.0,0,0
662901,36,3,34991.79,C125190169,95676.0,60684.21,M1474854530,0.0,0.0,0,0
1653375,158,3,3806.59,C491792361,41056.0,37249.41,M680622643,0.0,0.0,0,0
2749406,212,1,102171.33,C430987688,0.0,0.0,C1238442065,1224270.31,1326441.64,0,0


In [24]:
# Preview the last five rows of the balanced dataset
new_Dataset.tail(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,1,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,4,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,1,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,4,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,1,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [9]:
# Confirm that both classes are now equally represented
new_Dataset["isFraud"].value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [25]:
# Separate the features (x) from the target label (y, either 0 or 1).
# isFraud is the target; nameOrig and nameDest are object-dtype name columns.
# All three are left out of x, every other column is kept as a feature.
x = new_Dataset.drop(columns=["isFraud", "nameOrig", "nameDest"])
y = new_Dataset["isFraud"]
print(x)

         step  type      amount  oldbalanceOrg  newbalanceOrig  \
570486     23     0    65254.58     4856961.44      4922216.02   
3933915   285     3      199.00      108172.00       107973.00   
662901     36     3    34991.79       95676.00        60684.21   
1653375   158     3     3806.59       41056.00        37249.41   
2749406   212     1   102171.33           0.00            0.00   
...       ...   ...         ...            ...             ...   
6362615   743     1   339682.13      339682.13            0.00   
6362616   743     4  6311409.28     6311409.28            0.00   
6362617   743     1  6311409.28     6311409.28            0.00   
6362618   743     4   850002.52      850002.52            0.00   
6362619   743     1   850002.52      850002.52            0.00   

         oldbalanceDest  newbalanceDest  isFlaggedFraud  
570486        166176.50       100921.91               0  
3933915            0.00            0.00               0  
662901             0.00          

In [23]:
# Show the target labels that correspond to the rows of x
print(y)

5631616    0
3643337    0
4202010    0
1121257    0
5522812    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


In [26]:
# Split features and labels into a training set (80%) and a test set (20%).
#   test_size=0.2  -> 20% of the rows go to X_test / Y_test, the remaining 80% to X_train / Y_train
#   stratify=y     -> keeps the class proportions of y similar in both splits
#   random_state=2 -> fixes the shuffle seed so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [27]:
# Shapes of the full feature set, the train/test feature splits and the training labels
print(*(part.shape for part in (x, X_train, X_test, Y_train)))

(16426, 8) (13140, 8) (3286, 8) (13140,)


In [40]:
# Display the labels of the held-out test split
Y_test

5776528    1
1910481    0
6271904    0
1290031    0
204956     0
          ..
1044989    1
3749432    1
5171586    1
2386396    0
2183605    0
Name: isFraud, Length: 3286, dtype: int64

In [30]:
# Train a logistic-regression classifier on the training split
# NOTE(review): the features are not scaled here; confirm the solver converges without warnings
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

LogisticRegression()

In [33]:
# Model evaluation: accuracy on the data the model was trained on
X_train_prediction = model.predict(X_train)  # predicted labels for the training features
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value is unchanged,
# but the documented order matters as soon as an asymmetric metric is swapped in.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training Data : ', training_data_accuracy)

Accuracy on Training Data :  0.9045662100456621


In [34]:
# Model evaluation: accuracy on the held-out test data
X_test_prediction = model.predict(X_test)  # predicted labels for the test features
# accuracy_score expects (y_true, y_pred)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
# The label previously read "Training Data", which mislabelled the test-set score
print('Accuracy on Test Data : ', test_data_accuracy)

Accuracy on Training Data :  0.8977480219111381


In [41]:
# Predict the label for a single transaction: the last row of the test features
SingleLinePredict = X_test.tail(n=1)
prediction = model.predict(X=SingleLinePredict)
print(prediction)

[0]


In [43]:
# Predict the labels for a small batch: the last ten rows of the test features
FilePredict = X_test.tail(n=10)
prediction = model.predict(X=FilePredict)
print(prediction)

[0 0 1 0 1 1 1 1 0 0]
