In [1]:
import pandas as pd
import numpy as np

In [2]:
#Reading data
df = pd.read_csv("C:/Users/gaurm/Desktop/MACHINE LEARNING/INSAD/Fraud.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.shape

(6362620, 11)

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

In [5]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [6]:
#Changing columns name in lowercase
df.columns= df.columns.str.strip().str.lower()
df.columns

Index(['step', 'type', 'amount', 'nameorig', 'oldbalanceorg', 'newbalanceorig',
       'namedest', 'oldbalancedest', 'newbalancedest', 'isfraud',
       'isflaggedfraud'],
      dtype='object')

In [7]:
#Changing categorical value ointo numerical
df1 = df.copy(deep = 'true')
df1['type']=df1['type'].map({'PAYMENT':1 ,'TRANSFER':2, 'CASH_OUT':3, 'DEBIT':4, 'CASH_IN':5})
df1 = df1.drop(columns = ['nameorig','namedest'])
df1.head()

Unnamed: 0,step,type,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud,isflaggedfraud
0,1,1,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,2,181.0,181.0,0.0,0.0,0.0,1,0
3,1,3,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,1,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [8]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [9]:
#calculating VIf to check the MUlti-collinearity between variables -

vif_data = pd.DataFrame()                         
vif_data["feature"] = df1.columns
vif_data["VIF"] = [variance_inflation_factor(df1.values, i)
                          for i in range(len(df1.columns))]
print(vif_data)

          feature         VIF
0            step    2.468656
1            type    3.278499
2          amount    4.183290
3   oldbalanceorg  590.799060
4  newbalanceorig  599.771072
5  oldbalancedest   74.291723
6  newbalancedest   85.786786
7         isfraud    1.198013
8  isflaggedfraud    1.002676


In [10]:
x = df1.drop(columns=['isfraud'])
y = df1['isfraud']

In [11]:
y.value_counts()

0    6354407
1       8213
Name: isfraud, dtype: int64

In [12]:
#Balancing the data using SMOTE Technique
from imblearn.over_sampling import SMOTE

In [13]:
over_sample = SMOTE(random_state=0)
x,y = over_sample.fit_resample(x,y)
y.value_counts() 

0    6354407
1    6354407
Name: isfraud, dtype: int64

In [14]:
#Scaling values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30,random_state=0)

In [16]:
#Making logistic regression model
from sklearn.linear_model import LogisticRegression

In [17]:
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
#Predicting 
y_pred = log_reg.predict(x_test)

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
confusion_matrix(y_test,y_pred)

array([[1827272,   78703],
       [ 298980, 1607690]], dtype=int64)

In [20]:
accuracy_score(y_test, y_pred)

0.9009393741090502

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91   1905975
           1       0.95      0.84      0.89   1906670

    accuracy                           0.90   3812645
   macro avg       0.91      0.90      0.90   3812645
weighted avg       0.91      0.90      0.90   3812645

