In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go 
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import datetime


In [2]:
data = pd.read_csv('sample2.csv')

In [3]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,278,CASH_IN,330218.42,C632336343,20866.0,351084.42,C834976624,452419.57,122201.15,0,0
1,15,PAYMENT,11647.08,C1264712553,30370.0,18722.92,M215391829,0.0,0.0,0,0
2,10,CASH_IN,152264.21,C1746846248,106589.0,258853.21,C1607284477,201303.01,49038.8,0,0
3,403,TRANSFER,1551760.63,C333676753,0.0,0.0,C1564353608,3198359.45,4750120.08,0,0
4,206,CASH_IN,78172.3,C813403091,2921331.58,2999503.88,C1091768874,415821.9,337649.6,0,0


In [4]:
data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
data.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [6]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,243.52458,180259.3,840544.0,862318.4,1092142.0,1217457.0,0.0014,2e-05
std,142.989564,539490.1,2905490.0,2940672.0,3123411.0,3349252.0,0.037391,0.004472
min,1.0,0.92,0.0,0.0,0.0,0.0,0.0,0.0
25%,155.0,13580.08,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,75680.36,13844.5,0.0,143238.6,221087.3,0.0,0.0
75%,335.0,209243.4,107073.2,147192.1,963958.9,1129805.0,0.0,0.0
max,736.0,36973900.0,33593210.0,33748550.0,154013500.0,154241100.0,1.0,1.0


In [7]:
data2 = data.drop(['step'],  axis=1)

In [8]:
labels = list(data['isFraud'].value_counts().index)
values = list(data['isFraud'].value_counts().values)
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

In [9]:
labels = list(data['type'].value_counts().index)
values = list(data['type'].value_counts().values)
fig = go.Figure(data= [go.Bar(x = labels, y = values)])
fig.show()

In [48]:
data2

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,CASH_IN,330218.42,C632336343,20866.00,351084.42,C834976624,452419.57,122201.15,0,0
1,PAYMENT,11647.08,C1264712553,30370.00,18722.92,M215391829,0.00,0.00,0,0
2,CASH_IN,152264.21,C1746846248,106589.00,258853.21,C1607284477,201303.01,49038.80,0,0
3,TRANSFER,1551760.63,C333676753,0.00,0.00,C1564353608,3198359.45,4750120.08,0,0
4,CASH_IN,78172.30,C813403091,2921331.58,2999503.88,C1091768874,415821.90,337649.60,0,0
...,...,...,...,...,...,...,...,...,...,...
49995,PAYMENT,43675.82,C1782723064,0.00,0.00,M11629473,0.00,0.00,0,0
49996,CASH_OUT,161639.72,C78683855,5579.00,0.00,C1595714382,0.00,161639.72,0,0
49997,PAYMENT,6043.60,C1416720725,92825.74,86782.14,M1744355995,0.00,0.00,0,0
49998,CASH_OUT,105978.55,C2013621784,29984.00,0.00,C1173715119,0.00,105978.55,0,0


In [49]:
data2[data2['isFraud'] == 1]['amount'].describe()

count    7.000000e+01
mean     1.413000e+06
std      2.352488e+06
min      7.360150e+03
25%      1.511107e+05
50%      5.640288e+05
75%      1.459703e+06
max      1.000000e+07
Name: amount, dtype: float64

In [50]:
amt_type = data2.groupby('type')['amount'].mean().reset_index()

In [51]:
fig= px.bar(amt_type, x='type', y='amount')
fig.show()

In [52]:
data2.columns

Index(['type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [53]:
data3=data2.drop([ "isFlaggedFraud"], axis=1)

In [54]:
#Tallying the balance
def balance_diff(data3):
    '''balance_diff checks whether the money debited from sender has exactly credited to the receiver
       then it creates a new column which indicates 1 when there is a deviation else 0'''
    #Sender's balance
    orig_change=data3['newbalanceOrig']-data3['oldbalanceOrg']
    orig_change=orig_change.astype(int)
    for i in orig_change:
        if i<0:
            data3['orig_txn_diff']=round(data3['amount']+orig_change,2)
        else:
            data3['orig_txn_diff']=round(data3['amount']-orig_change,2)
    data3['orig_txn_diff']=data3['orig_txn_diff'].astype(int)
    data3['orig_diff'] = [1 if n !=0 else 0 for n in data3['orig_txn_diff']] 
    
    #Receiver's balance
    dest_change=data3['newbalanceDest']-data3['oldbalanceDest']
    dest_change=dest_change.astype(int)
    for i in dest_change:
        if i<0:
            data3['dest_txn_diff']=round(data3['amount']+dest_change,2)
        else:
            data3['dest_txn_diff']=round(data3['amount']-dest_change,2)
    data3['dest_txn_diff']=data3['dest_txn_diff'].astype(int)
    data3['dest_diff'] = [1 if n !=0 else 0 for n in data3['dest_txn_diff']] 
    
    data3.drop(['orig_txn_diff','dest_txn_diff'],axis=1,inplace = True)
    
#Surge indicator
def surge_indicator(data3):
    '''Creates a new column which has 1 if the transaction amount is greater than the threshold
    else it will be 0'''
    data3['surge']=[1 if n>450000 else 0 for n in data3['amount']]

#Frequency indicator
def frequency_receiver(data3):
    '''Creates a new column which has 1 if the receiver receives money from many individuals
    else it will be 0'''
    data3['freq_dest'] = data3['nameDest'].map(data3['nameDest'].value_counts())
    data3['freq_dest'] = data3['freq_dest'].apply(lambda x: 1 if x > 20 else 0)

#Tracking the receiver as merchant or not
def merchant(data3):
    '''We also have customer ids which starts with M in Receiver name, it indicates merchant
    this function will flag if there is a merchant in receiver end '''
    values = ['M']
    conditions = list(map(data3['nameDest'].str.contains, values))
    data3['merchant'] = np.select(conditions, '1', '0')

In [55]:
balance_diff(data3)

data3['orig_diff'].value_counts()
data3['dest_diff'].value_counts()

dest_diff
1    30116
0    19884
Name: count, dtype: int64

In [56]:
surge_indicator(data3)
data3['surge'].value_counts()

surge
0    46683
1     3317
Name: count, dtype: int64

In [57]:
frequency_receiver(data3)
data3['freq_dest'].value_counts()

freq_dest
0    50000
Name: count, dtype: int64

In [59]:
max_size = data3['isFraud'].value_counts().max()

#Balancing the target label
lst = [data3]
for class_index, group in data3.groupby('isFraud'):
    lst.append(group.sample(max_size-len(group), replace=True))
best = pd.concat(lst)

In [64]:
fig = go.Figure(data=[go.Pie(labels=['Not Fraud','Fraud'], values=best['isFraud'].value_counts())])
fig.show()

In [65]:
data3.head(300)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,orig_diff,dest_diff,surge,freq_dest
0,CASH_IN,330218.42,C632336343,20866.00,351084.42,C834976624,452419.57,122201.15,0,0,1,0,0
1,PAYMENT,11647.08,C1264712553,30370.00,18722.92,M215391829,0.00,0.00,0,1,1,0,0
2,CASH_IN,152264.21,C1746846248,106589.00,258853.21,C1607284477,201303.01,49038.80,0,0,1,0,0
3,TRANSFER,1551760.63,C333676753,0.00,0.00,C1564353608,3198359.45,4750120.08,0,1,0,1,0
4,CASH_IN,78172.30,C813403091,2921331.58,2999503.88,C1091768874,415821.90,337649.60,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,CASH_IN,141273.15,C80009629,24639886.52,24781159.68,C559367869,484939.60,219172.46,0,0,1,0,0
296,PAYMENT,4962.99,C1589115167,14752.00,9789.01,M1768343139,0.00,0.00,0,1,1,0,0
297,CASH_IN,242090.12,C264592116,17528542.66,17770632.78,C761150919,319311.17,77221.05,0,0,1,0,0
298,PAYMENT,1854.44,C1146107700,0.00,0.00,M1816242792,0.00,0.00,0,1,1,0,0


In [66]:
data3

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,orig_diff,dest_diff,surge,freq_dest
0,CASH_IN,330218.42,C632336343,20866.00,351084.42,C834976624,452419.57,122201.15,0,0,1,0,0
1,PAYMENT,11647.08,C1264712553,30370.00,18722.92,M215391829,0.00,0.00,0,1,1,0,0
2,CASH_IN,152264.21,C1746846248,106589.00,258853.21,C1607284477,201303.01,49038.80,0,0,1,0,0
3,TRANSFER,1551760.63,C333676753,0.00,0.00,C1564353608,3198359.45,4750120.08,0,1,0,1,0
4,CASH_IN,78172.30,C813403091,2921331.58,2999503.88,C1091768874,415821.90,337649.60,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,PAYMENT,43675.82,C1782723064,0.00,0.00,M11629473,0.00,0.00,0,1,1,0,0
49996,CASH_OUT,161639.72,C78683855,5579.00,0.00,C1595714382,0.00,161639.72,0,1,0,0,0
49997,PAYMENT,6043.60,C1416720725,92825.74,86782.14,M1744355995,0.00,0.00,0,1,1,0,0
49998,CASH_OUT,105978.55,C2013621784,29984.00,0.00,C1173715119,0.00,105978.55,0,1,0,0,0


In [67]:
type_mapping = {'PAYMENT': 1, 'TRANSFER': 2, 'CASH_IN': 3, 'CASH_OUT': 4, 'DEBIT': 5}

# Apply the mapping to create a new 'type_numeric' column
data3['type_numeric'] = data3['type'].map(type_mapping)

# Drop the original 'type' column
data3.drop(['type'], axis=1, inplace=True)

data3.head(40)

Unnamed: 0,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,orig_diff,dest_diff,surge,freq_dest,type_numeric
0,330218.42,C632336343,20866.0,351084.42,C834976624,452419.57,122201.15,0,0,1,0,0,3
1,11647.08,C1264712553,30370.0,18722.92,M215391829,0.0,0.0,0,1,1,0,0,1
2,152264.21,C1746846248,106589.0,258853.21,C1607284477,201303.01,49038.8,0,0,1,0,0,3
3,1551760.63,C333676753,0.0,0.0,C1564353608,3198359.45,4750120.08,0,1,0,1,0,2
4,78172.3,C813403091,2921331.58,2999503.88,C1091768874,415821.9,337649.6,0,0,1,0,0,3
5,915.13,C2002954533,0.0,0.0,M290849763,0.0,0.0,0,1,1,0,0,1
6,20603.87,C813757373,0.0,0.0,C823291717,558068.66,578672.53,0,1,0,0,0,4
7,58605.72,C1850864812,0.0,0.0,C618657299,585494.94,644100.66,0,1,0,0,0,4
8,4865.11,C886849972,0.0,0.0,M623175144,0.0,0.0,0,1,1,0,0,1
9,118131.63,C390714641,0.0,0.0,C366360355,8131691.35,8476246.86,0,1,1,0,0,4


In [34]:
#Splitting dependent and independent variable
data4=data3.copy()
X = data4.drop(['isFraud'], axis=1)
y=data4['isFraud']



In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [36]:
# Train-test split
X_train, X_test, y_train, y_test =  train_test_split(X, y, train_size=0.7, random_state=111)

#Standardizing the numerical columns
col_names=['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']
features_train = X_train[col_names]
features_test = X_test[col_names]
scaler = StandardScaler().fit(features_train.values)
features_train = scaler.transform(features_train.values)
features_test = scaler.transform(features_test.values)
X_train[col_names] = features_train
X_test[col_names] =features_test

In [37]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
#Tokenzation of customer name to get a unique id
tokenizer_org = tf.keras.preprocessing.text.Tokenizer()
tokenizer_org.fit_on_texts(X_train['nameOrig'])

tokenizer_dest = tf.keras.preprocessing.text.Tokenizer()
tokenizer_dest.fit_on_texts(X_train['nameDest'])

# Create tokenized customer lists
customers_train_org = tokenizer_org.texts_to_sequences(X_train['nameOrig'])
customers_test_org = tokenizer_org.texts_to_sequences(X_test['nameOrig'])

customers_train_dest = tokenizer_dest.texts_to_sequences(X_train['nameDest'])
customers_test_dest = tokenizer_dest.texts_to_sequences(X_test['nameDest'])


# Pad sequences
X_train['customers_org'] = tf.keras.preprocessing.sequence.pad_sequences(customers_train_org, maxlen=1)
X_test['customers_org'] = tf.keras.preprocessing.sequence.pad_sequences(customers_test_org, maxlen=1)

X_train['customers_dest'] = tf.keras.preprocessing.sequence.pad_sequences(customers_train_dest, maxlen=1)
X_test['customers_dest'] = tf.keras.preprocessing.sequence.pad_sequences(customers_test_dest, maxlen=1)

In [39]:
X_train=X_train.drop(['nameOrig','nameDest'],axis=1)
X_train = X_train.reset_index(drop=True)

X_test=X_test.drop(['nameOrig','nameDest'],axis=1)
X_test = X_test.reset_index(drop=True)

In [40]:
from sklearn.ensemble import RandomForestClassifier


In [41]:
algorithm2 = RandomForestClassifier()

In [43]:
model2 = algorithm2.fit(X_train, y_train)

In [44]:
prediction2 = model2.predict(X_test)

In [45]:
accuracy2 = accuracy_score(y_test, prediction2)

In [46]:
accuracy2

0.9994666666666666

In [47]:
X_train.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,orig_diff,dest_diff,surge,freq_dest,type_numeric,customers_org,customers_dest
0,-0.137559,-0.289257,-0.293249,-0.114035,-0.061761,1,1,0,0,2,1,39
1,0.202741,0.624213,0.71057,0.040111,-0.086387,0,1,0,0,3,2,902
2,-0.318157,-0.289257,-0.293249,-0.352797,-0.364718,1,1,0,0,1,3,903
3,-0.271553,-0.289257,-0.293249,1.29366,1.172954,1,0,0,0,4,4,904
4,-0.081529,-0.289257,-0.293249,-0.235232,-0.215655,1,0,0,0,4,5,905


In [41]:
import pickle

In [42]:
pickle.dump(model2, open("modelFinals.pkl", "wb"))

In [43]:
X_train.columns

Index(['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'orig_diff', 'dest_diff', 'surge', 'freq_dest',
       'type_numeric', 'customers_org', 'customers_dest'],
      dtype='object')

In [54]:
all_data = pd.concat([X_train, X_test])
all_labels = pd.concat([y_train, y_test])

# Make predictions on all data
all_predictions = model2.predict(all_data)

# Filter out the rows where the predicted label is fraud
fraudulent_rows = all_data[all_predictions == 1]

# Print out the fraudulent rows
print("Fraudulent Rows:")
print(fraudulent_rows)

Fraudulent Rows:
         amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
468    4.561479       0.659666       -0.293249       -0.352797   
972    0.818318      -0.067294       -0.293249       -0.338104   
992    1.775321       0.118566       -0.293249       -0.297855   
1201  -0.276027      -0.279828       -0.293249       -0.238147   
1795  -0.156840      -0.256680       -0.293249       -0.325246   
...         ...            ...             ...             ...   
11635  2.369280       0.233919       -0.293249       -0.352797   
13304  1.755094       0.114637       -0.293249       -0.297277   
13370  7.985853       1.324716       -0.293249       -0.313880   
13997  1.405873       0.046815       -0.293249       -0.171261   
14725  1.596918       0.083918       -0.293249       -0.352797   

       newbalanceDest  orig_diff  dest_diff  surge  freq_dest  type_numeric  \
468         -0.364718          1          1      1          0             2   
972         -0.163510          1