In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [None]:
# Load both transaction CSVs and stack them into one frame for the analysis.
# NOTE(review): the variable names are swapped relative to the files they hold
# (`df` <- fraudTest.csv, `test_df` <- fraudTrain.csv). The names are kept so
# that any later cell referring to them keeps working.
df = pd.read_csv('/content/fraudTest.csv')
test_df = pd.read_csv('/content/fraudTrain.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two files overlap, so label-based lookups (.loc) and
# index alignment on df1 become ambiguous.
df1 = pd.concat([df, test_df], ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined frame.
df1.shape

(1396589, 23)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df1.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1396589.0,1396589.0,1396589.0,1396588.0,1396588.0,1396588.0,1396588.0,1396588.0,1396588.0,1396588.0,1396588.0
mean,363702.1,4.170572e+17,69.93579,48807.14,38.53846,-90.22398,88695.24,1357114000.0,38.53813,-90.22402,0.005016512
std,224914.8,1.308769e+18,160.9311,26882.51,5.06927,13.74678,301614.8,20507600.0,5.103601,13.75847,0.07064949
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02742,-166.6716,0.0
25%,174573.0,180040000000000.0,9.64,26237.0,34.6689,-96.798,743.0,1339566000.0,34.7457,-96.89885,0.0
50%,349147.0,3520550000000000.0,47.37,48174.0,39.3543,-87.4769,2443.0,1351174000.0,39.36981,-87.43675,0.0
75%,523720.0,4642255000000000.0,83.04,72011.0,41.8948,-80.158,20328.0,1378044000.0,41.95359,-80.23953,0.0
max,840869.0,4.992346e+18,28948.9,99921.0,66.6933,-67.9503,2906700.0,1388534000.0,67.51027,-66.95203,1.0


In [None]:
# Drop the 'Unnamed: 0' column (presumably a row counter written out with the
# CSV -- it carries no transaction information).
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
df1 = df1.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Inspect column dtypes and non-null counts of the combined frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396589 entries, 0 to 840869
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1396589 non-null  object 
 1   cc_num                 1396589 non-null  int64  
 2   merchant               1396589 non-null  object 
 3   category               1396589 non-null  object 
 4   amt                    1396589 non-null  float64
 5   first                  1396589 non-null  object 
 6   last                   1396589 non-null  object 
 7   gender                 1396589 non-null  object 
 8   street                 1396589 non-null  object 
 9   city                   1396588 non-null  object 
 10  state                  1396588 non-null  object 
 11  zip                    1396588 non-null  float64
 12  lat                    1396588 non-null  float64
 13  long                   1396588 non-null  float64
 14  city_pop           

In [None]:
# Number of missing values per column.
df1.isna().sum()

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     1
state                    1
zip                      1
lat                      1
long                     1
city_pop                 1
job                      1
dob                      1
trans_num                1
unix_time                1
merch_lat                1
merch_long               1
is_fraud                 1
dtype: int64

In [None]:
# Class balance of the target: number of rows per `is_fraud` value.
fraud_counts = df1['is_fraud'].value_counts()

# One pie slice per class, sized by its row count.
fig = px.pie(
    names=fraud_counts.index,
    values=fraud_counts.values,
    title='Fraudulent Transactions Distribution',
)
fig.show()

In [None]:
# Drop incomplete rows BEFORE encoding. Otherwise LabelEncoder turns a missing
# value in a text column into an ordinary category code, which hides the gap
# from later isna()/dropna() checks on those columns.
df1 = df1.dropna()

# Integer-encode every text (object-dtype) column. A separate fitted encoder is
# kept per column so the original strings can be recovered later with
# encoders[column].inverse_transform(...). Re-running this cell after encoding
# finds no object columns and therefore leaves `encoders` empty.
encoders = {}
for column in df1.columns:
    if df1[column].dtype == 'object':
        le = LabelEncoder()
        df1[column] = le.fit_transform(df1[column])
        encoders[column] = le

In [None]:
# Re-inspect dtypes after label encoding: the former object columns should now be integers.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396589 entries, 0 to 840869
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1396589 non-null  int64  
 1   cc_num                 1396589 non-null  int64  
 2   merchant               1396589 non-null  int64  
 3   category               1396589 non-null  int64  
 4   amt                    1396589 non-null  float64
 5   first                  1396589 non-null  int64  
 6   last                   1396589 non-null  int64  
 7   gender                 1396589 non-null  int64  
 8   street                 1396589 non-null  int64  
 9   city                   1396589 non-null  int64  
 10  state                  1396589 non-null  int64  
 11  zip                    1396588 non-null  float64
 12  lat                    1396588 non-null  float64
 13  long                   1396588 non-null  float64
 14  city_pop           

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# Both parts are slices of the full frame, so they still contain the `is_fraud` column.
X_train, X_test = train_test_split(
    df1,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Remove rows with NaN values so the model is fitted on complete rows only.
# (pandas is already imported in the first cell; no re-import is needed here.)
X_train = X_train.dropna()


In [None]:
isf = IsolationForest()
isf.fit(X_train)

In [None]:
# Remove rows with NaN values from test data
X_test = X_test.dropna()


In [None]:
X_test['y_pred'] = isf.predict(X_test)

In [None]:
# Translate the model's output into the label convention of `is_fraud`:
# -1 (flagged as anomaly) becomes 1, and 1 (inlier) becomes 0.
anomaly_to_label = {-1: 1, 1: 0}
X_test['y_pred'] = X_test['y_pred'].replace(anomaly_to_label)
X_test.head()


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,y_pred
319652,1141129,3562679455732797,562,7,48.47,10,222,0,725,664,...,-76.1424,3862.0,278,312,1232533,1382291000.0,39.956114,-76.442418,0.0,0
366637,1187435,341546199006537,241,4,132.84,228,47,1,843,825,...,-165.6723,145.0,9,64,1339280,1384080000.0,64.973119,-165.374471,0.0,1
542703,1359000,6506982560413523,492,6,16.36,5,228,0,54,786,...,-93.8765,36438.0,437,703,375997,1388270000.0,34.149407,-94.715764,0.0,1
304365,1126071,3506592072985012,658,7,44.39,271,306,1,32,45,...,-90.5255,92608.0,161,946,768572,1381703000.0,38.672269,-89.624817,0.0,0
372128,366639,3559160581764413,38,7,48.84,7,25,1,692,708,...,-78.6847,1453.0,475,521,183234,1340291000.0,41.365028,-78.072261,0.0,0


In [None]:
# Compare the model's flags with the ground-truth labels of the held-out rows.
is_fraud = X_test['is_fraud']
y_pred = X_test['y_pred']

# Per-class precision / recall / F1 as a plain-text table.
report = classification_report(y_true=is_fraud, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       1.00      0.71      0.83    277897
         1.0       0.01      0.84      0.03      1421

    accuracy                           0.71    279318
   macro avg       0.51      0.78      0.43    279318
weighted avg       0.99      0.71      0.83    279318

