<a href="https://www.kaggle.com/code/krippanandhini/credit-card-fraud-detection?scriptVersionId=163955930" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')
test_df = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')
df1 = pd.concat([df,test_df])

In [3]:
df1.shape

(1852394, 23)

In [4]:
df1.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,537193.4,4.17386e+17,70.06357,48813.26,38.53931,-90.22783,88643.67,1358674000.0,38.53898,-90.22794,0.005210015
std,366911.0,1.309115e+18,159.254,26881.85,5.07147,13.74789,301487.6,18195080.0,5.105604,13.75969,0.07199217
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02742,-166.6716,0.0
25%,231549.0,180042900000000.0,9.64,26237.0,34.6689,-96.798,741.0,1343017000.0,34.74012,-96.89944,0.0
50%,463098.0,3521417000000000.0,47.45,48174.0,39.3543,-87.4769,2443.0,1357089000.0,39.3689,-87.44069,0.0
75%,833575.8,4642255000000000.0,83.1,72042.0,41.9404,-80.158,20328.0,1374581000.0,41.95626,-80.24511,0.0
max,1296674.0,4.992346e+18,28948.9,99921.0,66.6933,-67.9503,2906700.0,1388534000.0,67.51027,-66.9509,1.0


In [5]:
df1 = df1.drop('Unnamed: 0', axis = 1)

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int64(5),

In [7]:
df1.isna().sum()

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [8]:
fraud_counts = df1['is_fraud'].value_counts()

fig = px.pie(names=fraud_counts.index, values=fraud_counts.values, title='Fraudulent Transactions Distribution')

fig.show()

In [9]:
le = LabelEncoder()
for column in df1.columns:
    if df1[column].dtype == 'object':
        df1[column] = le.fit_transform(df1[column])

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  int64  
 1   cc_num                 int64  
 2   merchant               int64  
 3   category               int64  
 4   amt                    float64
 5   first                  int64  
 6   last                   int64  
 7   gender                 int64  
 8   street                 int64  
 9   city                   int64  
 10  state                  int64  
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    int64  
 16  dob                    int64  
 17  trans_num              int64  
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int64(17)

In [11]:
X_train, X_test = train_test_split(df1,test_size = 0.2, random_state = 42)

In [12]:
isf = IsolationForest()
isf.fit(X_train)

In [13]:
X_test['y_pred'] = isf.predict(X_test)

In [14]:
X_test['y_pred'] = X_test['y_pred'].replace({-1: 1, 1: 0})
X_test.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,y_pred
244469,1514964,5359543825610251,285,2,59.91,245,144,1,185,62,...,-111.1439,18182,163,570,870220,1379488419,45.274075,-111.649432,0,0
434906,1702272,5540636818935089,284,12,3.96,203,141,1,338,440,...,-71.1605,76383,216,727,845611,1386265705,43.356278,-71.008959,0,0
354659,349469,2720894374956739,24,2,51.17,28,194,0,349,545,...,-82.8823,16305,374,7,1068756,1339759484,42.372483,-83.50802,0,0
197113,1468390,6011438889172900,531,12,2.06,8,5,0,421,204,...,-91.3336,5161,147,871,1174654,1377816625,33.833389,-91.158293,0,1
468148,460913,60495593109,318,13,6.58,274,109,1,454,194,...,-96.743,1263321,460,93,1134933,1343231435,32.458643,-96.577001,0,1


In [15]:
y_pred = X_test['y_pred']
is_fraud = X_test['is_fraud']

report = classification_report(is_fraud, y_pred)

print(report)

              precision    recall  f1-score   support

           0       1.00      0.71      0.83    368526
           1       0.02      0.94      0.03      1953

    accuracy                           0.71    370479
   macro avg       0.51      0.83      0.43    370479
weighted avg       0.99      0.71      0.83    370479

