# CREDIT CARD FRAUD DETECTION

> Credit Card Fraud Detection is a crucial machine learning project with
> profound implications. It aims to safeguard financial transactions by
> identifying fraudulent activities. Leveraging advanced algorithms and
> historical transaction data, this project analyzes patterns and anomalies in
> credit card usage. It offers a proactive defense against fraud, preventing
> financial losses for both cardholders and institutions.


IMPORTING IMPORTANT LIBRARIES

In [81]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pd.set_option('display.max_columns', None)

LOADING THE DATASET

In [82]:
df = pd.read_csv('./creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


### EXPLORATION ON THE DATASET

In [83]:
df.shape

(168486, 31)

DATAYPES

In [84]:
df.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class     float64
dtype: object

CHECKING FOR NULL VALUES

In [85]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

VERIFYING THE DATASET'S BALANCE

In [86]:
fraud_count = df["Class"].value_counts()
fraud_rate = 100*fraud_count/df.shape[0]
fraud_data = pd.concat([fraud_count, fraud_rate], axis=1).reset_index()
fraud_data.columns = ['Class', 'Count', 'Percentage']

fraud_data

Unnamed: 0,Class,Count,Percentage
0,0.0,168125,99.785739
1,1.0,360,0.213668


### DATA PREPARATION

In [87]:
df_fraud = df[df['Class']==1]
df_not_fraud = df[df['Class']==0]
df_not_fraud_sampled = df_not_fraud.sample(df_fraud.shape[0], replace=False, random_state=101)

df_balanced = pd.concat([df_not_fraud_sampled, df_fraud], axis=0).sample(frac=1, replace=False, random_state=101).reset_index().drop('index', axis=1)
df_balanced

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,16057.0,-0.596246,1.257064,1.955108,0.743800,0.651815,-0.235692,0.868243,-0.185393,0.424833,-0.174779,2.062328,-2.308946,1.118911,1.877837,-0.184930,-0.062550,0.175943,0.464428,0.975217,0.214845,-0.634021,-1.440349,-0.124137,-0.139307,0.093783,-0.766471,0.115173,-0.097334,0.01,0.0
1,67078.0,1.140170,0.329140,0.204221,1.126954,0.010023,-0.371180,0.206333,-0.003628,-0.455069,0.204202,1.685107,0.951845,-0.308274,0.735976,0.289052,-0.188459,-0.262835,-0.316301,-0.483101,-0.175614,0.064271,0.202669,-0.052867,0.215435,0.584600,-0.342454,0.012991,0.006017,9.99,0.0
2,68961.0,1.307549,1.357103,-1.914968,1.440154,1.340693,-1.326348,0.831450,-0.394173,-0.552049,-1.580598,0.254922,-0.186002,1.082872,-3.642531,1.014370,1.229679,2.341371,1.474813,-0.529156,0.059428,-0.253205,-0.571617,-0.324952,-0.750591,1.005799,-0.245824,0.045889,0.090770,1.00,0.0
3,81466.0,1.372556,-1.490751,-0.267941,-1.822659,-0.772632,0.771978,-1.171349,0.258612,-1.722098,1.553980,-0.327605,-1.548293,-1.039796,0.180251,0.541186,0.232226,-0.024739,0.603920,0.411408,-0.224407,-0.455270,-1.251687,-0.040524,-1.823270,0.154253,-0.364068,0.017755,0.008739,102.00,0.0
4,102669.0,-5.603690,5.222193,-7.516830,8.117724,-2.756858,-1.574565,-6.330343,2.998419,-4.508167,-7.334377,7.188724,-10.655181,2.594680,-10.242859,-0.191158,-5.504334,-8.697777,-1.934225,1.958750,0.227526,1.242896,0.428408,-0.101184,-0.520199,-0.176938,0.461450,-0.106625,-0.479662,0.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,109096.0,-0.498484,0.807144,2.397165,-0.114586,0.123411,-0.279939,0.499367,-0.234155,1.556540,-1.136382,0.262073,-2.732961,1.378987,1.310860,0.677832,-0.321252,0.370055,0.465449,0.471503,0.035942,-0.306993,-0.433355,-0.295006,-0.124268,0.240131,-0.723843,-0.078326,-0.146917,2.12,0.0
716,72327.0,-4.198735,0.194121,-3.917586,3.920748,-1.875486,-2.118933,-3.614445,1.687884,-2.189871,-4.684233,4.376907,-5.007441,-1.304745,-6.192475,1.002404,-3.316934,-6.188834,-1.040413,1.233044,1.003350,0.801312,-0.183001,-0.440387,0.292539,-0.144967,-0.251744,1.249414,-0.131525,238.90,1.0
717,64585.0,1.080433,0.962831,-0.278065,2.743318,0.412364,-0.320778,0.041290,0.176170,-0.966952,-0.194120,2.140057,-0.276309,-1.191306,-1.880275,0.398272,1.367433,1.522662,1.026317,-1.577070,-0.172659,-0.008996,-0.057036,-0.053692,-0.026373,0.400300,0.072828,0.027043,0.063238,0.00,1.0
718,73750.0,-0.342666,1.143480,1.294908,0.059965,0.045369,-0.974048,0.730766,-0.085282,-0.426875,-0.518835,-0.180298,0.029813,0.327544,-0.470990,0.871374,0.374417,-0.005349,-0.162160,-0.090940,0.130025,-0.260963,-0.673105,-0.015029,0.334267,-0.152368,0.071457,0.246570,0.098778,4.49,0.0


VERIFYING BALANCE

In [88]:
fraud_count = df_balanced["Class"].value_counts()
fraud_rate = 100*fraud_count/df_balanced.shape[0]
fraud_data = pd.concat([fraud_count, fraud_rate], axis=1).reset_index()
fraud_data.columns = ['Class', 'Count', 'Percentage']

fraud_data

Unnamed: 0,Class,Count,Percentage
0,0.0,360,50.0
1,1.0,360,50.0


TRAIN-TEST-SPLIT

In [89]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced.drop('Class', axis=1), df_balanced['Class'], test_size=0.2, random_state=101)

print(f'''X_train: {X_train.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}''')

X_train: (576, 30)
X_test: (144, 30)
y_train: (576,)
y_test: (144,)


### FITTING A RANDOM FOREST MODEL
> Logistic model pipeline

In [90]:
randomForestModel = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

randomForestModel.fit(X_train, y_train)

PREDICTION ANALYSIS

In [91]:
y_pred_logis = randomForestModel.predict(X_test)
y_pred_logis

array([1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
       1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1.])

CLASSIFICATION REPORT

In [92]:
cr = classification_report(y_test, y_pred_logis)
print(cr)

              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97        82
         1.0       0.95      0.97      0.96        62

    accuracy                           0.97       144
   macro avg       0.96      0.97      0.96       144
weighted avg       0.97      0.97      0.97       144



SAVING THE MODEL

In [93]:
with open('./model.pkl', 'wb') as fp:
    pickle.dump(randomForestModel, fp)