# Credit Card Fraud Detection

* Build a machine learning model to identify fraudulent credit card transactions.
* Preprocess and normalize the transaction data, handle class imbalance issues, and split the dataset into training and testing sets.
* Train a classification algorithm, such as logistic regression or random forests, to classify transactions as fraudulent or genuine.
* Evaluate the model's performance using metrics like precision, recall,and F1-score, and consider techniques like oversampling orundersampling for improving results.

In [1]:
#importing the required libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
credit_c=pd.read_csv('creditcard.csv')
credit_c.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
credit_c.shape

(284807, 31)

In [4]:
credit_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
credit_c.isna().any()

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

In [6]:
credit_c.isnull().any()

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

In [7]:
credit_c.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [8]:
credit_c['Class'].unique()

array([0, 1], dtype=int64)

In [9]:
credit_c['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

* 0 normal transaction 
* 1 fraud transaction

In [10]:
x=credit_c.drop('Class',axis=1)
y=credit_c['Class']

In [11]:
x.shape

(284807, 30)

In [12]:
y.shape

(284807,)

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.3,stratify=y)

In [14]:
x_train.shape

(199364, 30)

In [15]:
x_test.shape

(85443, 30)

In [16]:
y_train.shape

(199364,)

In [17]:
y_test.shape

(85443,)

In [18]:
y_train.value_counts()

0    199020
1       344
Name: Class, dtype: int64

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)

In [21]:
y_pred=rf.predict(x_test)

In [22]:
y_pre_train=rf.predict(x_train)

In [23]:
print(classification_report(y_train,y_pre_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       1.00      1.00      1.00       344

    accuracy                           1.00    199364
   macro avg       1.00      1.00      1.00    199364
weighted avg       1.00      1.00      1.00    199364



In [24]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.84      0.89       148

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [47]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score
accuracy=accuracy_score(y_train,y_pre_train)
precision=precision_score(y_train,y_pre_train)
recall=recall_score(y_train,y_pre_train)
f1_score=f1_score(y_train,y_pre_train)
print("The accuracy of training data is",accuracy)
print("The precision of training data is",precision)
print("The recall of training data is",recall)
print("The f1_score of training data is",f1_score)

The accuracy of training data is 0.9999949840492767
The precision of training data is 1.0
The recall of training data is 0.997093023255814
The f1_score of training data is 0.9985443959243085


In [48]:
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
print("The accuracy of testing data is",accuracy)
print("The precision of testing data is",precision)
print("The recall of testing data is",recall)

The accuracy of testing data is 0.999602073897218
The precision of testing data is 0.9253731343283582
The recall of testing data is 0.8378378378378378


In [26]:
params={
    'max_depth': [2,3],
    'min_samples_leaf': [5,10,15],
    'n_estimators': [15,18,20]
}

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
grid_search=GridSearchCV(estimator=rf,
                        param_grid=params,
                        cv=4,
                        scoring="accuracy")

In [29]:
grid_search.fit(x_train,y_train)

In [30]:
rf_best=grid_search.best_estimator_
rf_best

In [31]:
params={
    'max_depth': [4,6,8],
    'min_samples_leaf': [1,3,5],
    'n_estimators': [18]
}

In [32]:
grid_search=GridSearchCV(estimator=rf,
                        param_grid=params,
                        cv=4,
                        scoring="accuracy")

In [33]:
grid_search.fit(x_train,y_train)

In [34]:
rf_best=grid_search.best_estimator_
rf_best

In [35]:
#final parametrs
params={
    'max_depth': [8],
    'min_samples_leaf': [5],
    'n_estimators': [18]
}

In [36]:
grid_search=GridSearchCV(estimator=rf,
                        param_grid=params,
                        cv=4,
                        scoring="accuracy")

In [37]:
grid_search.fit(x_train,y_train)

In [38]:
rf_best=grid_search.best_estimator_
rf_best

In [39]:
y_pred=rf_best.predict(x_test)

In [40]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.93      0.84      0.88       148

    accuracy                           1.00     85443
   macro avg       0.96      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [41]:
y_pre_train=rf.predict(x_train)

In [42]:
print(classification_report(y_train,y_pre_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       1.00      1.00      1.00       344

    accuracy                           1.00    199364
   macro avg       1.00      1.00      1.00    199364
weighted avg       1.00      1.00      1.00    199364

