<a href="https://colab.research.google.com/github/Mona-Bhagat/Creditcard_frauddetection/blob/main/Creditcardfrauddetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries Used

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



# Code

In [8]:
# Load the credit-card transactions. In the Class column, 0 marks a regular
# transaction and 1 marks a fraudulent one.
ccfdf = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [9]:
# Preview the first five rows of the raw data
ccfdf.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [16]:
# Summarise the dataset: index range, per-column non-null counts and dtypes
ccfdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269312 entries, 0 to 269311
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    269312 non-null  float64
 1   V1      269312 non-null  float64
 2   V2      269311 non-null  float64
 3   V3      269311 non-null  float64
 4   V4      269311 non-null  float64
 5   V5      269311 non-null  float64
 6   V6      269311 non-null  float64
 7   V7      269311 non-null  float64
 8   V8      269311 non-null  float64
 9   V9      269311 non-null  float64
 10  V10     269311 non-null  float64
 11  V11     269311 non-null  float64
 12  V12     269311 non-null  float64
 13  V13     269311 non-null  float64
 14  V14     269311 non-null  float64
 15  V15     269311 non-null  float64
 16  V16     269311 non-null  float64
 17  V17     269311 non-null  float64
 18  V18     269311 non-null  float64
 19  V19     269311 non-null  float64
 20  V20     269311 non-null  float64
 21  V21     26

In [13]:
# Count the missing values in each column
ccfdf.isna().sum()

Time      0
V1        0
V2        1
V3        1
V4        1
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [15]:
# How many regular (0) and fraudulent (1) transactions the dataset contains
ccfdf.Class.value_counts()

0.0    268830
1.0       481
Name: Class, dtype: int64

Since the number of fraudulent cases is far smaller than the number of regular ones, this is a highly imbalanced dataset. Used as-is, it is not suitable for training the ML model. To overcome this drawback, we separate the data by class so that we can build a balanced training set.

In [20]:
# Split the frame by label: Class 0 rows are regular, Class 1 rows are fraud
regular = ccfdf.loc[ccfdf['Class'].eq(0)]
fraud = ccfdf.loc[ccfdf['Class'].eq(1)]

In [21]:
# Show (rows, columns) for each class, one per line
print(regular.shape, fraud.shape, sep='\n')


(268830, 31)
(481, 31)


In [22]:
# Summary statistics of the transaction amount for regular transactions
regular['Amount'].describe()

count    268830.000000
mean         89.452002
std         248.011506
min           0.000000
25%           5.990000
50%          22.700000
75%          78.897500
max       19656.530000
Name: Amount, dtype: float64

In [23]:
# Summary statistics of the transaction amount for fraudulent transactions
fraud['Amount'].describe()

count     481.000000
mean      121.239605
std       257.722080
min         0.000000
25%         1.000000
50%         8.640000
75%       104.810000
max      2125.870000
Name: Amount, dtype: float64

Building a subsample of regular transactions so that we have an equal number of transactions from the regular and the fraudulent datasets

In [31]:
# Undersample the regular transactions down to the number of fraud cases so
# that both classes end up the same size. The size is read from `fraud`
# instead of being hard-coded, and the draw is seeded so that re-running the
# notebook selects the same rows.
regular_sample = regular.sample(n=len(fraud), random_state=2)

In [33]:
# Stack the sampled regular rows on top of the fraud rows; concatenating along
# the row axis appends records instead of adding columns side by side.
new_dataset = pd.concat(objs=[regular_sample, fraud], axis='index')

In [34]:
# First five rows of the balanced dataset (taken from the regular sample)
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
245845,152932.0,1.919428,-0.900757,-2.838006,-0.425763,0.416044,-1.15944,0.836356,-0.68421,-1.279219,...,-0.006271,0.194557,-0.269786,0.575591,0.517702,0.97974,-0.158532,-0.053245,200.0,0.0
38740,39523.0,1.18739,-1.570869,0.496969,-1.004018,-1.833969,-0.645455,-0.838699,-0.191019,-1.463554,...,-0.35239,-0.825444,-0.028087,0.377465,0.275223,-0.348846,0.017449,0.050299,169.4,0.0
193830,130313.0,0.603269,-2.99675,-2.583475,-0.245412,1.015318,4.003766,-0.274257,0.819149,0.828488,...,0.613196,0.153614,-0.404657,0.792707,-0.577711,0.59544,-0.152484,0.070795,729.0,0.0
44017,41752.0,-1.726374,0.85666,-0.068672,0.23965,-1.512271,0.147056,1.582122,0.589325,-0.757667,...,0.260578,0.423151,0.154138,0.065408,-0.09614,-0.526573,-0.088517,-0.111597,360.0,0.0
229769,146043.0,1.862275,-0.427858,-1.182396,-0.082892,0.553052,0.97591,-0.275689,0.302763,0.753685,...,-0.032711,0.252668,0.243009,-1.313893,-0.271775,0.004813,0.026698,-0.077202,15.29,0.0


In [35]:
# Last five rows of the balanced dataset (taken from the fraud rows)
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
263080,160791.0,2.132386,0.705608,-3.530759,0.514779,1.527175,-1.716268,1.132791,-0.574214,0.128904,...,0.163739,0.70391,-0.245076,0.460049,0.920281,-0.216586,-0.026219,-0.025001,1.0,1.0
263274,160870.0,-0.644278,5.002352,-8.252739,7.756915,-0.216267,-2.751496,-3.358857,1.406268,-4.403852,...,0.587728,-0.605759,0.033746,-0.75617,-0.008172,0.532772,0.66397,0.192067,0.77,1.0
263324,160895.0,-0.84829,2.719882,-6.19907,3.044437,-3.30191,-1.992117,-3.734902,1.520079,-2.548788,...,1.125229,0.805258,0.199119,0.035206,0.012159,0.601658,0.137468,-0.171397,127.14,1.0
263877,161154.0,-3.387601,3.977881,-6.978585,1.657766,-1.1005,-3.599487,-3.686651,1.942252,-3.065089,...,1.043587,0.262189,-0.479224,-0.326638,-0.156939,0.113807,0.354124,0.287592,0.38,1.0
268375,163181.0,-5.238808,0.623013,-5.784507,1.678889,-0.364432,-0.477295,-4.276132,-0.695173,-2.971644,...,-0.32614,1.509239,-0.215966,-0.245727,0.893041,0.865758,0.854657,-0.964482,39.98,1.0


In [36]:
# Confirm that both classes now have the same number of rows
new_dataset.Class.value_counts()

0.0    481
1.0    481
Name: Class, dtype: int64

In [38]:
# Per-class mean of every column in the balanced sample. Comparing the regular
# (0.0) row with the full dataset is a quick check that undersampling has not
# noticeably changed the nature of the regular transactions.
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,91354.748441,0.173868,-0.041639,0.055028,-0.040553,-0.047587,-0.045751,0.060288,0.006676,0.068941,...,-0.027614,-0.023272,-0.017428,0.035438,0.031414,0.016852,0.059421,0.024733,0.017933,86.335239
1.0,78754.848233,-4.843897,3.709169,-7.124813,4.604331,-3.241169,-1.403691,-5.66008,0.613744,-2.620426,...,0.370551,0.737419,0.004536,-0.051545,-0.110216,0.047133,0.049161,0.164155,0.074541,121.239605


Splitting the data into Features & Targets

In [39]:
# Features: every column except the label. Target: the Class label itself.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.Class

In [41]:
# Print the feature matrix (every column of new_dataset except Class)
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
245845  152932.0  1.919428 -0.900757 -2.838006 -0.425763  0.416044 -1.159440   
38740    39523.0  1.187390 -1.570869  0.496969 -1.004018 -1.833969 -0.645455   
193830  130313.0  0.603269 -2.996750 -2.583475 -0.245412  1.015318  4.003766   
44017    41752.0 -1.726374  0.856660 -0.068672  0.239650 -1.512271  0.147056   
229769  146043.0  1.862275 -0.427858 -1.182396 -0.082892  0.553052  0.975910   
...          ...       ...       ...       ...       ...       ...       ...   
263080  160791.0  2.132386  0.705608 -3.530759  0.514779  1.527175 -1.716268   
263274  160870.0 -0.644278  5.002352 -8.252739  7.756915 -0.216267 -2.751496   
263324  160895.0 -0.848290  2.719882 -6.199070  3.044437 -3.301910 -1.992117   
263877  161154.0 -3.387601  3.977881 -6.978585  1.657766 -1.100500 -3.599487   
268375  163181.0 -5.238808  0.623013 -5.784507  1.678889 -0.364432 -0.477295   

              V7        V8        V9  .

In [42]:
# Print the target labels taken from the Class column
print(Y)

245845    0.0
38740     0.0
193830    0.0
44017     0.0
229769    0.0
         ... 
263080    1.0
263274    1.0
263324    1.0
263877    1.0
268375    1.0
Name: Class, Length: 962, dtype: float64


*Split the data into Training data & Testing Data*

---





In [43]:
# Hold out 20% of the balanced rows for testing. Stratifying on Y keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [44]:
# Shapes of the full feature matrix, the training part and the test part
print(*(frame.shape for frame in (X, X_train, X_test)))

(962, 30) (769, 30) (193, 30)


# Model Training with Logistic Regression

In [46]:
# Logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled before fitting (Time and Amount
# are on a much larger scale than V1..V28) — check whether fitting raises a
# ConvergenceWarning and consider scaling the inputs or raising max_iter.
model = LogisticRegression()

In [48]:
# Fit the classifier on the training features and their labels
model.fit(X=X_train, y=Y_train)

# Model Evaluation and Accuracy Score

In [49]:
# Accuracy on the training data: predict the rows the model was fitted on and
# compare with their true labels. accuracy_score expects (y_true, y_pred), so
# the true labels go first; keeping that order matters as soon as the metric
# is swapped for an asymmetric one such as precision or recall.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [50]:
# Show the accuracy measured on the rows the model was trained on
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9466840052015605


In [51]:
# Accuracy on the held-out test data: predict the unseen rows and compare with
# their true labels. accuracy_score expects (y_true, y_pred), so the true
# labels go first; keeping that order matters as soon as the metric is swapped
# for an asymmetric one such as precision or recall.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [52]:
# Show the accuracy measured on the held-out test rows
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9430051813471503
