# Importing the Dependencies

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Loading the dataset into pandas Dataframe

In [6]:
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [7]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [8]:
 credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
31775,36510,1.497714,-0.874208,0.004261,-1.537368,-1.044513,-0.733905,-0.644635,-0.18651,-2.380495,...,-0.527761,-1.248016,0.145825,-0.061193,0.250252,-0.526898,0.003761,0.00795,24.9,0.0
31776,36512,-0.407854,0.940336,1.533733,-0.018123,-0.2194,-0.934602,0.597172,0.02461,-0.15185,...,-0.24757,-0.72583,-0.006221,0.322366,-0.249413,0.076587,0.25538,0.11734,1.79,0.0
31777,36513,-0.434226,-3.705556,0.194223,0.139216,-2.066818,1.220137,-0.278115,0.281452,-0.161302,...,0.620431,0.040315,-0.690176,-0.226844,-0.123369,-0.291412,-0.080514,0.155457,861.12,0.0
31778,36513,-0.274278,1.004565,1.33643,-0.164777,0.404935,-0.418731,0.853252,-0.148772,-0.546969,...,-0.277437,-0.579135,-0.097793,-0.037926,-0.157882,0.044814,0.078376,-0.137035,9.72,0.0
31779,36513,1.295137,0.096775,0.348134,0.629896,-0.1727,-0.15435,,,,...,,,,,,,,,,


Dataset Info

In [9]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31780 entries, 0 to 31779
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    31780 non-null  int64  
 1   V1      31780 non-null  float64
 2   V2      31780 non-null  float64
 3   V3      31780 non-null  float64
 4   V4      31780 non-null  float64
 5   V5      31780 non-null  float64
 6   V6      31780 non-null  float64
 7   V7      31779 non-null  float64
 8   V8      31779 non-null  float64
 9   V9      31779 non-null  float64
 10  V10     31779 non-null  float64
 11  V11     31779 non-null  float64
 12  V12     31779 non-null  float64
 13  V13     31779 non-null  float64
 14  V14     31779 non-null  float64
 15  V15     31779 non-null  float64
 16  V16     31779 non-null  float64
 17  V17     31779 non-null  float64
 18  V18     31779 non-null  float64
 19  V19     31779 non-null  float64
 20  V20     31779 non-null  float64
 21  V21     31779 non-null  float64
 22

check number of missing value in each column

In [10]:
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,1
V8,1
V9,1


Check distribution of legit transcations & fraudulent transactions

In [11]:
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,31677
1.0,102


This dataset is highly unbalanced

0 --> legit transaction
1 --> fraudulent transaction

In [12]:
#seprating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [14]:
print(legit.shape)
print(fraud.shape)

(31677, 31)
(102, 31)


In [16]:
#stastical measure of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,31677.0
mean,81.082407
std,223.072655
min,0.0
25%,6.87
50%,20.0
75%,73.61
max,7879.42


In [17]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,102.0
mean,91.237451
std,248.270971
min,0.0
25%,1.0
50%,3.44
75%,99.99
max,1809.68


In [18]:
#compare values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,22335.407141,-0.188381,0.085855,0.759238,0.192821,-0.192581,0.09593,-0.095538,0.019413,0.324683,...,0.041991,-0.03532,-0.118406,-0.039544,0.009212,0.133785,0.02214,0.01034,0.003013,81.082407
1.0,20334.872549,-7.761095,5.946853,-11.03523,5.926171,-5.529543,-2.295415,-7.767215,3.904551,-2.99729,...,0.679987,0.6419,-0.352131,-0.317568,-0.241053,0.314412,0.177056,0.811372,0.118708,91.237451


Under Sampling

Build Sample dataset containing similar distribution for legit transaction and Fraud transaction

Number of fraudral transaction : 102

In [20]:
legit_sample = legit.sample(n = 102)

Concating two dataFrames

In [25]:
new_dataset = pd.concat([legit_sample , fraud  ] , axis = 0)

In [26]:
print(new_dataset)

        Time        V1        V2        V3        V4        V5        V6  \
22269  32139  1.106814 -0.385705  1.300660  0.825234 -1.336160 -0.151016   
20062  30759  1.126807  0.528694  0.461908  2.241647  0.289584  0.379820   
16265  27662 -1.479250  1.041411  2.238715 -0.264967 -0.064540 -0.293190   
6095    6960 -3.741565  0.460549  1.835173  1.359940  0.696712  2.694758   
11337  19754  1.285225  1.335819 -1.671419  1.742738  1.073445 -1.520790   
...      ...       ...       ...       ...       ...       ...       ...   
30398  35906 -3.519030  4.140867 -3.628202  5.505672 -4.057463 -0.905945   
30442  35926 -3.896583  4.518355 -4.454027  5.547453 -4.121459 -1.163407   
30473  35942 -4.194074  4.382897 -5.118363  4.455230 -4.812621 -1.224645   
30496  35953 -4.844372  5.649439 -6.730396  5.252842 -4.409566 -1.740767   
31002  36170 -5.685013  5.776516 -7.064977  5.902715 -4.715564 -1.755633   

             V7        V8        V9  ...       V21       V22       V23  \
22269 -0.8568

In [27]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
22269,32139,1.106814,-0.385705,1.30066,0.825234,-1.33616,-0.151016,-0.856892,0.325004,0.980346,...,-0.046246,-0.098596,0.062366,0.513262,0.121376,0.293202,-0.002087,0.015911,17.0,0.0
20062,30759,1.126807,0.528694,0.461908,2.241647,0.289584,0.37982,0.046435,0.160497,-1.050933,...,-0.111698,-0.387126,0.043171,-0.335922,0.327473,-0.128915,-0.003164,0.007416,6.5,0.0
16265,27662,-1.47925,1.041411,2.238715,-0.264967,-0.06454,-0.29319,1.114375,-0.043867,-0.008313,...,-0.14463,-0.399288,-0.130856,0.580069,0.47919,-0.809354,-0.32042,0.055505,57.12,0.0
6095,6960,-3.741565,0.460549,1.835173,1.35994,0.696712,2.694758,1.651309,-2.041896,4.162809,...,-0.387728,1.559424,-0.597381,-0.893311,-1.007458,-0.563821,-5.107789,-2.194509,119.4,0.0
11337,19754,1.285225,1.335819,-1.671419,1.742738,1.073445,-1.52079,0.612086,-0.384398,1.008365,...,-0.404046,-0.856852,-0.247487,-0.525239,0.870562,-0.298888,0.006704,0.083566,1.0,0.0


In [28]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
30398,35906,-3.51903,4.140867,-3.628202,5.505672,-4.057463,-0.905945,-6.652031,2.634524,-4.679402,...,1.582556,0.77871,-0.135707,-0.004278,0.032706,0.362014,0.900925,0.554897,9.13,1.0
30442,35926,-3.896583,4.518355,-4.454027,5.547453,-4.121459,-1.163407,-6.805053,2.928356,-4.91713,...,1.691042,0.920021,-0.151104,0.011007,0.080303,0.412191,0.635789,0.50105,4.56,1.0
30473,35942,-4.194074,4.382897,-5.118363,4.45523,-4.812621,-1.224645,-7.281328,3.33225,-3.679659,...,1.550473,0.614573,0.028521,0.013704,-0.149512,-0.131687,0.473934,0.473757,14.46,1.0
30496,35953,-4.844372,5.649439,-6.730396,5.252842,-4.409566,-1.740767,-6.311699,3.449167,-5.416284,...,1.194888,-0.845753,0.190674,-0.216443,-0.325033,-0.270328,0.210214,0.391855,111.7,1.0
31002,36170,-5.685013,5.776516,-7.064977,5.902715,-4.715564,-1.755633,-6.958679,3.877795,-5.541529,...,1.128641,-0.96296,-0.110045,-0.177733,-0.089175,-0.049447,0.303445,0.21938,111.7,1.0


In [33]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,102
1.0,102


In [34]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,24808.372549,-0.173426,-0.241221,0.657203,0.410607,-0.387843,0.202989,-0.144874,0.012688,0.213187,...,-0.046845,-0.168041,-0.080634,-0.088366,-0.02944,0.174338,0.05085,0.005206,-0.089889,110.740294
1.0,20334.872549,-7.761095,5.946853,-11.03523,5.926171,-5.529543,-2.295415,-7.767215,3.904551,-2.99729,...,0.679987,0.6419,-0.352131,-0.317568,-0.241053,0.314412,0.177056,0.811372,0.118708,91.237451


Splitting data into training and testing

In [36]:
x = new_dataset.drop(columns = 'Class' , axis = 1)
y = new_dataset['Class']

In [37]:
print(x)

        Time        V1        V2        V3        V4        V5        V6  \
22269  32139  1.106814 -0.385705  1.300660  0.825234 -1.336160 -0.151016   
20062  30759  1.126807  0.528694  0.461908  2.241647  0.289584  0.379820   
16265  27662 -1.479250  1.041411  2.238715 -0.264967 -0.064540 -0.293190   
6095    6960 -3.741565  0.460549  1.835173  1.359940  0.696712  2.694758   
11337  19754  1.285225  1.335819 -1.671419  1.742738  1.073445 -1.520790   
...      ...       ...       ...       ...       ...       ...       ...   
30398  35906 -3.519030  4.140867 -3.628202  5.505672 -4.057463 -0.905945   
30442  35926 -3.896583  4.518355 -4.454027  5.547453 -4.121459 -1.163407   
30473  35942 -4.194074  4.382897 -5.118363  4.455230 -4.812621 -1.224645   
30496  35953 -4.844372  5.649439 -6.730396  5.252842 -4.409566 -1.740767   
31002  36170 -5.685013  5.776516 -7.064977  5.902715 -4.715564 -1.755633   

             V7        V8        V9  ...       V20       V21       V22  \
22269 -0.8568

In [38]:
print(y)

22269    0.0
20062    0.0
16265    0.0
6095     0.0
11337    0.0
        ... 
30398    1.0
30442    1.0
30473    1.0
30496    1.0
31002    1.0
Name: Class, Length: 204, dtype: float64


In [41]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , stratify=y , random_state = 2)

In [42]:
print(x.shape , x_train.shape , x_test.shape)

(204, 30) (163, 30) (41, 30)


Model Training

In [44]:
model = LogisticRegression()

In [46]:
#training the logistic regression Model with trained data
model.fit(x_train , y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy score

In [47]:
x_train_prediction = model.predict(x_train)
train_data_accuracy = accuracy_score(x_train_prediction , y_train)

In [48]:
print('Accuracy on training data :' ,train_data_accuracy)

Accuracy on training data : 0.9754601226993865


In [50]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [60]:
print('Accuracy on testing data:',test_data_accuracy)

Accuracy on testing data: 0.9512195121951219
