Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Data Collection and Processing

In [3]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [4]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
69410,53411,1.106584,-0.925436,0.661483,-1.66404,-1.075488,0.235442,-1.022176,0.343909,1.810465,...,0.302333,0.839735,-0.195164,-0.615158,0.279175,0.15156,0.04794,0.019442,68.02,0.0
69411,53411,-1.009202,-0.429729,2.109263,-0.718647,-0.389684,-0.038644,0.08958,0.31482,0.790871,...,-0.021664,-0.168552,0.282626,0.111297,-0.492715,0.779755,0.023864,0.133686,105.13,0.0
69412,53411,1.194319,0.184864,0.380408,0.373276,-0.018658,0.005934,-0.078417,0.062905,-0.296652,...,-0.190932,-0.524439,0.103525,-0.295192,0.18003,0.124729,-0.012819,0.003846,3.58,0.0
69413,53411,0.898915,-0.209692,0.537196,1.56296,-0.747575,-0.824138,0.249155,-0.168179,0.248011,...,-0.103293,-0.539026,-0.064167,0.697779,0.41465,-0.532499,-0.002198,0.054169,154.42,0.0
69414,53411,1.356225,-0.322399,0.450949,-0.345147,-0.784041,-0.434317,-0.485366,-0.038548,-1.212064,...,,,,,,,,,,


In [6]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69415 entries, 0 to 69414
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    69415 non-null  int64  
 1   V1      69415 non-null  float64
 2   V2      69415 non-null  float64
 3   V3      69415 non-null  float64
 4   V4      69415 non-null  float64
 5   V5      69415 non-null  float64
 6   V6      69415 non-null  float64
 7   V7      69415 non-null  float64
 8   V8      69415 non-null  float64
 9   V9      69415 non-null  float64
 10  V10     69415 non-null  float64
 11  V11     69415 non-null  float64
 12  V12     69415 non-null  float64
 13  V13     69415 non-null  float64
 14  V14     69414 non-null  float64
 15  V15     69414 non-null  float64
 16  V16     69414 non-null  float64
 17  V17     69414 non-null  float64
 18  V18     69414 non-null  float64
 19  V19     69414 non-null  float64
 20  V20     69414 non-null  float64
 21  V21     69414 non-null  float64
 22

In [7]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [8]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,69241
1.0,173


This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [9]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [10]:
print(legit.shape)
print(fraud.shape)

(69241, 31)
(173, 31)


In [11]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,69241.0
mean,96.985632
std,271.371862
min,0.0
25%,7.68
50%,26.5
75%,88.5
max,19656.53


In [12]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,173.0
mean,95.72237
std,221.11352
min,0.0
25%,1.0
50%,7.58
75%,99.99
max,1809.68


In [13]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,34530.112737,-0.227869,-0.033289,0.706865,0.155935,-0.258839,0.107358,-0.097214,0.048905,0.030613,...,0.046047,-0.030834,-0.106006,-0.037728,0.006142,0.136117,0.021729,0.001667,0.003137,96.985632
1.0,30035.913295,-6.933421,5.000911,-9.340224,5.397284,-5.161426,-2.120226,-7.252875,3.426366,-3.228419,...,0.429221,0.836306,-0.201712,-0.245147,-0.072067,0.249135,0.109489,0.565158,0.041636,95.72237


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 173

In [15]:
legit_sample = legit.sample(n=173)

Concatenating two DataFrames

In [16]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [17]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
46824,42945,1.201245,-0.599558,0.604505,-0.807314,-0.434032,0.82305,-0.858524,0.365723,-0.860695,...,0.046794,0.191504,0.299998,-0.639175,-0.112026,-0.372801,0.085434,0.011683,6.41,0.0
34764,37815,-3.115607,2.350526,-1.168778,-1.069185,0.655562,4.63667,-3.452756,-2.939704,-1.259507,...,-2.524628,0.275669,0.246702,0.978907,0.17534,0.229465,-0.338795,-0.08084,54.67,0.0
9359,13645,-1.358037,0.500959,1.958296,-0.771024,0.921045,0.863956,0.615123,0.140063,1.797511,...,-0.185195,0.012931,0.140723,-0.501701,-0.407839,-0.0688,-0.468248,-0.1226,9.99,0.0
72,47,1.197839,0.236828,0.509605,0.657659,-0.365488,-0.745101,0.079497,-0.130536,-0.052263,...,-0.149876,-0.374211,0.145516,0.414984,0.212156,0.181724,-0.016402,0.015994,3.63,0.0
19544,30368,-0.692024,1.260358,0.889543,0.121243,-0.443328,-1.056275,0.288577,0.433173,-0.348044,...,-0.233107,-0.81857,0.079555,0.29264,-0.181515,0.078915,0.107603,0.033855,12.96,0.0


In [18]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
64460,51155,-11.205461,7.914633,-13.987752,4.333341,-8.48497,-3.506561,-8.935243,7.704449,-2.336584,...,0.942593,-0.987848,-0.279446,-0.027299,0.644344,-0.263078,1.084023,0.211933,99.99,1.0
68067,52814,-1.101847,-1.632441,0.901067,0.847753,-1.249091,0.654937,1.448868,0.023308,-0.136742,...,0.610654,0.835795,1.179955,-0.029091,-0.300896,0.699175,-0.336072,-0.177587,519.9,1.0
68320,52934,1.036639,0.407227,0.757706,3.161821,-0.568122,0.202181,-0.689804,0.41138,0.336769,...,-0.050108,0.123761,-0.132568,0.350231,0.507701,0.189621,0.061016,0.063141,0.76,1.0
68522,53031,0.206075,1.38736,-1.045287,4.228686,-1.647549,-0.180897,-2.943678,0.859156,-1.181743,...,0.469199,0.34493,-0.203799,0.37664,0.715485,0.226003,0.628545,0.319918,0.76,1.0
68633,53076,1.296231,0.417447,0.193963,0.901644,0.130531,-0.371634,0.158126,-0.202669,-0.079512,...,-0.112114,-0.220002,-0.121022,-0.440454,0.67154,-0.413518,0.032838,0.0206,1.18,1.0


In [19]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,173
1.0,173


In [20]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,34198.83237,0.020329,-0.00043,0.625879,0.102192,-0.327481,0.053088,-0.081646,-0.055824,-0.055593,...,0.103531,-0.106204,-0.137687,-0.023064,0.014128,0.143148,0.094142,-0.028233,0.029787,98.687803
1.0,30035.913295,-6.933421,5.000911,-9.340224,5.397284,-5.161426,-2.120226,-7.252875,3.426366,-3.228419,...,0.429221,0.836306,-0.201712,-0.245147,-0.072067,0.249135,0.109489,0.565158,0.041636,95.72237


Splitting the data into Features & Targets

In [21]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [22]:
print(X)

        Time         V1        V2         V3        V4        V5        V6  \
46824  42945   1.201245 -0.599558   0.604505 -0.807314 -0.434032  0.823050   
34764  37815  -3.115607  2.350526  -1.168778 -1.069185  0.655562  4.636670   
9359   13645  -1.358037  0.500959   1.958296 -0.771024  0.921045  0.863956   
72        47   1.197839  0.236828   0.509605  0.657659 -0.365488 -0.745101   
19544  30368  -0.692024  1.260358   0.889543  0.121243 -0.443328 -1.056275   
...      ...        ...       ...        ...       ...       ...       ...   
64460  51155 -11.205461  7.914633 -13.987752  4.333341 -8.484970 -3.506561   
68067  52814  -1.101847 -1.632441   0.901067  0.847753 -1.249091  0.654937   
68320  52934   1.036639  0.407227   0.757706  3.161821 -0.568122  0.202181   
68522  53031   0.206075  1.387360  -1.045287  4.228686 -1.647549 -0.180897   
68633  53076   1.296231  0.417447   0.193963  0.901644  0.130531 -0.371634   

             V7        V8        V9  ...       V20       V21   

In [23]:
print(Y)

46824    0.0
34764    0.0
9359     0.0
72       0.0
19544    0.0
        ... 
64460    1.0
68067    1.0
68320    1.0
68522    1.0
68633    1.0
Name: Class, Length: 346, dtype: float64


Split the data into Training data & Testing Data

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(346, 30) (276, 30) (70, 30)


Model Training

Logistic Regression

In [26]:
model = LogisticRegression()

In [27]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [28]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [29]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9528985507246377


In [30]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [31]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.8714285714285714
