Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/credit_card.csv')

In [3]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
128816,78851,-2.723981,2.153142,0.66505,1.634188,-1.738064,0.66944,-1.3488,2.025402,0.363866,...,-0.246869,-0.527553,0.130251,0.095408,0.140019,-0.378158,-0.083672,0.033943,21.99,0.0
128817,78852,1.017236,-0.376308,0.628701,1.329149,0.024834,1.73703,-0.527988,0.453694,0.950547,...,-0.407126,-0.818995,-0.1595,-1.321049,0.583299,-0.451452,0.07998,0.016907,70.57,0.0
128818,78852,1.185877,-0.11081,1.060111,1.237102,-0.824073,0.056132,-0.629817,0.172108,1.061852,...,-0.104999,-0.137867,-0.056196,-0.141476,0.450476,-0.374782,0.065924,0.031338,9.99,0.0
128819,78852,1.307395,-1.223997,0.278674,-1.414282,-1.431755,-0.528526,-0.871493,0.050584,-2.094747,...,-0.535638,-1.466298,0.30029,0.09946,-0.122834,-0.643175,0.007952,0.020318,75.2,0.0
128820,78852,-0.772608,0.88671,0.699519,1.757457,-0.504902,-0.963725,0.176063,0.15652,-0.654364,...,,,,,,,,,,


In [5]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128821 entries, 0 to 128820
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    128821 non-null  int64  
 1   V1      128821 non-null  float64
 2   V2      128821 non-null  float64
 3   V3      128821 non-null  float64
 4   V4      128821 non-null  float64
 5   V5      128821 non-null  float64
 6   V6      128821 non-null  float64
 7   V7      128821 non-null  float64
 8   V8      128821 non-null  float64
 9   V9      128821 non-null  float64
 10  V10     128821 non-null  float64
 11  V11     128821 non-null  float64
 12  V12     128821 non-null  float64
 13  V13     128821 non-null  float64
 14  V14     128821 non-null  float64
 15  V15     128821 non-null  float64
 16  V16     128820 non-null  float64
 17  V17     128820 non-null  float64
 18  V18     128820 non-null  float64
 19  V19     128820 non-null  float64
 20  V20     128820 non-null  float64
 21  V21     12

In [6]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [7]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,128559
1.0,261


This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [8]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [9]:
print(legit.shape)
print(fraud.shape)

(128559, 31)
(261, 31)


In [10]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,128559.0
mean,92.909269
std,251.540738
min,0.0
25%,6.5
50%,24.95
75%,83.19
max,19656.53


In [11]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,261.0
mean,116.679693
std,246.300626
min,0.0
25%,1.0
50%,11.38
75%,99.99
max,1809.68


In [12]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,49351.046259,-0.238744,-0.007156,0.691484,0.13934,-0.276329,0.085922,-0.104433,0.059177,-0.08367,...,0.042065,-0.039697,-0.115701,-0.034179,0.012188,0.130042,0.025932,-0.000691,0.002019,92.909269
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [13]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [14]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [15]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124752,77412,-1.039598,-0.115511,1.856568,0.825019,-2.219317,1.082903,1.819508,-0.386866,0.461548,...,-0.091,0.6179,0.454559,0.684661,-0.759186,0.285659,0.105532,-0.235725,410.04,0.0
80663,58591,1.085794,-0.350393,1.474352,1.684647,-1.086247,0.695354,-0.97572,0.42441,1.596005,...,0.057725,0.620561,-0.122463,0.096457,0.580172,-0.088565,0.101712,0.025771,2.0,0.0
4169,3749,1.414568,-0.661995,-0.907428,-1.621892,1.465019,3.272285,-1.179459,0.712131,0.360914,...,-0.282256,-0.894647,0.087275,0.91967,0.359542,-0.507652,-0.018637,0.014249,35.12,0.0
61600,49907,1.151897,0.363039,1.043242,2.522762,-0.587741,-0.321808,-0.282419,0.043651,-0.315436,...,0.133921,0.319213,-0.051721,0.373224,0.403154,0.099236,0.005843,0.028957,10.65,0.0
98805,66826,-1.947718,2.205942,1.024027,-0.307804,-0.300403,0.392007,-1.033177,-4.629127,-0.307216,...,4.410957,-2.016382,0.64647,0.233667,-0.095454,0.090194,0.47257,0.158928,2.68,0.0


In [16]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124087,77171,1.11856,1.291858,-1.298805,2.135772,0.772204,-1.147291,0.390578,-0.107072,-0.038339,...,-0.346374,-0.663588,-0.102326,0.017911,0.650302,-0.332366,0.105949,0.128124,1.0,1.0
124115,77182,-1.410852,2.268271,-2.297554,1.871331,0.248957,-1.208799,-1.358648,1.102916,-1.317364,...,0.155381,-0.61488,-0.196126,-0.464376,0.118473,-0.484537,0.373596,0.187657,1.0,1.0
124176,77202,-0.356326,1.435305,-0.813564,1.993117,2.055878,-0.543579,0.487691,0.085449,-0.536352,...,-0.312863,-0.687874,-0.267003,-1.15848,0.27146,-0.155397,0.114328,0.101526,1.0,1.0
125342,77627,-7.13906,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,-3.053493,...,1.30325,-0.016118,-0.87667,0.38223,-1.054624,-0.614606,-0.766848,0.409424,106.9,1.0
128479,78725,-4.312479,1.886476,-2.338634,-0.475243,-1.185444,-2.112079,-2.122793,0.272565,0.290273,...,0.550541,-0.06787,-1.114692,0.269069,-0.020572,-0.963489,-0.918888,0.001454,60.0,1.0


In [17]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,261


In [18]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,47965.873984,-0.307144,-0.107255,0.687206,0.238115,-0.384199,0.076577,-0.12347,0.125053,-0.048635,...,-0.001706,-0.01261,-0.082943,-0.0522,0.010893,0.133097,0.030216,0.029948,-0.052936,103.100102
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


Splitting the data into Features & Targets

In [19]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [20]:
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
124752  77412 -1.039598 -0.115511  1.856568  0.825019 -2.219317  1.082903   
80663   58591  1.085794 -0.350393  1.474352  1.684647 -1.086247  0.695354   
4169     3749  1.414568 -0.661995 -0.907428 -1.621892  1.465019  3.272285   
61600   49907  1.151897  0.363039  1.043242  2.522762 -0.587741 -0.321808   
98805   66826 -1.947718  2.205942  1.024027 -0.307804 -0.300403  0.392007   
...       ...       ...       ...       ...       ...       ...       ...   
124087  77171  1.118560  1.291858 -1.298805  2.135772  0.772204 -1.147291   
124115  77182 -1.410852  2.268271 -2.297554  1.871331  0.248957 -1.208799   
124176  77202 -0.356326  1.435305 -0.813564  1.993117  2.055878 -0.543579   
125342  77627 -7.139060  2.773082 -6.757845  4.446456 -5.464428 -1.713401   
128479  78725 -4.312479  1.886476 -2.338634 -0.475243 -1.185444 -2.112079   

              V7        V8        V9  ...       V20       V21       V22  \


In [21]:
print(Y)

124752    0.0
80663     0.0
4169      0.0
61600     0.0
98805     0.0
         ... 
124087    1.0
124115    1.0
124176    1.0
125342    1.0
128479    1.0
Name: Class, Length: 753, dtype: float64


Split the data into Training data & Testing Data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(753, 30) (602, 30) (151, 30)


Model Training

Logistic Regression

In [24]:
model = LogisticRegression()

In [25]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [26]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [27]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9634551495016611


In [28]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [29]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9205298013245033
