Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')  # Correct the file path if needed

In [None]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
credit_card_data.tail()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
15931,27369,-1.160116,-0.244177,0.74425,-0.19235,1.156356,-1.931383,0.40967,-0.364716,-0.516156,...,-0.021714,0.012447,-0.360625,0.514926,0.064194,1.026317,-0.030845,-0.083609,52.9,0.0
15932,27369,-3.058318,3.099206,-4.932555,1.924138,-1.576032,-2.135383,-0.830098,2.228617,-0.312343,...,-0.111526,-0.485223,-0.003945,0.049422,-0.173962,-0.37926,0.171872,-0.236166,99.99,0.0
15933,27369,-0.661806,0.315385,2.011194,-0.438757,-0.55499,-0.668072,0.424651,0.079141,0.126057,...,0.148063,0.177511,0.134794,0.359931,-0.544428,0.181545,0.068546,0.153438,74.58,0.0
15934,27370,1.525348,-1.231442,0.420095,-1.551218,-1.376006,0.100758,-1.455755,0.134876,-1.319056,...,-0.108619,0.084883,-0.051758,-0.815038,0.331989,-0.015837,0.058942,0.011087,6.0,0.0
15935,27371,1.38568,-0.590076,-0.569197,-0.939441,-0.196015,-0.486685,-0.102496,-0.23793,-0.928028,...,-0.255944,-1.055682,,,,,,,,


In [None]:
# dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15936 entries, 0 to 15935
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    15936 non-null  int64  
 1   V1      15936 non-null  float64
 2   V2      15936 non-null  float64
 3   V3      15936 non-null  float64
 4   V4      15936 non-null  float64
 5   V5      15936 non-null  float64
 6   V6      15936 non-null  float64
 7   V7      15936 non-null  float64
 8   V8      15936 non-null  float64
 9   V9      15936 non-null  float64
 10  V10     15936 non-null  float64
 11  V11     15936 non-null  float64
 12  V12     15936 non-null  float64
 13  V13     15936 non-null  float64
 14  V14     15936 non-null  float64
 15  V15     15936 non-null  float64
 16  V16     15936 non-null  float64
 17  V17     15936 non-null  float64
 18  V18     15936 non-null  float64
 19  V19     15936 non-null  float64
 20  V20     15936 non-null  float64
 21  V21     15936 non-null  float64
 22

In [None]:
# checking the number of missing values in each coloumn
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [None]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Class
0.0    15862
1.0       73
Name: count, dtype: int64

This Dataset is highly unbalanced


0 --> Normal Transaction

1 --> Fraudulent Transaction

In [None]:
# seperating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [None]:
print(legit.shape)
print(fraud.shape)

(15862, 31)
(73, 31)


In [None]:
# statistical measures of the data
legit.Amount.describe()

count    15862.000000
mean        66.280151
std        188.898885
min          0.000000
25%          5.522500
50%         15.950000
75%         53.890000
max       7712.430000
Name: Amount, dtype: float64

In [None]:
fraud.Amount.describe()

count      73.000000
mean       90.307123
std       271.634360
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [None]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,12104.432165,-0.219072,0.25,0.862854,0.272641,-0.105868,0.124522,-0.112681,-0.016178,0.87912,...,0.028173,-0.064056,-0.161585,-0.03504,0.013076,0.117023,0.037111,0.011206,0.005907,66.280151
1.0,15559.643836,-7.929807,6.19312,-11.997831,6.55505,-5.474984,-2.480356,-8.354317,3.668478,-3.086988,...,0.671905,0.4411,-0.295856,-0.358994,-0.328925,0.216076,0.209323,0.927187,0.06138,90.307123


Under-Sampling


Build a sample dataset contatining similar distribution of normal transactions and Fraudulent Transactions


Number of Fraudulent Transactions --> 492


In [None]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [None]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [None]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
12061,20912,-0.461802,1.005206,2.814234,2.193736,0.047832,1.101699,-0.422002,0.374782,0.238277,...,0.123298,0.545921,-0.375994,-0.546862,0.218166,0.242603,0.006806,-0.015664,0.01,0.0
1380,1073,-1.133952,-0.298663,1.174211,-2.590226,-0.075722,0.11319,1.253951,-2.4e-05,0.995004,...,0.064651,0.097752,-0.03667,-0.71102,0.88235,-0.00263,-0.028453,0.055172,202.37,0.0
1857,1436,-0.950548,0.672939,1.487102,-0.154086,0.142597,-0.303364,0.615363,-0.064484,-0.007532,...,-0.149954,-0.266863,0.234936,0.161517,-0.833377,-0.052547,-0.161532,-0.022046,15.98,0.0
9203,13194,0.888869,-0.399066,0.996928,1.616921,-0.797793,0.122022,-0.448099,-0.021827,2.02489,...,0.038834,0.164248,-0.281844,-0.152696,0.487268,-0.267095,0.009268,0.052751,169.0,0.0
975,738,0.82841,-1.015539,0.494973,0.132272,-0.637768,0.744833,-0.386336,0.130129,1.05698,...,-0.127586,-0.487869,-0.21128,-0.766016,0.156877,1.005284,-0.050482,0.040687,220.28,0.0


In [None]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
15566,26961,-23.23792,13.487386,-25.188773,6.261733,-17.345188,-4.534989,-17.100492,15.37463,-3.845567,...,1.769708,-1.691973,-1.045673,0.143386,1.611577,-0.221576,1.481233,0.438125,99.99,1.0
15736,27163,-23.914101,13.765942,-25.733734,6.290918,-17.784824,-4.572498,-17.390764,15.794136,-3.819832,...,1.773399,-1.748426,-1.093104,0.147807,1.663792,-0.221024,1.458076,0.430315,99.99,1.0
15751,27187,-24.590245,14.044567,-26.278701,6.320089,-18.224513,-4.609968,-17.681003,16.213627,-3.794093,...,1.777063,-1.804874,-1.140605,0.152234,1.715997,-0.220471,1.434951,0.422492,99.99,1.0
15781,27219,-25.266355,14.323254,-26.823673,6.349248,-18.664251,-4.647403,-17.971212,16.633103,-3.768351,...,1.780701,-1.861318,-1.188167,0.156667,1.768192,-0.219916,1.411855,0.414656,99.99,1.0
15810,27252,-25.942434,14.601998,-27.36865,6.378395,-19.104033,-4.684806,-18.261393,17.052566,-3.742605,...,1.784316,-1.917759,-1.235787,0.161105,1.820378,-0.219359,1.388786,0.40681,99.99,1.0


In [None]:
new_dataset['Class'].value_counts()

Class
0.0    492
1.0     73
Name: count, dtype: int64

In [None]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,12267.026423,-0.211173,0.225023,0.805806,0.194165,-0.176538,0.064753,-0.191409,-0.005648,0.902081,...,-0.018151,-0.057612,-0.148116,-0.04969,0.010589,0.144775,0.043202,0.045742,0.019179,63.341098
1.0,15559.643836,-7.929807,6.19312,-11.997831,6.55505,-5.474984,-2.480356,-8.354317,3.668478,-3.086988,...,0.671905,0.4411,-0.295856,-0.358994,-0.328925,0.216076,0.209323,0.927187,0.06138,90.307123


Splitting the data into Features & Targets

In [None]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

        Time         V1         V2         V3        V4         V5        V6  \
12061  20912  -0.461802   1.005206   2.814234  2.193736   0.047832  1.101699   
1380    1073  -1.133952  -0.298663   1.174211 -2.590226  -0.075722  0.113190   
1857    1436  -0.950548   0.672939   1.487102 -0.154086   0.142597 -0.303364   
9203   13194   0.888869  -0.399066   0.996928  1.616921  -0.797793  0.122022   
975      738   0.828410  -1.015539   0.494973  0.132272  -0.637768  0.744833   
...      ...        ...        ...        ...       ...        ...       ...   
15566  26961 -23.237920  13.487386 -25.188773  6.261733 -17.345188 -4.534989   
15736  27163 -23.914101  13.765942 -25.733734  6.290918 -17.784824 -4.572498   
15751  27187 -24.590245  14.044567 -26.278701  6.320089 -18.224513 -4.609968   
15781  27219 -25.266355  14.323254 -26.823673  6.349248 -18.664251 -4.647403   
15810  27252 -25.942434  14.601998 -27.368650  6.378395 -19.104033 -4.684806   

              V7         V8        V9  

In [None]:
print(Y)

12061    0.0
1380     0.0
1857     0.0
9203     0.0
975      0.0
        ... 
15566    1.0
15736    1.0
15751    1.0
15781    1.0
15810    1.0
Name: Class, Length: 565, dtype: float64


Split the data into Training data & Testing Data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)


(565, 30) (452, 30) (113, 30)


Model Training

Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy on Training data : ', training_data_accuracy)


Accuracy on Training data :  0.9734513274336283


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy score on Test Data : ', test_data_accuracy)


Accuracy score on Test Data :  0.9292035398230089
