# Imports & Data Preprocessing

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [34]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('C:\\Users\\gedda\\OneDrive\\Desktop\\ml_project\\dataset\\creditcard.csv')
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.36,-0.07,2.54,1.38,-0.34,0.46,0.24,0.1,0.36,...,-0.02,0.28,-0.11,0.07,0.13,-0.19,0.13,-0.02,149.62,0
1,0.0,1.19,0.27,0.17,0.45,0.06,-0.08,-0.08,0.09,-0.26,...,-0.23,-0.64,0.1,-0.34,0.17,0.13,-0.01,0.01,2.69,0
2,1.0,-1.36,-1.34,1.77,0.38,-0.5,1.8,0.79,0.25,-1.51,...,0.25,0.77,0.91,-0.69,-0.33,-0.14,-0.06,-0.06,378.66,0
3,1.0,-0.97,-0.19,1.79,-0.86,-0.01,1.25,0.24,0.38,-1.39,...,-0.11,0.01,-0.19,-1.18,0.65,-0.22,0.06,0.06,123.5,0
4,2.0,-1.16,0.88,1.55,0.4,-0.41,0.1,0.59,-0.27,0.82,...,-0.01,0.8,-0.14,0.14,-0.21,0.5,0.22,0.22,69.99,0


In [35]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.88,10.07,-9.83,-2.07,-5.36,-2.61,-4.92,7.31,1.91,...,0.21,0.11,1.01,-0.51,1.44,0.25,0.94,0.82,0.77,0
284803,172787.0,-0.73,-0.06,2.04,-0.74,0.87,1.06,0.02,0.29,0.58,...,0.21,0.92,0.01,-1.02,-0.61,-0.4,0.07,-0.05,24.79,0
284804,172788.0,1.92,-0.3,-3.25,-0.56,2.63,3.03,-0.3,0.71,0.43,...,0.23,0.58,-0.04,0.64,0.27,-0.09,0.0,-0.03,67.88,0
284805,172788.0,-0.24,0.53,0.7,0.69,-0.38,0.62,-0.69,0.68,0.39,...,0.27,0.8,-0.16,0.12,-0.57,0.55,0.11,0.1,10.0,0
284806,172792.0,-0.53,-0.19,0.7,-0.51,-0.01,-0.65,1.58,-0.41,0.49,...,0.26,0.64,0.38,0.01,-0.47,-0.82,-0.0,0.01,217.0,0


In [36]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [37]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [38]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [39]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [40]:
fraud.Amount.describe()

count    492.00
mean     122.21
std      256.68
min        0.00
25%        1.00
50%        9.25
75%      105.89
max     2125.87
Name: Amount, dtype: float64

In [41]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.2,0.01,-0.01,0.01,-0.01,0.01,0.0,0.01,-0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,88.29
1,80746.81,-4.77,3.62,-7.03,4.54,-3.15,-1.4,-5.57,0.57,-2.58,...,0.37,0.71,0.01,-0.04,-0.11,0.04,0.05,0.17,0.08,122.21


In [42]:
legit_sample = legit.sample(n=492)

In [43]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
74964,55830.0,1.08,-0.19,1.36,1.39,-1.1,0.05,-0.74,0.3,1.11,...,-0.03,0.07,0.08,0.37,0.24,-0.39,0.08,0.03,9.99,0
91893,63672.0,-1.69,0.22,2.12,-0.66,-1.09,-0.82,-0.16,0.44,0.64,...,-0.11,-0.21,0.2,0.99,-0.18,0.74,-0.28,-0.02,39.77,0
68050,52808.0,-0.65,1.21,1.29,0.07,-0.08,-0.58,0.27,-0.68,-0.19,...,0.54,-0.87,0.14,0.31,-0.3,0.09,0.29,0.1,2.69,0
47562,43258.0,-0.99,0.78,0.43,-0.37,-0.11,-0.47,1.12,0.11,-0.0,...,0.04,0.43,0.18,0.11,-0.42,0.29,0.01,-0.11,106.91,0
203256,134737.0,1.99,-0.4,-0.43,0.39,-0.48,-0.13,-0.64,0.02,1.42,...,0.18,0.76,0.14,0.69,-0.05,-0.24,0.04,-0.03,9.99,0


In [44]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.93,1.13,-4.52,1.75,-1.57,-2.01,-0.88,0.7,-2.06,...,0.78,-0.32,0.64,-0.29,0.54,0.79,0.29,0.15,390.0,1
280143,169347.0,1.38,1.29,-5.0,1.41,0.44,-1.33,-1.41,0.25,-1.13,...,0.37,0.03,-0.15,-0.08,0.52,0.74,0.39,0.19,0.76,1
280149,169351.0,-0.68,1.13,-2.21,0.47,-1.12,-0.0,-2.23,1.21,-0.65,...,0.75,0.83,0.19,0.03,-0.74,0.47,0.39,0.19,77.89,1
281144,169966.0,-3.11,0.59,-5.4,1.82,-0.84,-2.94,-2.21,1.06,-1.63,...,0.58,-0.27,-0.46,-0.18,-0.33,0.61,0.88,-0.25,245.0,1
281674,170348.0,1.99,0.16,-2.58,0.41,1.15,-0.1,0.22,-0.07,0.58,...,-0.16,-0.3,-0.07,-0.45,0.31,-0.29,0.0,-0.02,42.53,1


In [45]:
new_dataset['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [46]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94752.08,-0.26,-0.06,-0.06,0.17,-0.01,-0.02,-0.03,0.0,-0.01,...,-0.05,-0.0,-0.02,-0.05,0.04,0.01,-0.01,-0.01,-0.01,89.13
1,80746.81,-4.77,3.62,-7.03,4.54,-3.15,-1.4,-5.57,0.57,-2.58,...,0.37,0.71,0.01,-0.04,-0.11,0.04,0.05,0.17,0.08,122.21


In [47]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


# Logistic Regression

In [48]:
model = LogisticRegression()
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

In [49]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9390088945362135


In [50]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9086294416243654
