In [1]:
#importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing the dataset
df = pd.read_csv("C:/Users/K L narasaiah/Downloads/creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
#checking the shape of the data
print('shape :', df.shape)
print('rows :', df.shape[0])
print('Columns :', df.shape[1])

shape : (284807, 31)
rows : 284807
Columns : 31


In [4]:
#checking for the null values
df.isnull().sum().any()

False

In [5]:
#looking for the datatypes of the each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
#checking for the duplicates
df.duplicated().sum()

1081

In [7]:
#dropping the duplicates
df.drop_duplicates(inplace = True)

In [8]:
#checking for the duplicates
df.duplicated().sum()

0

In [9]:
#normalizing the amount column
df['normAmount'] = StandardScaler().fit_transform(np.array(df['Amount']).reshape(-1, 1))

#dropping the time and amount column
df.drop(['Time', 'Amount'], inplace = True, axis = 1)

In [10]:
#checking for the proportion of class target in the data
df['Class'].value_counts(normalize = True).round(4)

Class
0    0.9983
1    0.0017
Name: proportion, dtype: float64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   V1          283726 non-null  float64
 1   V2          283726 non-null  float64
 2   V3          283726 non-null  float64
 3   V4          283726 non-null  float64
 4   V5          283726 non-null  float64
 5   V6          283726 non-null  float64
 6   V7          283726 non-null  float64
 7   V8          283726 non-null  float64
 8   V9          283726 non-null  float64
 9   V10         283726 non-null  float64
 10  V11         283726 non-null  float64
 11  V12         283726 non-null  float64
 12  V13         283726 non-null  float64
 13  V14         283726 non-null  float64
 14  V15         283726 non-null  float64
 15  V16         283726 non-null  float64
 16  V17         283726 non-null  float64
 17  V18         283726 non-null  float64
 18  V19         283726 non-null  float64
 19  V20    

In [12]:
#we have 99.83% of the data from class 0 and 0.17% of the data from class 1
#we need to use class imbalance technique to balance the class 0 and class 1

In [13]:
#let us now use SMOTE to balance the target variable

In [14]:
#importing important libraries
from imblearn.over_sampling import SMOTE

In [15]:
#separating the input features and target variable
x = df.drop('Class', axis = 1)
y = df['Class']

In [16]:
#splitting the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 5)

In [17]:
# Using SMOTE to handle imbalance in the data
print('Before SMOTE, count of label 1 :', sum(y_train == 1))
print('Before SMOTE, count of label 0 :', sum(y_train == 0))

# SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 5)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)
print('\nAfter SMOTE, count of label 1 :', sum(y_train_res == 1))
print('After SMOTE, count of label 0 :', sum(y_train_res == 0))

Before SMOTE, count of label 1 : 381
Before SMOTE, count of label 0 : 226599


  File "C:\Users\K L narasaiah\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\K L narasaiah\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\K L narasaiah\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\K L narasaiah\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



After SMOTE, count of label 1 : 226599
After SMOTE, count of label 0 : 226599


In [18]:
#creating the GBM model
gbm_clf = GradientBoostingClassifier()

#fitting the model
gbm_clf.fit(x_train_res, y_train_res)

#getting the predictions from the test model
y_pred = gbm_clf.predict(x_test)

#getting the predictions from the training data
y_train_pred = gbm_clf.predict(x_train_res)

In [19]:
# Model Evaluation 
print('Testing Accuracy :', metrics.accuracy_score(y_test, y_pred))
print('Training Accuracy :', metrics.accuracy_score(y_train_res, y_train_pred))

Testing Accuracy : 0.9863074049272196
Training Accuracy : 0.9790841971941624


In [22]:
#Reshaping for the LSTM inputs(sample, timesteps, features)
x_train_res_reshaped = x_train_res.values.reshape(x_train_res.shape[0], 1, x_train_res.shape[1])
x_test_reshaped = x_test.values.reshape(x_test.shape[0], 1, x_test.shape[1])

#importing tensorflow/keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

#define the LSTM model
model = Sequential([
    LSTM(64, return_sequences = True, input_shape = (1, x_train_res.shape[1])),
    Dropout(0.2),
    LSTM(32, return_sequences = False),
    Dropout(0.2),
    Dense(16, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

#compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

#train the model
history = model.fit(x_train_res_reshaped, y_train_res, epochs = 10, batch_size = 64, validation_data = (x_test_reshaped, y_test))

#evaluate the model
loss, accuracy = model.evaluate(x_test_reshaped, y_test)
print(f"Test Accuracy : {accuracy:.4f}")

#making predictions from the model
y_pred = (model.predict(x_test_reshaped) > 0.5).astype("int32")

#performance metrics
from sklearn.metrics import classification_report, confusion_matrix

print("Confusion matrix :\n", confusion_matrix(y_test, y_pred))
print("Classification report :\n", classification_report(y_test, y_pred))

Epoch 1/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.9726 - loss: 0.0796 - val_accuracy: 0.9959 - val_loss: 0.0137
Epoch 2/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.9970 - loss: 0.0107 - val_accuracy: 0.9980 - val_loss: 0.0094
Epoch 3/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9981 - loss: 0.0066 - val_accuracy: 0.9981 - val_loss: 0.0092
Epoch 4/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9987 - loss: 0.0050 - val_accuracy: 0.9984 - val_loss: 0.0081
Epoch 5/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9989 - loss: 0.0042 - val_accuracy: 0.9991 - val_loss: 0.0057
Epoch 6/10
[1m7082/7082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9991 - loss: 0.0034 - val_accuracy: 0.9989 - val_loss: 0.0068
Epoch 7/10