In [2]:
# Packages related to general operating system & warnings
import os
import warnings
warnings.filterwarnings('ignore')

# Packages related to data importing, manipulation, exploratory data analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from termcolor import colored as cl  # Text customization

# Packages related to data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Setting plot sizes and type of plot
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
plt.figure(figsize=(6, 3))
plt.gray()

# Packages related to model selection, evaluation, and preprocessing
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import (
    PolynomialFeatures, KBinsDiscretizer, FunctionTransformer,
    StandardScaler, MinMaxScaler, MaxAbsScaler,
    LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
)
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa

# Packages related to machine learning algorithms
from sklearn.linear_model import (
    LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
)


from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import (
    BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor
)



from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<Figure size 600x300 with 0 Axes>

In [4]:
csv_path = '/kaggle/input/creditcardfraud/creditcard.csv'
data=pd.read_csv(csv_path)

include the path 

In [9]:
print(data.head())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [14]:
# Calculate the total number of transactions
Total_transactions = len(data)

# Calculate the number of normal transactions
normal = len(data[data.Class == 0])

# Calculate the number of fraudulent transactions
fraudulent = len(data[data.Class == 1])

# Calculate the percentage of fraudulent transactions
fraud_percentage = round(fraudulent / normal * 100, 2)

print(cl('Total number of Transactions: {}'))

print(cl('Number of Normal Transactions: {}'))

print(cl('Number of Fraudulent Transactions: {}'))

print(cl('Percentage of Fraudulent Transactions: {}%'.format(fraud_percentage), attrs=['bold']))


Total number of Transactions: {}[0m
Number of Normal Transactions: {}[0m
Number of Fraudulent Transactions: {}[0m
[1mPercentage of Fraudulent Transactions: 0.17%[0m


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [16]:
max(data.Amount),min(data.Amount)

(25691.16, 0.0)

In [17]:
# Create an instance of StandardScaler
sc = StandardScaler()

# Get the values of the 'Amount' column
amount = data['Amount'].values

# Reshape the 'amount' array to have a single column (-1 indicates the number of rows is inferred)
amount_reshaped = amount.reshape(-1, 1)

# Apply standardization to the 'Amount' column and replace the original values
data['Amount'] = sc.fit_transform(amount_reshaped)


In [18]:
# Drop the 'Time' column from the DataFrame 'data'
data.drop(['Time'], axis=1, inplace=True)


In [21]:
data.shape

(275663, 30)

this was used multiple times

In [20]:
# Remove duplicate rows from the DataFrame 'data'
data.drop_duplicates(inplace=True)

In [22]:
# Define the independent variable (X) and dependent variable (y)
X = data.drop('Class', axis=1).values
y = data['Class'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the training and testing sets
print(cl('Training set shape:', attrs=['bold']), X_train.shape, y_train.shape)
print(cl('Testing set shape:', attrs=['bold']), X_test.shape, y_test.shape)


[1mTraining set shape:[0m (220530, 29) (220530,)
[1mTesting set shape:[0m (55133, 29) (55133,)


In [None]:
KNN MODEL
In this code, we define the value of n as 7, indicating the number of neighbors to consider for classification in the KNN model. We then create an instance of the KNeighborsClassifier class with the specified number of neighbors. The model is trained using the fit() function with the training data (X_train and y_train).

Next, we use the trained model to predict the labels for the test data (X_test) using the predict() function, and store the predicted labels in knn_yhat.

Finally, we calculate the accuracy score of the KNN model by comparing the predicted labels (knn_yhat) with the true labels from the test data (y_test) using the accuracy_score() function. The accuracy score is then printed.

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
n = 7
KNN = KNeighborsClassifier(n_neighbors=n)
KNN.fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

accuracy = accuracy_score(y_test, knn_yhat)
print('Accuracy score of the K-Nearest Neighbors model is {}'.format(accuracy))


Accuracy score of the K-Nearest Neighbors model is 0.9994014474089928


In [27]:
f1 = f1_score(y_test, knn_yhat)
print('F1 score of the K-Nearest Neighbors model is {}'.format(f1))

F1 score of the K-Nearest Neighbors model is 0.804733727810651


XG boost

In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize the XGBoost classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
xgb_yhat = xgb_model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, xgb_yhat)
print('Accuracy score of the XGBoost model is {}'.format(accuracy))


Accuracy score of the XGBoost model is 0.9995465510674187


In [29]:
f1 = f1_score(y_test, xgb_yhat)
print('F1 score of the XGBoost model is {}'.format(f1))

F1 score of the XGBoost model is 0.8484848484848485


ANN lets go

In [30]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score

# Initialize the ANN model
ann_model = Sequential()

# Add input layer and first hidden layer
ann_model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

# Add additional hidden layers
ann_model.add(Dense(units=32, activation='relu'))
ann_model.add(Dense(units=16, activation='relu'))

# Add output layer
ann_model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
ann_model.fit(X_train, y_train, epochs=10, batch_size=32)

# Make predictions on the test set
ann_yhat = ann_model.predict_classes(X_test)

# Calculate the F1 score
f1 = f1_score(y_test, ann_yhat)
print('F1 score of the ANN model is {}'.format(f1))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [34]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score

# Initialize the ANN model
ann_model = Sequential()

# Add input layer and first hidden layer
ann_model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

# Add additional hidden layers
ann_model.add(Dense(units=32, activation='relu'))
ann_model.add(Dense(units=16, activation='relu'))

# Add output layer
ann_model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
ann_model.fit(X_train, y_train, epochs=12, batch_size=64)

# Make predictions on the test set
ann_yhat = ann_model.predict(X_test)
ann_yhat = (ann_yhat > 0.5).astype(int)

# Calculate the F1 score
f1 = f1_score(y_test, ann_yhat)
print('F1 score of the ANN model is {}'.format(f1))


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
F1 score of the ANN model is 0.7466666666666667


epochs = 10 , batch size =32, F1 score = 0.74