<H1><center>Credit Card Fraud Detection</center></H1>
<H3><center>Project By: Fares Makki | Mohamed Taha Sta | Jesser Hamdi</center></H3>
<H6><center><i>CI-1 2023/2024</i></center></H6>

In [None]:
# Import the necessary libraries
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
from scipy.stats import norm
import matplotlib

warnings.filterwarnings("ignore")

##### A function to plot the confusion matrix.

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=None):
    """Draw a confusion matrix as a heat map annotated with row-wise percentages.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix of raw counts. Rows are shown on the 'True label'
        axis and columns on the 'Predicted label' axis.
    classes : sequence of str
        Tick labels, one per class, used on both axes.
    title : str
        Title placed above the plot.
    cmap : colormap or None
        Colormap passed to ``plt.imshow``. When ``None`` the notebook's
        default two-colour purple gradient is used.

    Notes
    -----
    Draws on the current pyplot figure and sets the global ``font.size``
    rcParam to 19. The caller is expected to call ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Only fall back to the project gradient when the caller did not choose a colormap.
    if cmap is None:
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#928AEE", "#0F0B38"])

    plt.rcParams.update({'font.size': 19})
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontdict={'size': '16'})
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=12, color="#0F0B38")
    plt.yticks(tick_marks, classes, fontsize=12, color="#0F0B38")

    # Express every cell as a percentage of its row total. A row whose total is
    # zero (a class absent from the true labels) is reported as 0% rather than
    # dividing by zero and printing "nan%".
    row_totals = cm.sum(axis=1, keepdims=True).astype(float)
    cm_perc = np.divide(cm, row_totals,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_totals != 0) * 100

    # Write the percentage in the centre of each cell.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "\n" + format(cm_perc[i, j], '.2f') + "%",
                 horizontalalignment="center",
                 color="white")
    plt.grid(False)
    plt.ylabel('True label', fontdict={'size': '16'})
    plt.xlabel('Predicted label', fontdict={'size': '16'})
    plt.tight_layout()

# Understanding the Data

In [None]:
# Read the transactions CSV into the working DataFrame
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Derive the whole-hour index of every transaction: floor('Time' / 3600).
# (Presumably 'Time' is expressed in seconds - confirm against the dataset description.)
data['Hour'] = np.floor(data['Time'] / 3600).astype('int64')

# Two-colour project palette, reused by the later plots
palette = sns.color_palette(["#0F0B38", "#928AEE"])
sns.set()
sns.set_palette(palette)

# Count of transactions in each hour, split by class
sns.catplot(data=data, x='Hour', hue='Class', kind='count', height=5, aspect=3)
plt.title('Transactions Per Hour')
plt.show()

In [None]:
# Use the white-grid seaborn style for the following figures
sns.set_style("whitegrid")

# Two stacked histograms of the 'Hour' feature sharing one x axis:
# fraud transactions on top, normal transactions below.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 6))
bins = 100

panels = (
    (ax1, 1, 'Fraud', palette[0]),
    (ax2, 0, 'Normal', palette[1]),
)
for ax, class_value, panel_title, panel_colour in panels:
    ax.hist(data.Hour[data.Class == class_value], bins=bins, color=panel_colour, alpha=1)
    ax.set_title(panel_title, fontsize=14)
    ax.set_ylabel('Number of Transactions', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.5)
    # Hide the top and right borders of the panel
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Only the bottom panel carries the shared x-axis label
ax2.set_xlabel('Time (in Hours)', fontsize=12)

plt.tight_layout()
plt.show()

### Understanding which normalisation method was used on the original dataset.

In [None]:
# Pair plot of three features on a random subset of 1000 rows, coloured by class.
# The sample is seeded so the figure is identical on every Restart & Run All.
subset_of_features = ['V14', 'V12', 'V10', 'Class']
sns.pairplot(data.sample(1000, random_state=42)[subset_of_features], hue='Class')
plt.show()

### Having examined the different features in the dataset we came to the conclusion that we were fortunate enough not to have extreme cases of outliers:

In [None]:
# One panel per feature: distribution of the feature over the fraud rows,
# with a fitted normal curve drawn on top.
f, axes = plt.subplots(1, 3, figsize=(20, 6))

is_fraud = data['Class'] == 1
v14_fraud_dist = data.loc[is_fraud, 'V14'].values
v12_fraud_dist = data.loc[is_fraud, 'V12'].values
v10_fraud_dist = data.loc[is_fraud, 'V10'].values

# (feature name, values, colour) for each panel, left to right
panels = [
    ('V14', v14_fraud_dist, palette[1]),
    ('V12', v12_fraud_dist, palette[0]),
    ('V10', v10_fraud_dist, '#232FF4'),
]
for axis, (feature, values, colour) in zip(axes, panels):
    sns.distplot(values, ax=axis, fit=norm, color=colour)
    axis.set_title(feature + ' Distribution \n (Fraud Transactions)', fontsize=14)

plt.show()

In [None]:
# Class balance shown twice: as a pie chart (left) and as a count plot (right)
class_colours = palette[::-1]
f, ax = plt.subplots(1, 2, figsize=(18, 8))
pie_ax, bar_ax = ax

class_counts = data['Class'].value_counts()
class_counts.plot.pie(ax=pie_ax, explode=[0, 0.1], autopct='%1.1f%%', shadow=True, colors=class_colours)
pie_ax.set_title('Class Distribution')
pie_ax.set_ylabel('')

sns.countplot(x='Class', data=data, ax=bar_ax, palette=class_colours)
bar_ax.set_title('Distribution')
plt.show()

# Pre-Processing

In [None]:
# Remove fully duplicated rows.
# (The former bare `data.duplicated()` call was dropped: its result was discarded,
# so it only cost an extra pass over the frame.)
data = data.drop_duplicates()

In [None]:
# Standardize 'Amount' and then 'Time', each on its own.
# The same scaler object is refitted for every column, so after this cell it
# holds the statistics of the last column processed ('Time').
scaler = StandardScaler()
for column in ('Amount', 'Time'):
    data[column] = scaler.fit_transform(data[[column]])

In [None]:
# Separate the data into normal and fraud
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

# Undersample the normal transactions to roughly balance the classes.
# The draw is seeded: every model below is trained on this sample, so without a
# fixed random_state all reported metrics would change from one run to the next.
legit_sample = legit.sample(n=500, random_state=42)
new_data = pd.concat([legit_sample, fraud], axis=0)

# Plot the semi equally distributed classes
sns.countplot(x='Class', data=new_data, palette=palette)
plt.title('Semi Equally Distributed Classes', fontsize=14)
plt.show()

In [None]:
# Heat map of the pairwise correlations in the undersampled data
correlations = new_data.corr()
plt.figure(figsize=(24, 20))
sns.heatmap(correlations, cmap='coolwarm_r', annot_kws={'size': 20})
plt.title('Sample Correlation Matrix', fontsize=14)
plt.show()

From the correlation matrix we can see that the features most strongly correlated with the class are ['V17', 'V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V11', 'V4', 'V2']; we will refer to them as `high_corr_features`.
We will use them later for a second batch of models, which we compare against the first batch trained on all 30 variables.

In [None]:
# Split the data into features and labels.
# 'Hour' was derived from the raw 'Time' column for the exploratory plots only.
# It is excluded here so the models are trained on the original variables, as the
# notebook text states ("all 30 variables"); left in, it would add an unscaled,
# redundant 31st feature.
x = new_data.drop(columns=['Class', 'Hour'])  # features
y = new_data['Class']  # label

# Stratified 80/20 split into training and testing data (seeded for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

# Models


### Take 1

In [None]:
# Define the Logistic Regression model.
# max_iter is raised well above scikit-learn's default of 100: warnings are silenced
# globally in this notebook, so a solver that stopped before converging would
# otherwise go unnoticed and silently degrade the reported scores.
model1 = LogisticRegression(max_iter=1000)
# Fit the model on the training data
model1.fit(x_train, y_train)
# Predict on the testing data
y_pred1 = model1.predict(x_test)
# Predict on the training data
y_pred1_train = model1.predict(x_train)

# Metrics on the training split
accuracy1_train = accuracy_score(y_train, y_pred1_train)
precision1_train = precision_score(y_train, y_pred1_train)
recall1_train = recall_score(y_train, y_pred1_train)

# Metrics on the held-out testing split
accuracy1 = accuracy_score(y_test, y_pred1)
precision1 = precision_score(y_test, y_pred1)
recall1 = recall_score(y_test, y_pred1)

# Plot the confusion matrix of the test predictions
plot_confusion_matrix(confusion_matrix(y_test, y_pred1), classes=['Non Fraud', 'Fraud'],
                      title='Logistic Regression Confusion matrix')
plt.show()

In [None]:

# Linear-kernel support vector classifier, trained on the full feature set
model2 = SVC(kernel='linear', random_state=42).fit(x_train, y_train)

# Predictions for the training split and for the held-out testing split
y_pred2_train, y_pred2 = (model2.predict(split) for split in (x_train, x_test))

# Accuracy / precision / recall on the training split
accuracy2_train, precision2_train, recall2_train = (
    score(y_train, y_pred2_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy2, precision2, recall2 = (
    score(y_test, y_pred2) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_svc = confusion_matrix(y_test, y_pred2)
plot_confusion_matrix(cm_svc, classes=['Non Fraud', 'Fraud'], title='SVC Confusion matrix')
plt.show()

In [None]:

# Three-tree random forest, trained on the full feature set
model3 = RandomForestClassifier(n_estimators=3, random_state=0).fit(x_train, y_train)

# Predictions for the training split and for the held-out testing split
y_pred3_train, y_pred3 = (model3.predict(split) for split in (x_train, x_test))

# Accuracy / precision / recall on the training split
accuracy3_train, precision3_train, recall3_train = (
    score(y_train, y_pred3_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy3, precision3, recall3 = (
    score(y_test, y_pred3) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_forest = confusion_matrix(y_test, y_pred3)
plot_confusion_matrix(cm_forest, classes=['Non Fraud', 'Fraud'], title='Random Forest Confusion matrix')
plt.show()

In [None]:

# 3-nearest-neighbours classifier, trained on the full feature set
model4 = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)

# Predictions for the training split and for the held-out testing split
y_pred4_train, y_pred4 = (model4.predict(split) for split in (x_train, x_test))

# Accuracy / precision / recall on the training split
accuracy4_train, precision4_train, recall4_train = (
    score(y_train, y_pred4_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy4, precision4, recall4 = (
    score(y_test, y_pred4) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_knn = confusion_matrix(y_test, y_pred4)
plot_confusion_matrix(cm_knn, classes=['Non Fraud', 'Fraud'], title='KNN Confusion matrix')
plt.show()

In [None]:

# Gaussian naive Bayes classifier, trained on the full feature set
model5 = GaussianNB().fit(x_train, y_train)

# Predictions for the training split and for the held-out testing split
y_pred5_train, y_pred5 = (model5.predict(split) for split in (x_train, x_test))

# Accuracy / precision / recall on the training split
accuracy5_train, precision5_train, recall5_train = (
    score(y_train, y_pred5_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy5, precision5, recall5 = (
    score(y_test, y_pred5) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_nb = confusion_matrix(y_test, y_pred5)
plot_confusion_matrix(cm_nb, classes=['Non Fraud', 'Fraud'], title='GaussianNB Confusion matrix')
plt.show()

In [None]:
# Test-set scores, one [accuracy, precision, recall] row per model, rounded to 3 decimals.
# Row order: Logistic Regression, SVC, Random Forest, KNN, GaussianNB.
results = [
    [round(value, 3) for value in scores]
    for scores in (
        (accuracy1, precision1, recall1),
        (accuracy2, precision2, recall2),
        (accuracy3, precision3, recall3),
        (accuracy4, precision4, recall4),
        (accuracy5, precision5, recall5),
    )
]

# Training-set scores in the same layout and row order
results_training = [
    [round(value, 3) for value in scores]
    for scores in (
        (accuracy1_train, precision1_train, recall1_train),
        (accuracy2_train, precision2_train, recall2_train),
        (accuracy3_train, precision3_train, recall3_train),
        (accuracy4_train, precision4_train, recall4_train),
        (accuracy5_train, precision5_train, recall5_train),
    )
]

In [None]:
# Column headers: each metric appears as a (Training, Testing) pair
metrics = ["Accuracy (Training)", "Accuracy (Testing)", "Precision (Training)", "Precision (Testing)",
           "Recall (Training)", "Recall (Testing)"]

# Row labels, in the same order as the rows of `results` / `results_training`
model_names = ["Logistic Regression", "SVC", "Random Forest", "KNN", "GaussianNB"]

# Interleave the training and testing scores so the columns read
# [acc train, acc test, prec train, prec test, rec train, rec test], matching `metrics`.
# Placing the two blocks side by side instead (all training columns, then all testing
# columns) would put four of the six columns under the wrong header.
results_combined = np.stack((results_training, results), axis=2).reshape(len(model_names), -1)

# Plot the table of results on an otherwise empty axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')

# Each cell is coloured by its score on the red-yellow-green scale; the table fills the axes
table = ax.table(cellText=results_combined, rowLabels=model_names, colLabels=metrics, cellLoc='center', loc='center',
                 cellColours=plt.cm.RdYlGn(results_combined), colColours=["lightgray"] * len(metrics),
                 bbox=[0, 0, 1, 1])
table.set_fontsize(12)
table.scale(1, 2)
# Set the title before tight_layout so the layout pass can make room for it
plt.title("Model Comparison")
plt.tight_layout()
plt.show()


### Take 2

In [None]:
# Features singled out as highly correlated with the class
high_corr_features = ['V17', 'V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V11', 'V4', 'V2']

# Restrict the undersampled data to those features; the label is unchanged
x_high_corr = new_data[high_corr_features]
y_high_corr = new_data['Class']

# Stratified 80/20 split with the same seed as the full-feature split
(x_train_high_corr, x_test_high_corr,
 y_train_high_corr, y_test_high_corr) = train_test_split(
    x_high_corr, y_high_corr, test_size=0.2, stratify=y_high_corr, random_state=2)

In [None]:
# 3-nearest-neighbours classifier, trained on the high-correlation features only
model6 = KNeighborsClassifier(n_neighbors=3).fit(x_train_high_corr, y_train_high_corr)

# Predictions for the training split and for the held-out testing split
y_pred6_train, y_pred6 = (model6.predict(split) for split in (x_train_high_corr, x_test_high_corr))

# Accuracy / precision / recall on the training split
accuracy6_train, precision6_train, recall6_train = (
    score(y_train_high_corr, y_pred6_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy6, precision6, recall6 = (
    score(y_test_high_corr, y_pred6) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_knn_high_corr = confusion_matrix(y_test_high_corr, y_pred6)
plot_confusion_matrix(cm_knn_high_corr, classes=['Non Fraud', 'Fraud'],
                      title='KNN High Correlation Confusion matrix')
plt.show()

In [None]:
# Linear-kernel support vector classifier, trained on the high-correlation features only
model7 = SVC(kernel='linear', random_state=42).fit(x_train_high_corr, y_train_high_corr)

# Predictions for the training split and for the held-out testing split
y_pred7_train, y_pred7 = (model7.predict(split) for split in (x_train_high_corr, x_test_high_corr))

# Accuracy / precision / recall on the training split
accuracy7_train, precision7_train, recall7_train = (
    score(y_train_high_corr, y_pred7_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy7, precision7, recall7 = (
    score(y_test_high_corr, y_pred7) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_svc_high_corr = confusion_matrix(y_test_high_corr, y_pred7)
plot_confusion_matrix(cm_svc_high_corr, classes=['Non Fraud', 'Fraud'],
                      title='SVC High Correlation Confusion matrix')
plt.show()

In [None]:
# Three-tree random forest, trained on the high-correlation features only
model8 = RandomForestClassifier(n_estimators=3, random_state=0).fit(x_train_high_corr, y_train_high_corr)

# Predictions for the training split and for the held-out testing split
y_pred8_train, y_pred8 = (model8.predict(split) for split in (x_train_high_corr, x_test_high_corr))

# Accuracy / precision / recall on the training split
accuracy8_train, precision8_train, recall8_train = (
    score(y_train_high_corr, y_pred8_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy8, precision8, recall8 = (
    score(y_test_high_corr, y_pred8) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_forest_high_corr = confusion_matrix(y_test_high_corr, y_pred8)
plot_confusion_matrix(cm_forest_high_corr, classes=['Non Fraud', 'Fraud'],
                      title='Random Forest High Correlation Confusion matrix')
plt.show()

In [None]:
# Define the Logistic Regression model on the high-correlation features.
# max_iter is raised well above scikit-learn's default of 100: warnings are silenced
# globally in this notebook, so a solver that stopped before converging would
# otherwise go unnoticed and silently degrade the reported scores.
model9 = LogisticRegression(max_iter=1000)
# Fit the model on the training data
model9.fit(x_train_high_corr, y_train_high_corr)
# Predict on the testing data
y_pred9 = model9.predict(x_test_high_corr)
# Predict on the training data
y_pred9_train = model9.predict(x_train_high_corr)

# Metrics on the training split
accuracy9_train = accuracy_score(y_train_high_corr, y_pred9_train)
precision9_train = precision_score(y_train_high_corr, y_pred9_train)
recall9_train = recall_score(y_train_high_corr, y_pred9_train)

# Metrics on the held-out testing split
accuracy9 = accuracy_score(y_test_high_corr, y_pred9)
precision9 = precision_score(y_test_high_corr, y_pred9)
recall9 = recall_score(y_test_high_corr, y_pred9)

# Plot the confusion matrix of the test predictions
plot_confusion_matrix(confusion_matrix(y_test_high_corr, y_pred9), classes=['Non Fraud', 'Fraud'],
                      title='Logistic Regression High Correlation Confusion matrix')
plt.show()

In [None]:
# Gaussian naive Bayes classifier, trained on the high-correlation features only
model10 = GaussianNB().fit(x_train_high_corr, y_train_high_corr)

# Predictions for the training split and for the held-out testing split
y_pred10_train, y_pred10 = (model10.predict(split) for split in (x_train_high_corr, x_test_high_corr))

# Accuracy / precision / recall on the training split
accuracy10_train, precision10_train, recall10_train = (
    score(y_train_high_corr, y_pred10_train) for score in (accuracy_score, precision_score, recall_score)
)
# Accuracy / precision / recall on the testing split
accuracy10, precision10, recall10 = (
    score(y_test_high_corr, y_pred10) for score in (accuracy_score, precision_score, recall_score)
)

# Confusion matrix of the test predictions
cm_nb_high_corr = confusion_matrix(y_test_high_corr, y_pred10)
plot_confusion_matrix(cm_nb_high_corr, classes=['Non Fraud', 'Fraud'],
                      title='GaussianNB High Correlation Confusion matrix')
plt.show()

In [None]:
# Test-set scores of the high-correlation models, one [accuracy, precision, recall]
# row per model, rounded to 3 decimals.
# Row order: Logistic Regression (9), SVC (7), Random Forest (8), KNN (6), GaussianNB (10).
results_high_corr = [
    [round(value, 3) for value in scores]
    for scores in (
        (accuracy9, precision9, recall9),
        (accuracy7, precision7, recall7),
        (accuracy8, precision8, recall8),
        (accuracy6, precision6, recall6),
        (accuracy10, precision10, recall10),
    )
]

# Training-set scores in the same layout and row order.
# NOTE: this rebinds `results_training`, replacing the full-feature training scores.
results_training = [
    [round(value, 3) for value in scores]
    for scores in (
        (accuracy9_train, precision9_train, recall9_train),
        (accuracy7_train, precision7_train, recall7_train),
        (accuracy8_train, precision8_train, recall8_train),
        (accuracy6_train, precision6_train, recall6_train),
        (accuracy10_train, precision10_train, recall10_train),
    )
]

In [None]:
# Interleave the training and testing scores so the columns read
# [acc train, acc test, prec train, prec test, rec train, rec test], which is the
# order of the `metrics` headers. Placing the two blocks side by side instead
# (all training columns, then all testing columns) would put four of the six
# columns under the wrong header.
results_combined = np.stack((results_training, results_high_corr), axis=2).reshape(len(results_high_corr), -1)

# `metrics` (column headers) and `model_names` (row labels) are reused from the
# first comparison table; the rows of `results_high_corr` follow the same model order.

# Plot the table of results on an otherwise empty axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')  # Hide the axes

# Each cell is coloured by its score on the red-yellow-green scale; the table fills the axes
table = ax.table(cellText=results_combined, rowLabels=model_names, colLabels=metrics, cellLoc='center', loc='center',
                 cellColours=plt.cm.RdYlGn(results_combined), colColours=["lightgray"] * len(metrics),
                 bbox=[0, 0, 1, 1])
table.set_fontsize(12)
table.scale(1, 2)
# Set the title before tight_layout so the layout pass can make room for it
plt.title("High Correlation Model Comparison")
plt.tight_layout()
plt.show()