In [None]:
# Import

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
from pathlib import Path
from IPython.display import display
from sqlalchemy import create_engine, MetaData, select
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from config import database_params

In [None]:
# Load the raw dataset

# Location of the source CSV, relative to the notebook's working directory
learning_data = Path('../Data_2/creditcard_2023.csv')
learning_df = pd.read_csv(filepath_or_buffer=learning_data, encoding='utf-8')

# Preview the first five rows as a quick sanity check of the import
learning_df.head(n=5)

In [None]:
# Check data types
# Displays the dtype pandas assigned to each column of the raw frame.

learning_df.dtypes

In [None]:
# Row count of the raw frame

num_rows = learning_df.shape[0]
print("Number of rows:", num_rows)

In [None]:
# Cleaning

# Coerce every column to a numeric dtype; any value that cannot be parsed as
# a number becomes NaN. The result is bound to a new name so the raw frame
# loaded earlier is left untouched and this cell stays safe to re-run.
numeric_learning_df = learning_df.apply(pd.to_numeric, errors='coerce')

# Drop rows containing at least one NaN value
cleaned_learning_df = numeric_learning_df.dropna()

# Show the first rows of the cleaned frame (rich table rather than plain text)
display(cleaned_learning_df.head())

In [None]:
# Row count after cleaning

num_rows = cleaned_learning_df.shape[0]
print("Number of rows:", num_rows)

In [None]:
# Export cleaned data

# Write the cleaned frame as CSV with a header row and without the index column
cleaned_learning_df.to_csv(
    path_or_buf='../Data_2/cleaned_creditcard_2023.csv',
    header=True,
    index=False,
    encoding="utf-8",
)

In [None]:
# Creating SQL database

connection_params = database_params

connection = psycopg2.connect(**connection_params)

# CREATE DATABASE cannot run inside a transaction block, so statements on
# this connection are committed immediately instead of being wrapped in one.
connection.autocommit = True

try:
    # Create a cursor object
    cursor = connection.cursor()
    try:
        # Only create the database when it is missing, so re-running the
        # notebook does not fail because the database already exists.
        cursor.execute(
            "SELECT 1 FROM pg_database WHERE datname = %s",
            ("creditcardtransactions",),
        )
        if cursor.fetchone() is None:
            cursor.execute("CREATE DATABASE creditcardtransactions")
    finally:
        cursor.close()
finally:
    # Always release the connection to the default database, even on error.
    # No explicit commit is needed because autocommit is enabled.
    connection.close()

In [None]:
# Create the table

# Same connection settings as before, but pointed at the project database
connection_params = {**database_params, 'dbname': 'creditcardtransactions'}

# Drops any existing table first, so this cell can be re-run from scratch
create_credit_card_table = """
DROP TABLE IF EXISTS CreditCardTransactions;
CREATE TABLE CreditCardTransactions (
    id INT,
    V1 NUMERIC,
	V2 NUMERIC,
	V3 NUMERIC,
	V4 NUMERIC,
	V5 NUMERIC,
	V6 NUMERIC,
	V7 NUMERIC,
	V8 NUMERIC,
	V9 NUMERIC,
	V10 NUMERIC,
	V11 NUMERIC,
	V12 NUMERIC,
	V13 NUMERIC,
	V14 NUMERIC,
	V15 NUMERIC,
	V16 NUMERIC,
	V17 NUMERIC,
	V18 NUMERIC,
	V19 NUMERIC,
	V20 NUMERIC,
	V21 NUMERIC,
	V22 NUMERIC,
	V23 NUMERIC,
	V24 NUMERIC,
	V25 NUMERIC,
	V26 NUMERIC,
	V27 NUMERIC,
	V28 NUMERIC,
	Amount Numeric,
	class INT
);
"""

connection = psycopg2.connect(**connection_params)

# With autocommit on, the DDL takes effect as soon as it executes;
# no explicit commit is required.
connection.autocommit = True

try:
    # Create a cursor object
    cursor = connection.cursor()
    try:
        cursor.execute(create_credit_card_table)
    finally:
        cursor.close()
finally:
    # Close the connection even if the DDL statement failed
    connection.close()

In [None]:
# Check the distribution of the target variable (class)
sns.countplot(x = 'class', data = cleaned_learning_df)
plt.title('Prediction of Fraudulent Transactions')
plt.show()

# Split the data into features (X) and target variable (y)
# NOTE(review): every column except 'class' is used as a feature. If the CSV
# carries the same `id` column as the SQL table defined earlier, confirm that
# a row identifier is really meant to be a model input.
X = cleaned_learning_df.drop('class', axis = 1)
y = cleaned_learning_df['class']

# Split BEFORE scaling so that the test rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same train-fitted scaling; the PCA scatter
# plot in the following cell reads X_scaled.
X_scaled = scaler.transform(X)

# Build a logistic regression model
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model (the confusion matrix is computed once and reused below)
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
print(cm)

# Plot confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Create a Scatter plot for different perspective

# Project the scaled features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the two principal components, coloured by class
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y,
    palette='coolwarm',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Scatter plot of Credit Card Transactions (2D PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Class', loc='upper right')
plt.show()

# Hold out 20% of the scaled rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Fit a logistic regression classifier on the training partition
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report the standard metrics
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Model 1 - Report Explained
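Precision: Precision measures the accuracy of the predictions made for a class. For class 0, it is the proportion of correctly predicted instances among all instances predicted as class 0. Similarly, for class 1, it is the proportion of correctly predicted instances among all instances predicted as class 1. In this report, both classes show a precision of 1.00, but that figure is rounded to two decimal places: according to the confusion matrix described at the end of this report, precision is about 0.998 for class 0 (122 class 1 instances were predicted as class 0) and about 0.999 for class 1 (65 class 0 instances were predicted as class 1), so nearly all, but not all, predictions were correct.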

Recall: Recall, also known as sensitivity, measures the ability of the model to capture all actual instances of a class. For class 0, it is the proportion of correctly predicted instances of class 0 among all actual instances of class 0. Similarly, for class 1, it is the proportion of correctly predicted instances of class 1 among all actual instances of class 1. Like precision, both classes show a recall of 1.00 after rounding; the unrounded values are about 0.999 for class 0 (65 of 56,734 instances missed) and about 0.998 for class 1 (122 of 56,992 instances missed), so the model identified almost all instances of both classes.

F1-score: The F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall and is particularly useful when the classes are imbalanced (in this test split the two classes are almost equally represented, so accuracy is informative as well). Like precision and recall, the F1-score ranges from 0 to 1, with higher values indicating better performance. In this report, both classes have an F1-score of 1.00 after rounding (about 0.998 unrounded), indicating that precision and recall are both very high.

Support: Support represents the number of actual occurrences of each class in the data being evaluated, which here is the 20% test split rather than the whole dataset. For class 0, there are 56,734 instances, and for class 1, there are 56,992 instances.

Accuracy: Accuracy measures the overall correctness of the model's predictions across all classes. In this case, the model achieved an accuracy of 1.00 when rounded to two decimal places; the unrounded value is 113,539 / 113,726 ≈ 0.998, meaning 187 of the test instances were misclassified.

Additionally, the confusion matrix provided at the end of the report summarizes the model's predictions. In this case, the model made 56,669 correct predictions for class 0 and 56,870 correct predictions for class 1. It incorrectly classified 65 instances of class 0 as class 1 and 122 instances of class 1 as class 0.

In [None]:
# Number of samples per class in the training split.
# NOTE(review): the original heading said "before random oversampling", but no
# resampling step appears in the cells visible here — confirm whether one was intended.
y_train.value_counts()

Decision Tree model

In [None]:
# Separate the target column from the feature columns
y = cleaned_learning_df['class']
X = cleaned_learning_df.drop(columns='class')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only,
# then apply that same scaling to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, trained on the scaled training rows.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=X_train_scaled, y=y_train)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [None]:
# Predict class labels for the held-out rows first, then for the training rows
y_test_pred, y_train_pred = (
    dt.predict(part) for part in (X_test_scaled, X_train_scaled)
)

In [None]:
from sklearn.metrics import accuracy_score

# Class predictions for both partitions (training included for comparison)
y_pred_test = dt.predict(X_test_scaled)
y_pred_train = dt.predict(X_train_scaled)

# Fraction of correctly classified rows in each partition
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)

print("Accuracy on Test Set:", accuracy_test)
print("Accuracy on Training Set:", accuracy_train)

In [None]:
import joblib

# Persist the fitted decision tree to disk.
# NOTE(review): joblib writes its own pickle-based format, so the .h5 suffix
# does not make this an HDF5 file — confirm the name is what loaders expect.
# NOTE(review): only the classifier is saved; the tree was trained on scaled
# features, and the scaler is not persisted here — verify how inference code
# reproduces the same scaling.
joblib.dump(value=dt, filename='decision_tree_model.h5')

Logistic Regression

# Check the distribution of the target variable (class)
sns.countplot(x='class', data=cleaned_learning_df)
plt.title('Logistic Regression Prediction of Fraudulent Transactions')
plt.show()

# Split the data - features (X) and target variable (y)
X = cleaned_learning_df.drop('class', axis=1)
y = cleaned_learning_df['class']

# Split BEFORE scaling so that the test rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same train-fitted scaling (used for PCA below)
X_scaled = scaler.transform(X)

# Build a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = model.score(X_test, y_test)

# Plot confusion matrix (computed once and reused for the printout below)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

# Create a Scatter plot for different perspective

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the two principal components
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='coolwarm', alpha=0.6)
plt.title('Logistic Regression Scatter plot of Credit Card Transactions (2D PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Class', loc='upper right')
plt.show()

# Print the accuracy, classification report and confusion matrix computed above
print(f"Logistic Regression Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
print(cm)

Report
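Precision: Precision measures the accuracy of the predictions made for a class. For class 0, it is the proportion of correctly predicted instances among all instances predicted as class 0. Similarly, for class 1, it is the proportion of correctly predicted instances among all instances predicted as class 1. In this report, both classes show a precision of 1.00, but that figure is rounded to two decimal places: according to the confusion matrix described at the end of this report, precision is about 0.998 for class 0 (122 class 1 instances were predicted as class 0) and about 0.999 for class 1 (65 class 0 instances were predicted as class 1), so nearly all, but not all, predictions were correct.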

Recall: Recall, also known as sensitivity, measures the ability of the model to capture all actual instances of a class. For class 0, it is the proportion of correctly predicted instances of class 0 among all actual instances of class 0. Similarly, for class 1, it is the proportion of correctly predicted instances of class 1 among all actual instances of class 1. Like precision, both classes show a recall of 1.00 after rounding; the unrounded values are about 0.999 for class 0 (65 of 56,734 instances missed) and about 0.998 for class 1 (122 of 56,992 instances missed), so the model identified almost all instances of both classes.

F1-score: The F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall and is particularly useful when the classes are imbalanced (in this test split the two classes are almost equally represented, so accuracy is informative as well). Like precision and recall, the F1-score ranges from 0 to 1, with higher values indicating better performance. In this report, both classes have an F1-score of 1.00 after rounding (about 0.998 unrounded), indicating that precision and recall are both very high.

Support: Support represents the number of actual occurrences of each class in the data being evaluated, which here is the 20% test split rather than the whole dataset. For class 0, there are 56,734 instances, and for class 1, there are 56,992 instances.

Accuracy: Accuracy measures the overall correctness of the model's predictions across all classes. In this case, the model achieved an accuracy of 1.00 when rounded to two decimal places; the unrounded value is 113,539 / 113,726 ≈ 0.998, meaning 187 of the test instances were misclassified.

Additionally, the confusion matrix provided at the end of the report summarizes the model's predictions. In this case, the model made 56,669 correct predictions for class 0 and 56,870 correct predictions for class 1. It incorrectly classified 65 instances of class 0 as class 1 and 122 instances of class 1 as class 0.

In [None]:
# Check the distribution of the target variable (class)
sns.countplot(x = 'class', data = cleaned_learning_df)
plt.title('Prediction of Fraudulent Transactions')
plt.show()

# Split the data into features (X) and target variable (y)
X = cleaned_learning_df.drop('class', axis=1)
y = cleaned_learning_df['class']

# Split BEFORE scaling so that the test rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same train-fitted scaling, kept under its
# original name for any code that still expects it
X_scaled = scaler.transform(X)

# Candidate iteration caps, defined once so the loop and the summary agree
max_iter_options = [100, 500, 1000]

# Initialise lists to store optimisation results (one entry per candidate)
accuracy_list = []
conf_matrix_list = []
model_list = []

# Perform model optimization
# NOTE(review): the configuration is chosen using the test set, so the
# reported "best" score is optimistic — consider a validation split.
for max_iter in max_iter_options:

    # Build a logistic regression model
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, y_train)
    model_list.append(model)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = model.score(X_test, y_test)
    accuracy_list.append(accuracy)
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix_list.append(conf_matrix)

    print(f"Max Iterations: {max_iter}")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}\n")

# Index of the highest accuracy; ties resolve to the earliest candidate
best_index = accuracy_list.index(max(accuracy_list))
best_max_iter = max_iter_options[best_index]

# Report on the estimator that actually achieved the best accuracy, not on
# `model`, which is simply the last one fitted in the loop.
best_model = model_list[best_index]

# Display overall model performance
print(f"Best Max Iterations: {best_max_iter}")
print("Overall Model Performance:")
print(classification_report(y_test, best_model.predict(X_test)))
print(f"Overall Accuracy: {accuracy_list[best_index]}")
print(f"Overall Confusion Matrix:\n{conf_matrix_list[best_index]}")

# Plot confusion matrix of the best performing model
plt.figure()
sns.heatmap(conf_matrix_list[best_index], annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Best Performing Model)')
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column
ax = sns.countplot(data=cleaned_learning_df, x='class')
ax.set_title('Prediction of Fraudulent Transactions')
plt.show()

In [None]:
# Split the data into features (X) and target variable (y)
X = cleaned_learning_df.drop('class', axis=1)
y = cleaned_learning_df['class']

# Split BEFORE scaling so that the test rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same train-fitted scaling, kept under its
# original name for any code that still expects it
X_scaled = scaler.transform(X)

# Initialise lists to store optimisation results (one entry per candidate)
accuracy_list = []
conf_matrix_list = []

# Perform model optimization
# NOTE(review): each candidate is scored on the test set, so a configuration
# picked from these scores will look optimistic — consider a validation split.
for max_iter in [100, 500, 1000]:

    # Build a logistic regression model
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = model.score(X_test, y_test)
    accuracy_list.append(accuracy)
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix_list.append(conf_matrix)

    print(f"Max Iterations: {max_iter}")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}\n")

In [None]:
# Display overall model performance

# Candidate values in the same order the optimisation loop evaluated them
max_iter_options = [100, 500, 1000]

# Index of the highest accuracy; ties resolve to the earliest candidate
best_index = accuracy_list.index(max(accuracy_list))
best_max_iter = max_iter_options[best_index]

# `model` is whichever estimator the loop fitted last, which is not
# necessarily the best one. Reuse it only when its setting matches the
# winner; otherwise refit the winning configuration on the same training data
# so the report below describes the model it claims to describe.
if model.max_iter == best_max_iter:
    best_model = model
else:
    best_model = LogisticRegression(max_iter=best_max_iter)
    best_model.fit(X_train, y_train)

print(f"Best Max Iterations: {best_max_iter}")
print("Overall Model Performance:")
print(classification_report(y_test, best_model.predict(X_test)))
print(f"Overall Accuracy: {accuracy_list[best_index]}")
print(f"Overall Confusion Matrix:\n{conf_matrix_list[best_index]}")

# Plot confusion matrix of the best performing model
plt.figure()
sns.heatmap(conf_matrix_list[best_index], annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Best Performing Model)')
plt.show()