In [1]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import tensorflow as tf

# Load the dataset
dataset1 = pd.read_csv('dataset1-MLG/creditcard.csv')
dataset1.drop_duplicates(inplace=True)

# Ensure 'Time' and 'Amount' columns are numeric
numeric_columns = ['Time', 'Amount']
for col in numeric_columns:
    dataset1[col] = pd.to_numeric(dataset1[col], errors='coerce')

# Drop rows with missing values
dataset1.dropna(inplace=True)

# Define features (X) and target (y)
X = dataset1.drop(columns=['Class'])
y = dataset1['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Check the shape of the data
num_features = X_train_resampled.shape[1]
print(f"Number of features after SMOTE: {num_features}")

# Define the new shape for CNN input
new_shape = (5, 6, 1)  

# Reshape the data to fit the CNN input requirements
X_train_reshaped = X_train_resampled.reshape(-1, *new_shape)
X_test_reshaped = X_test_scaled.reshape(-1, *new_shape)

# Define a custom callback to test the best threshold
class BestThresholdCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        y_proba = self.model.predict(X_test_reshaped)
        threshold = 0.9  # Optimal threshold
        y_pred = (y_proba > threshold).astype(int)
        print(f"\nEpoch {epoch + 1} - Classification Report at Threshold {threshold}:")
        print(classification_report(y_test, y_pred))

# Define the CNN model
model = Sequential([
    Conv2D(32, (2, 2), activation='relu', input_shape=new_shape),
    MaxPooling2D((2, 2)), 
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # For binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with the custom callback
history = model.fit(
    X_train_reshaped, y_train_resampled,
    epochs=11,
    batch_size=32,
    validation_data=(X_test_reshaped, y_test),
    callbacks=[BestThresholdCallback()]
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Final Test Accuracy: {accuracy:.4f}")


Number of features after SMOTE: 30
Epoch 1/11


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 591us/step

Epoch 1 - Classification Report at Threshold 0.9:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     70814
           1       0.76      0.80      0.78       118

    accuracy                           1.00     70932
   macro avg       0.88      0.90      0.89     70932
weighted avg       1.00      1.00      1.00     70932

[1m9959/9959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9452 - loss: 0.1534 - val_accuracy: 0.9945 - val_loss: 0.0361
Epoch 2/11
[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 577us/step

Epoch 2 - Classification Report at Threshold 0.9:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     70814
           1       0.68      0.80      0.73       118

    accuracy                           1.00     70932
   macro avg       0.84      0

: 