In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import tensorflow as tf

# Load the datasets
train_transactions = pd.read_csv('dataset2-ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('dataset2-ieee-fraud-detection/train_identity.csv')

# Merge the datasets on the index
train = train_transactions.merge(train_identity, how='left', left_index=True, right_index=True)

# Extract the target variable
y = train['isFraud'].astype('uint8')

# Check and remove duplicate rows
train.drop_duplicates(inplace=True)

# Convert 'TransactionDT' to datetime format 
train['TransactionDT'] = pd.to_datetime(train['TransactionDT'], unit='s', errors='coerce')

# Ensure all columns that should be numeric are converted
numeric_columns = train.select_dtypes(include=['object']).columns
for col in numeric_columns:
    train[col] = pd.to_numeric(train[col], errors='coerce')

# Drop rows where 'isFraud' is NaN
train = train.dropna(subset=['isFraud'])

# Drop unnecessary columns or those not needed for modeling
columns_to_drop = ['TransactionDT', 'ProductCD']
train.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Define features and target variable
X = train.drop(columns=['isFraud'])
y = train['isFraud']

# Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Handle missing values
X = X.fillna(-999)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy=0.4, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Reshape the data to fit the CNN input requirements
num_features = X_train_resampled.shape[1]

new_shape = (16, 27, 1)


X_train_reshaped = X_train_resampled.reshape(-1, *new_shape)
X_test_reshaped = X_test_scaled.reshape(-1, *new_shape)

# Define a custom callback to test only the best threshold
class BestThresholdCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        y_proba = self.model.predict(X_test_reshaped)
        threshold = 0.9  # Optimal threshold
        y_pred = (y_proba > threshold).astype(int)
        print(f"\nEpoch {epoch + 1} - Classification Report at Threshold {threshold}:")
        print(classification_report(y_test, y_pred))

# Define the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=new_shape),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # For binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with the custom callback
history = model.fit(
    X_train_reshaped, y_train_resampled,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_reshaped, y_test),
    callbacks=[BestThresholdCallback()]
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Final Test Accuracy: {accuracy:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4614/4614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step

Epoch 1 - Classification Report at Threshold 0.9:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98    142469
           1       0.77      0.14      0.24      5166

    accuracy                           0.97    147635
   macro avg       0.87      0.57      0.61    147635
weighted avg       0.96      0.97      0.96    147635

[1m18700/18700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 7ms/step - accuracy: 0.7782 - loss: 0.4802 - val_accuracy: 0.9332 - val_loss: 0.2533
Epoch 2/10
[1m4614/4614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step

Epoch 2 - Classification Report at Threshold 0.9:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98    142469
           1       0.80      0.13      0.23      5166

    accuracy                           0.97    147635
   macro avg      

: 