In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, precision_recall_curve, average_precision_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight

# Load and preprocess the dataset
dataset3 = pd.read_csv('dataset3- Paysim/PS_20174392719_1491204439457_log.csv')
dataset3.drop_duplicates(inplace=True)

# Ensure numerical columns are of numeric type
numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for col in numeric_columns:
    dataset3[col] = pd.to_numeric(dataset3[col], errors='coerce')

# Drop rows where 'isFraud' is NaN
dataset3 = dataset3.dropna(subset=['isFraud'])

# Convert categorical column 'type' into dummy/indicator variables
if 'type' in dataset3.columns:
    dataset3 = pd.get_dummies(dataset3, columns=['type'], drop_first=True)

# Create new features
dataset3['balanceDiff'] = dataset3['oldbalanceOrg'] - dataset3['newbalanceOrig']
dataset3['balanceRatio'] = dataset3['newbalanceOrig'] / (dataset3['oldbalanceOrg'] + 1e-9)
dataset3['avgAmountOrig'] = dataset3.groupby('nameOrig')['amount'].transform('mean')
dataset3['avgAmountDest'] = dataset3.groupby('nameDest')['amount'].transform('mean')

# Fill any remaining NaN values
dataset3.fillna(0, inplace=True)

# Define features (X) and target (y)
X = dataset3.drop(columns=['isFraud', 'nameOrig', 'nameDest'])
y = dataset3['isFraud'].astype(int)

# Split the data into training and testing sets
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scale numeric features
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance on training data
smote = SMOTE(sampling_strategy=0.4, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_full_scaled, y_train_full)

# Ensure y_train_resampled is a numpy array of int type
y_train_resampled = np.array(y_train_resampled).astype(int)

# Reshape data for CNN
new_shape = (3, 5, 1)  # Adjust based on the number of features
X_train_resampled_cnn = X_train_resampled.reshape(-1, *new_shape)
X_test_cnn = X_test_scaled.reshape(-1, *new_shape)

# Compute class weights for the full resampled training data
class_weights_values = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_resampled),
    y=y_train_resampled
)
class_weights = dict(zip(np.unique(y_train_resampled), class_weights_values))

# Define the custom callback for dynamic thresholding
class DynamicThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, X_val, y_val):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        y_proba = self.model.predict(self.X_val).flatten()
        precision, recall, thresholds = precision_recall_curve(self.y_val, y_proba)
        f1_scores = 2 * precision * recall / (precision + recall + 1e-9)
        best_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
        y_pred = (y_proba >= best_threshold).astype(int)
        print(f"\nEpoch {epoch + 1} - Best Threshold: {best_threshold:.4f}")
        print(classification_report(self.y_val, y_pred))

# Implement early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Define CNN model with Input layer to fix the UserWarning
model_cnn = Sequential([
    Input(shape=new_shape),
    Conv2D(32, (2, 2), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model with class weights
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN model with the custom callback and early stopping
model_cnn.fit(
    X_train_resampled_cnn, y_train_resampled,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_cnn, y_test),
    class_weight=class_weights,
    callbacks=[DynamicThresholdCallback(X_test_cnn, y_test), early_stopping]
)

# Train XGBoost model with adjusted scale_pos_weight
scale_pos_weight = class_weights[0] / class_weights[1]

clf_xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.2,
    subsample=0.75,
    colsample_bytree=0.75,
    random_state=42,
    scale_pos_weight=scale_pos_weight  # Adjust scale_pos_weight
)
clf_xgb.fit(X_train_resampled, y_train_resampled)

# Generate meta-features using cross-validation
meta_train = np.zeros((X_train_resampled.shape[0], 2))
meta_test = np.zeros((X_test_scaled.shape[0], 2))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
    print(f"Training fold {fold + 1}...")
    X_tr, X_val = X_train_resampled[train_idx], X_train_resampled[val_idx]
    y_tr, y_val = y_train_resampled[train_idx], y_train_resampled[val_idx]

    # Ensure y_tr and y_val are numpy arrays of int type
    y_tr = np.array(y_tr).astype(int)
    y_val = np.array(y_val).astype(int)

    # Check unique classes in y_tr
    unique_classes = np.unique(y_tr)
    print(f"Unique classes in y_tr: {unique_classes}")

    if len(unique_classes) < 2:
        print(f"Only one class present in y_tr for fold {fold + 1}. Skipping this fold.")
        continue

    # Compute class weights for the current fold
    class_weights_fold_values = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=unique_classes,
        y=y_tr
    )
    class_weights_fold = dict(zip(unique_classes, class_weights_fold_values))

    # Adjust scale_pos_weight for XGBoost
    scale_pos_weight_fold = class_weights_fold[0] / class_weights_fold[1]

    # Reshape data for CNN
    X_tr_cnn = X_tr.reshape(-1, *new_shape)
    X_val_cnn = X_val.reshape(-1, *new_shape)

    # Train base models
    # CNN model
    model_cnn_fold = tf.keras.models.clone_model(model_cnn)
    model_cnn_fold.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_cnn_fold.fit(
        X_tr_cnn, y_tr,
        epochs=10,
        batch_size=32,
        class_weight=class_weights_fold,
        verbose=0
    )

    # XGBoost model
    clf_xgb_fold = XGBClassifier(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.2,
        subsample=0.75,
        colsample_bytree=0.75,
        random_state=42,
        scale_pos_weight=scale_pos_weight_fold
    )
    clf_xgb_fold.fit(X_tr, y_tr)

    # Predict on validation fold
    cnn_val_preds = model_cnn_fold.predict(X_val_cnn).flatten()
    xgb_val_preds = clf_xgb_fold.predict_proba(X_val)[:, 1]

    # Store predictions as meta-features
    meta_train[val_idx, 0] = cnn_val_preds
    meta_train[val_idx, 1] = xgb_val_preds

    # Predict on test set and average
    cnn_test_preds = model_cnn_fold.predict(X_test_cnn).flatten()
    xgb_test_preds = clf_xgb_fold.predict_proba(X_test_scaled)[:, 1]
    meta_test[:, 0] += cnn_test_preds / skf.n_splits
    meta_test[:, 1] += xgb_test_preds / skf.n_splits

# Ensure meta_train and meta_test are properly populated
if np.any(np.isnan(meta_train)) or np.any(np.isnan(meta_test)):
    print("NaN values found in meta features. Please check the cross-validation loop.")
else:
    # Train meta-model
    meta_model = LogisticRegression(max_iter=10000, random_state=42)
    meta_model.fit(meta_train, y_train_resampled)

    # Evaluate the ensemble model
    ensemble_proba = meta_model.predict_proba(meta_test)[:, 1]

    # Find optimal threshold for ensemble model
    precision, recall, thresholds = precision_recall_curve(y_test, ensemble_proba)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-9)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    ensemble_preds = (ensemble_proba >= best_threshold).astype(int)

    print(f"\nEnsemble Model - Best Threshold: {best_threshold:.4f}")
    print(classification_report(y_test, ensemble_preds))

    # Compute Average Precision Score
    average_precision = average_precision_score(y_test, ensemble_proba)
    print(f'Average Precision-Recall Score: {average_precision:.4f}')


Epoch 1/50
[1m49708/49708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 979us/step

Epoch 1 - Best Threshold: 0.9979
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1588602
           1       0.84      0.62      0.71      2053

    accuracy                           1.00   1590655
   macro avg       0.92      0.81      0.86   1590655
weighted avg       1.00      1.00      1.00   1590655

[1m208504/208504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1064s[0m 5ms/step - accuracy: 0.9522 - loss: 0.1148 - val_accuracy: 0.9678 - val_loss: 0.0645
Epoch 2/50
[1m49708/49708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 731us/step

Epoch 2 - Best Threshold: 0.9956
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1588602
           1       0.88      0.66      0.76      2053

    accuracy                           1.00   1590655
   macro avg       0.94      0.83      0.

: 