# Load Data

In [5]:
import pandas as pd
import os

def load_data(file_name, data_type = None):
    """
    Reads a CSV file from the 'data' directory into a pandas DataFrame.

    Args:
        file_name (str): The name of the CSV file to load.

    Returns:
        pandas.DataFrame: The loaded data.
    """
    # Construct the relative path from the notebook to the data folder
    # Notebook is in 'tests/notebooks/', so we go up two levels
    base_path = os.path.abspath(os.path.join(os.path.dirname(file_name), '..', '..'))
    if data_type == 'synthetic_train':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    elif data_type == 'synthetic_test':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTest/', file_name)
    elif data_type == 'ieee_fraud_detection':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    else:
        raise ValueError('data_type must be either synthetic_train, synthetic_test or ieee_fraud_detection')

    if not os.path.exists(data_path):
        print(f"Error: File not found at {data_path}")
        return None

    return pd.read_csv(data_path)


In [6]:
synt_train_data = load_data('fraudTrain.zip', data_type = 'synthetic_train')

In [7]:
synt_train_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [38]:
synt_test_data = load_data('fraudTest.zip', data_type = 'synthetic_test')

In [39]:
synt_test_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,277859.0,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,555718.0,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [41]:
ieee_data = load_data('ieee_fraud_detection.zip/test_identity.csv', data_type = 'ieee_fraud_detection')


Error: File not found at C:\Users\Marco\PycharmProjects\fraud_detection_adversarial\tests\data/input_data/synth_data/fraudTrain/ieee_fraud_detection.zip/test_identity.csv


In [2]:

import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_describe_outputs(df):
    """
    Plots the main numerical outputs of the describe() method for a pandas DataFrame.

    Args:
        df: A pandas DataFrame.
    """
    describe_df = df.describe().transpose() # Transpose for easier plotting

    # Select columns that contain numerical describe statistics
    describe_numerical_cols = ['mean', 'std', 'min', 'max', '50%'] # Add or remove as needed

    # Filter for columns that exist in the describe output
    describe_numerical_cols = [col for col in describe_numerical_cols if col in describe_df.columns]

    if not describe_numerical_cols:
        print("No relevant numerical statistics found to plot from df.describe().")
        return

    # Create subplots
    num_plots = len(describe_numerical_cols)
    fig, axes = plt.subplots(nrows=num_plots, ncols=1, figsize=(10, 5 * num_plots))
    fig.tight_layout(pad=4.0) # Adjust spacing

    if num_plots == 1: # Handle the case of a single plot
        axes = [axes]

    for i, col in enumerate(describe_numerical_cols):
        sns.barplot(x=describe_df.index, y=describe_df[col], ax=axes[i])
        axes[i].set_title(f'{col.capitalize()} of Numerical Columns')
        axes[i].set_ylabel(col.capitalize())
        axes[i].set_xlabel('Numerical Columns')
        axes[i].tick_params(axis='x', rotation=45, right= True)

    plt.show()

# Training

In [None]:
# prompt: Help me Implement a State-of-the-Art Defender Model: The first objective is to build a high-performance "defender" system that represents a realistic target for an adversary. This will involve implementing a powerful classification model, such as a stacking ensemble of gradient boosting machines (e.g., XGBoost and LightGBM), which are known for their strong performance on tabular data. This model will serve as the "blue team" in the simulation. Based on the ieee_raw_train and test transaction and idendity datasets

# !pip install xgboost lightgbm

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

def build_stacking_ensemble_defender(df_train_trans, df_train_id, df_test_trans, df_test_id):
    """
    Builds and trains a stacking ensemble defender model using XGBoost and LightGBM.

    Args:
        df_train_trans: Training transaction DataFrame.
        df_train_id: Training identity DataFrame.
        df_test_trans: Testing transaction DataFrame.
        df_test_id: Testing identity DataFrame.

    Returns:
        The trained StackingClassifier model.
    """
    # Merge the identity and transaction datasets
    df_train = pd.merge(df_train_trans, df_train_id, on='TransactionID', how='left')
    df_test = pd.merge(df_test_trans, df_test_id, on='TransactionID', how='left')

    # Target variable
    y_train = df_train['isFraud']

    # Drop target and TransactionID from features
    X_train = df_train.drop(['isFraud', 'TransactionID'], axis=1)
    X_test = df_test.drop(['TransactionID'], axis=1)

    # Align columns after merge and before training
    # This is important because the identity data might introduce new columns
    train_cols = X_train.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    missing_in_train = set(test_cols) - set(train_cols)
    for c in missing_in_train:
        X_train[c] = 0
    X_test = X_test[train_cols] # Ensure the order of columns is the same

    # Handle categorical features: Convert to 'category' dtype for LightGBM and one-hot encode for XGBoost/general.
    # For simplicity here, we'll use label encoding or let models handle directly where possible.
    # A more robust approach would involve more careful categorical feature handling.
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            # Simple label encoding for demonstration. Robust approach uses OneHotEncoding
            # or target encoding depending on the cardinality and model.
            # XGBoost and LightGBM can handle integers directly.
            X_train[col] = X_train[col].astype('category').cat.codes
            X_test[col] = X_test[col].astype('category').cat.codes

    # Handle missing values: Simple imputation with median.
    # A more robust approach would use more advanced imputation techniques or models that handle NaNs.
    for col in X_train.columns:
        if X_train[col].isnull().any():
            median_val = X_train[col].median()
            X_train[col] = X_train[col].fillna(median_val)
            if col in X_test.columns:
                X_test[col] = X_test[col].fillna(median_val)

    # Define base models
    # Using default or simple parameters, tuning is crucial for performance
    estimators = [
        ('xgb', xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', use_label_encoder=False, random_state=42)),
        ('lgbm', lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42)),
    ]

    # Define meta-model
    # Using Logistic Regression as a simple meta-model
    meta_model = LogisticRegression(solver='liblinear')

    # Define stacking classifier
    # Using StratifiedKFold for cross-validation in stacking
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=meta_model,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        stack_method='predict_proba' # Use probabilities for the meta-model
    )

    print("Training the stacking ensemble model...")
    stacking_model.fit(X_train, y_train)
    print("Stacking ensemble model trained successfully.")

    return stacking_model, X_test # Return X_test as well for evaluation later
