# Load Data

In [1]:
import pandas as pd
import os

def load_data(file_name, data_type = None):
    """
    Reads a CSV file from the 'data' directory into a pandas DataFrame.

    Args:
        file_name (str): The name of the CSV file to load.

    Returns:
        pandas.DataFrame: The loaded data.
    """
    # Construct the relative path from the notebook to the data folder
    # Notebook is in 'tests/notebooks/', so we go up two levels
    base_path = os.path.abspath(os.path.join(os.path.dirname(file_name), '..', '..'))
    if data_type == 'synthetic_train':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    elif data_type == 'synthetic_test':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTest/', file_name)
    elif data_type == 'ieee_fraud_detection':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    else:
        raise ValueError('data_type must be either synthetic_train, synthetic_test or ieee_fraud_detection')

    if not os.path.exists(data_path):
        print(f"Error: File not found at {data_path}")
        return None

    return pd.read_csv(data_path)


In [2]:
def load_ieee_data(file_name, data_type = None):
    """
    Reads a CSV file from the 'data' directory into a pandas DataFrame.

    Args:
        file_name (str): The name of the CSV file to load.

    Returns:
        pandas.DataFrame: The loaded data.
    """
    # Construct the relative path from the notebook to the data folder
    # Notebook is in 'tests/notebooks/', so we go up two levels
    base_path = os.path.abspath(os.path.join(os.path.dirname(file_name), '..', '..'))
    if data_type == 'synthetic_train':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    elif data_type == 'synthetic_test':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTest/', file_name)
    elif data_type == 'ieee_fraud_detection':
        data_path = os.path.join(base_path, 'data/input_data/synth_data/fraudTrain/', file_name)
    else:
        raise ValueError('data_type must be either synthetic_train, synthetic_test or ieee_fraud_detection')

    if not os.path.exists(data_path):
        print(f"Error: File not found at {data_path}")
        return None

    return pd.read_csv(data_path)

In [3]:
import pandas as pd
import zipfile
import os

def read_from_zip(file_name, zip_filename="ieee-fraud-detection.zip"):
    """
    Reads a specific CSV file from a zip archive located in the 'data/' directory.

    Args:
        file_name (str): The name of the CSV to load (e.g., 'train_transaction').
        zip_filename (str): The name of the zip archive.

    Returns:
        pandas.DataFrame: The loaded data, or None if the file is not found.
    """
    # Construct a relative path to the data directory.
    # This assumes your 'data' folder is at the project root.
    # From 'tests/notebooks/', we go up two levels to the project root.
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    zip_path = os.path.join(project_root, 'data/input_data/ieee_fraud_detection', zip_filename)

    csv_file_to_read = f"{file_name}.csv"

    if not os.path.exists(zip_path):
        print(f"Error: Zip file not found at '{zip_path}'")
        return None

    try:
        # Open the zip file
        with zipfile.ZipFile(zip_path) as z:
            # Open the specific CSV file within the zip
            with z.open(csv_file_to_read) as f:
                print(f"Reading '{csv_file_to_read}' from '{zip_filename}'...")
                # Read the dataset using pandas
                df = pd.read_csv(f)
                print("Data loaded successfully.")
                return df
    except KeyError:
        print(f"Error: File '{csv_file_to_read}' not found inside the zip archive.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [16]:
import pandas as pd
from sklearn.utils import shuffle as shuffle_data

def create_train_test_sets(
    train_transaction_df,
    train_identity_df,
    test_transaction_df,
    test_identity_df,
    shuffle=False,
    random_state=None
):
    """
    Merges transaction and identity data, creates train/test sets, and optionally shuffles the training data.

    Args:
        train_transaction_df (pd.DataFrame): The training transaction data.
        train_identity_df (pd.DataFrame): The training identity data.
        test_transaction_df (pd.DataFrame): The test transaction data.
        test_identity_df (pd.DataFrame): The test identity data.
        shuffle (bool): If True, shuffles the training data (X_train, y_train). Defaults to False.
        random_state (int): The seed for the random number generator used for shuffling.
                              Ensures reproducibility if shuffle is True.

    Returns:
        tuple: A tuple containing the final DataFrames: (X_train, y_train, X_test).
    """
    # Merge the training dataframes on TransactionID
    print("Merging training data...")
    # Using a left merge keeps all transactions, even if they don't have identity info.
    train_df = pd.merge(train_transaction_df, train_identity_df, on='TransactionID', how='left')
    print(f"Training data merged. Shape: {train_df.shape}")

    # Merge the test dataframes on TransactionID
    print("Merging test data...")
    test_df = pd.merge(test_transaction_df, test_identity_df, on='TransactionID', how='left')
    print(f"Test data merged. Shape: {test_df.shape}")

    # Create training features (X_train) and target (y_train)
    print("Creating training labels (y_train) and features (X_train)...")
    y_train = train_df['isFraud']
    # Drop the target variable to create the feature set
    X_train = train_df.drop('isFraud', axis=1)

    # The entire merged test dataframe is the test feature set
    X_test = test_df
    print("Feature and target sets created.")

    # Optionally shuffle the training data for randomization
    if shuffle:
        if random_state is not None:
            print(f"Shuffling X_train and y_train with random_state={random_state}...")
            X_train, y_train = shuffle_data(X_train, y_train, random_state=random_state)
            print("Shuffling complete.")
        else:
            # It's good practice to warn the user if the shuffle isn't reproducible
            print("Warning: Shuffle is True but no random_state was provided. Shuffling will not be reproducible.")
            X_train, y_train = shuffle_data(X_train, y_train)

    # Clean up to free memory
    del train_df, test_df, train_transaction_df, train_identity_df, test_transaction_df, test_identity_df

    return X_train, y_train, X_test

In [99]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_features(X_train, X_test):
    """
    Identifies and label encodes categorical features across training and test sets.

    Args:
        X_train (pd.DataFrame): The training feature set.
        X_test (pd.DataFrame): The test feature set.

    Returns:
        tuple: A tuple containing the encoded DataFrames: (X_train_encoded, X_test_encoded).
    """
    print("Encoding categorical features...")
    # Make copies to avoid modifying the original dataframes
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    # Identify categorical columns (those with 'object' dtype)
    categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
    print(f"Found {len(categorical_cols)} categorical columns to encode.")

    for col in categorical_cols:
        # Initialize a new LabelEncoder for each column
        le = LabelEncoder()

        # --- FIX STARTS HERE ---
        # Check if the column exists in the test set
        if col in X_test_encoded.columns:
            # Combine data from both train and test for a complete fitting
            combined_data = pd.concat([X_train_encoded[col], X_test_encoded[col]]).astype(str)
            le.fit(combined_data)

            # Transform both train and test data
            X_train_encoded[col] = le.transform(X_train_encoded[col].astype(str))
            X_test_encoded[col] = le.transform(X_test_encoded[col].astype(str))
        else:
            # If the column only exists in the training set
            print(f"Column '{col}' found in training set but not in test set. Encoding only for train.")
            le.fit(X_train_encoded[col].astype(str))
            X_train_encoded[col] = le.transform(X_train_encoded[col].astype(str))
        # --- FIX ENDS HERE ---

    print("Encoding complete.")
    return X_train_encoded, X_test_encoded



In [17]:
# print("Loading initial datasets...")
ieee_train_transaction_data = read_from_zip("train_transaction")
ieee_test_transaction_data = read_from_zip("test_transaction")
ieee_train_identity_data = read_from_zip("train_identity")
ieee_test_identity_data = read_from_zip("test_identity")
print("-" * 50)

Reading 'train_transaction.csv' from 'ieee-fraud-detection.zip'...
Data loaded successfully.
Reading 'test_transaction.csv' from 'ieee-fraud-detection.zip'...
Data loaded successfully.
Reading 'train_identity.csv' from 'ieee-fraud-detection.zip'...
Data loaded successfully.
Reading 'test_identity.csv' from 'ieee-fraud-detection.zip'...
Data loaded successfully.
--------------------------------------------------


# Transform

In [19]:
# 2. Check if all data loaded correctly before proceeding
if all(df is not None for df in [ieee_train_transaction_data, ieee_train_identity_data, ieee_test_transaction_data, ieee_test_identity_data]):

    # 3. Create the datasets with shuffling for model training
    print("Creating datasets with shuffling (random_state=42)...")
    X_train_shuffled, y_train_shuffled, X_test = create_train_test_sets(
        ieee_train_transaction_data,
        ieee_train_identity_data,
        ieee_test_transaction_data,
        ieee_test_identity_data,
        shuffle=True,
        random_state=42  # Using a fixed state ensures the shuffle is the same every time
    )

    print("\n--- Final Shapes ---")
    print(f"Shuffled X_train shape: {X_train_shuffled.shape}")
    print(f"Shuffled y_train shape: {y_train_shuffled.shape}")
    print(f"X_test shape: {X_test.shape}")

    print("\n--- Shuffled y_train Head ---")
    print(y_train_shuffled.head())
else:
    print("One or more dataframes failed to load. Aborting.")

Creating datasets with shuffling (random_state=42)...
Merging training data...
Training data merged. Shape: (590540, 434)
Merging test data...
Test data merged. Shape: (506691, 433)
Creating training labels (y_train) and features (X_train)...
Feature and target sets created.
Shuffling X_train and y_train with random_state=42...
Shuffling complete.

--- Final Shapes ---
Shuffled X_train shape: (590540, 433)
Shuffled y_train shape: (590540,)
X_test shape: (506691, 433)

--- Shuffled y_train Head ---
470624    0
565820    0
284083    0
239689    0
281855    0
Name: isFraud, dtype: int64


In [100]:
# This assumes you have already created X_train_shuffled, y_train_shuffled, and X_test


# 1. Apply the encoding function
X_train_final, X_test_final = encode_categorical_features(X_train_shuffled, X_test)

print("\n--- Data types after encoding (X_train_final) ---")
print(X_train_final.dtypes.value_counts())

print("\n--- Example of an encoded column ('P_emaildomain') ---")
print(X_train_final['P_emaildomain'].head())


Encoding categorical features...
Found 31 categorical columns to encode.
Column 'id_12' found in training set but not in test set. Encoding only for train.
Column 'id_15' found in training set but not in test set. Encoding only for train.
Column 'id_16' found in training set but not in test set. Encoding only for train.
Column 'id_23' found in training set but not in test set. Encoding only for train.
Column 'id_27' found in training set but not in test set. Encoding only for train.
Column 'id_28' found in training set but not in test set. Encoding only for train.
Column 'id_29' found in training set but not in test set. Encoding only for train.
Column 'id_30' found in training set but not in test set. Encoding only for train.
Column 'id_31' found in training set but not in test set. Encoding only for train.
Column 'id_33' found in training set but not in test set. Encoding only for train.
Column 'id_34' found in training set but not in test set. Encoding only for train.
Column 'id_35'

In [103]:
is_fraud_count = pd.Series(y_train_shuffled).where(lambda x:x == 1).count()

In [104]:
fraud_ratio = is_fraud_count/y_train_shuffled.shape[0]*100
print(f'The fraud ratio in y_train_shuffled: {round(fraud_ratio,5)}%')

The fraud ratio in y_train_shuffled: 3.499%


In [32]:
y_train_shuffled.info(verbose=True)

<class 'pandas.core.series.Series'>
Index: 590540 entries, 470624 to 121958
Series name: isFraud
Non-Null Count   Dtype
--------------   -----
590540 non-null  int64
dtypes: int64(1)
memory usage: 9.0 MB


In [6]:
synt_train_data = load_data('fraudTrain.zip', data_type = 'synthetic_train')

In [7]:
synt_train_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [38]:
synt_test_data = load_data('fraudTest.zip', data_type = 'synthetic_test')

In [39]:
synt_test_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,277859.0,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,555718.0,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [41]:
ieee_data = load_data('ieee_fraud_detection.zip/test_identity.csv', data_type = 'ieee_fraud_detection')


Error: File not found at C:\Users\Marco\PycharmProjects\fraud_detection_adversarial\tests\data/input_data/synth_data/fraudTrain/ieee_fraud_detection.zip/test_identity.csv


In [2]:

import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_describe_outputs(df):
    """
    Plots the main numerical outputs of the describe() method for a pandas DataFrame.

    Args:
        df: A pandas DataFrame.
    """
    describe_df = df.describe().transpose() # Transpose for easier plotting

    # Select columns that contain numerical describe statistics
    describe_numerical_cols = ['mean', 'std', 'min', 'max', '50%'] # Add or remove as needed

    # Filter for columns that exist in the describe output
    describe_numerical_cols = [col for col in describe_numerical_cols if col in describe_df.columns]

    if not describe_numerical_cols:
        print("No relevant numerical statistics found to plot from df.describe().")
        return

    # Create subplots
    num_plots = len(describe_numerical_cols)
    fig, axes = plt.subplots(nrows=num_plots, ncols=1, figsize=(10, 5 * num_plots))
    fig.tight_layout(pad=4.0) # Adjust spacing

    if num_plots == 1: # Handle the case of a single plot
        axes = [axes]

    for i, col in enumerate(describe_numerical_cols):
        sns.barplot(x=describe_df.index, y=describe_df[col], ax=axes[i])
        axes[i].set_title(f'{col.capitalize()} of Numerical Columns')
        axes[i].set_ylabel(col.capitalize())
        axes[i].set_xlabel('Numerical Columns')
        axes[i].tick_params(axis='x', rotation=45, right= True)

    plt.show()

In [87]:
X_train_shuffled.head(5)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
470624,3457624,12153579,724.0,W,7826,481.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
565820,3552820,15005886,108.5,W,12544,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,
284083,3271083,6970178,47.95,W,9400,111.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
239689,3226689,5673658,100.599,C,15885,545.0,185.0,visa,138.0,debit,...,,,,,,,,,,
281855,3268855,6886780,107.95,W,15497,490.0,150.0,visa,226.0,debit,...,,,,,,,,,,


In [57]:
print(X_train_shuffled['ProductCD'].unique())

['W' 'C' 'R' 'H' 'S']


In [89]:
X_train_shuffled.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 470624 to 121958
Data columns (total 433 columns):
 #    Column          Dtype  
---   ------          -----  
 0    TransactionID   int64  
 1    TransactionDT   int64  
 2    TransactionAmt  float64
 3    ProductCD       object 
 4    card1           int64  
 5    card2           float64
 6    card3           float64
 7    card4           object 
 8    card5           float64
 9    card6           object 
 10   addr1           float64
 11   addr2           float64
 12   dist1           float64
 13   dist2           float64
 14   P_emaildomain   object 
 15   R_emaildomain   object 
 16   C1              float64
 17   C2              float64
 18   C3              float64
 19   C4              float64
 20   C5              float64
 21   C6              float64
 22   C7              float64
 23   C8              float64
 24   C9              float64
 25   C10             float64
 26   C11             float64
 27   C12         

In [92]:
# columns_to_analyse = []
# for column in X_train_shuffled.columns:
#     if not isinstance(column, float):
        # columns_to_analyse.append(column)

columns_to_analyse = X_train_shuffled.select_dtypes(exclude=['float64'])
columns_to_analyse = [column for column in columns_to_analyse]

In [93]:
columns_to_analyse

['TransactionID',
 'TransactionDT',
 'ProductCD',
 'card1',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

In [94]:
dict_unique_values = {column: X_train_shuffled[column].unique() for column in columns_to_analyse}

In [95]:
dict_unique_values

{'TransactionID': array([3457624, 3552820, 3271083, ..., 3352838, 3118932, 3108958],
       shape=(590540,)),
 'TransactionDT': array([12153579, 15005886,  6970178, ...,  6213562,  9076295,  2614452],
       shape=(573349,)),
 'ProductCD': array(['W', 'C', 'R', 'H', 'S'], dtype=object),
 'card1': array([ 7826, 12544,  9400, ...,  3420,  3812, 12855], shape=(13553,)),
 'card4': array(['mastercard', 'visa', 'american express', 'discover', nan],
       dtype=object),
 'card6': array(['debit', 'credit', nan, 'debit or credit', 'charge card'],
       dtype=object),
 'P_emaildomain': array(['aol.com', 'yahoo.com', 'gmail.com', 'hotmail.com', 'icloud.com',
        nan, 'anonymous.com', 'mail.com', 'outlook.com', 'verizon.net',
        'bellsouth.net', 'hotmail.fr', 'msn.com', 'cox.net', 'comcast.net',
        'ymail.com', 'optonline.net', 'live.com', 'roadrunner.com',
        'att.net', 'aim.com', 'rocketmail.com', 'web.de', 'live.fr',
        'gmail', 'sbcglobal.net', 'hotmail.de', 'sc.rr.co

In [56]:
X_train_shuffled.query('select distinct(ProductCD)').head()


SyntaxError: invalid syntax (<unknown>, line 1)

# Training

In [109]:
# prompt: Help me Implement a State-of-the-Art Defender Model: The first objective is to build a high-performance "defender" system that represents a realistic target for an adversary. This will involve implementing a powerful classification model, such as a stacking ensemble of gradient boosting machines (e.g., XGBoost and LightGBM), which are known for their strong performance on tabular data. This model will serve as the "blue team" in the simulation. Based on the ieee_raw_train and test transaction and idendity datasets

# !pip install xgboost lightgbm

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

def build_stacking_ensemble_defender(df_train, df_test, y_train):
    """
    Builds and trains a stacking ensemble defender model using XGBoost and LightGBM.

    Args:
        df_train_trans: Training transaction DataFrame.
        df_train_id: Training identity DataFrame.
        df_test_trans: Testing transaction DataFrame.
        df_test_id: Testing identity DataFrame.

    Returns:
        The trained StackingClassifier model.
    """
    # Target variable
    y_train = y_train

    # Drop target and TransactionID from features
    X_train = df_train.drop(['TransactionID'], axis=1)
    X_test = df_test.drop(['TransactionID'], axis=1)

    # Align columns after merge and before training
    # This is important because the identity data might introduce new columns
    train_cols = X_train.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    missing_in_train = set(test_cols) - set(train_cols)
    for c in missing_in_train:
        X_train[c] = 0
    X_test = X_test[train_cols] # Ensure the order of columns is the same

    # Handle categorical features: Convert to 'category' dtype for LightGBM and one-hot encode for XGBoost/general.
    # For simplicity here, we'll use label encoding or let models handle directly where possible.
    # A more robust approach would involve more careful categorical feature handling.
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            # Simple label encoding for demonstration. Robust approach uses OneHotEncoding
            # or target encoding depending on the cardinality and model.
            # XGBoost and LightGBM can handle integers directly.
            X_train[col] = X_train[col].astype('category').cat.codes
            X_test[col] = X_test[col].astype('category').cat.codes

    # Handle missing values: Simple imputation with median.
    # A more robust approach would use more advanced imputation techniques or models that handle NaNs.
    for col in X_train.columns:
        if X_train[col].isnull().any():
            median_val = X_train[col].median()
            X_train[col] = X_train[col].fillna(median_val)
            if col in X_test.columns:
                X_test[col] = X_test[col].fillna(median_val)

    # Define base models
    # Using default or simple parameters, tuning is crucial for performance
    estimators = [
        ('xgb', xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', use_label_encoder=False, random_state=42)),
        ('lgbm', lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42)),
    ]

    # Define meta-model
    # Using Logistic Regression as a simple meta-model
    meta_model = LogisticRegression(solver='liblinear')

    # Define stacking classifier
    # Using StratifiedKFold for cross-validation in stacking
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=meta_model,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        stack_method='predict_proba', # Use probabilities for the meta-model
        verbose=1
    )

    print("Training the stacking ensemble model...")
    stacking_model.fit(X_train, y_train)
    print("Stacking ensemble model trained successfully.")

    return stacking_model, X_test # Return X_test as well for evaluation later


In [110]:
stacking_model, x_test = build_stacking_ensemble_defender(
    df_train = X_train_final,
    df_test = X_test_final,
    y_train = y_train_shuffled,
)


Training the stacking ensemble model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 20663, number of negative: 569877
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.321889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38667
[LightGBM] [Info] Number of data points in the train set: 590540, number of used features: 431
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034990 -> initscore=-3.317076
[LightGBM] [Info] Start training from score -3.317076


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min finished


[LightGBM] [Info] Number of positive: 16531, number of negative: 455901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.251721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38653
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 430
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034991 -> initscore=-3.317038
[LightGBM] [Info] Start training from score -3.317038
[LightGBM] [Info] Number of positive: 16531, number of negative: 455901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.241047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38678
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 430
[LightGBM]

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   49.3s finished


Stacking ensemble model trained successfully.


In [25]:
stacking_model

0,1,2
,estimators,"[('xgb', ...), ('lgbm', ...)]"
,final_estimator,LogisticRegre...r='liblinear')
,cv,StratifiedKFo... shuffle=True)
,stack_method,'predict_proba'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [27]:
stacking_model.estimators_

[XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='auc', feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...),
 LGBMClassifier(metric='auc', objective='binary', random_state=42)]

In [113]:
# Predict the class labels directly (0 or 1)
test_predictions = stacking_model.predict(X_test_final)

print("Successfully generated class predictions for the test set.")
print("Example predictions:", test_predictions[:5])

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:id-12: object, id-15: object, id-16: object, id-23: object, id-27: object, id-28: object, id-29: object, id-30: object, id-31: object, id-33: object, id-34: object, id-35: object, id-36: object, id-37: object, id-38: object