# Preparation of data

Here we will proceed to prepare the data to be fed into our models.

## Imports

In [61]:
import os
from pathlib import Path

import pandas as pd
import pandas.api.types as pdt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib 

from src import paths, utils
from src.io_ops import read_csv_safely


import scipy.stats as stats

## Load the data

In [62]:
# Path to the raw dataset file, resolved relative to the project's RAW_DATA_DIR.
BASE_PATH = paths.RAW_DATA_DIR / "Base.csv"

# read_csv_safely is a project helper from src.io_ops; its exact behaviour is not
# visible in this notebook — presumably a guarded wrapper around a CSV reader.
df_base = read_csv_safely(BASE_PATH)
# Show the column index so the available features are visible to the reader.
df_base.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

## Remove unwanted columns

As we checked in notebook 01, the column "device_fraud_count" is constant and therefore irrelevant for our model. We will remove it.

In [63]:

# Drop the constant column by reassignment instead of in-place mutation.
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df_base = df_base.drop(columns=["device_fraud_count"], errors="ignore")
df_base.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'month'],
      dtype='object')

In [64]:
# Sanity check of the (rows, columns) count of the frame at this point.
df_base.shape

(1000000, 31)

## Train, Test, Val split.

We will divide the data into 3 splits, using the month as the index for this division. We will first separate the categorical columns, as they have to be treated differently when fed into the model.

In [65]:
# Column roles used throughout the notebook.
categorical_cols = ['device_os', 'employment_status', 'housing_status', 'payment_type', 'source']
target_col = 'fraud_bool'
index_col = 'month'

# Every remaining column (not categorical, not the target, not the split index)
# is treated as numerical. Original column order is preserved.
non_numeric = {*categorical_cols, target_col, index_col}
num_cols = [name for name in df_base.columns if name not in non_numeric]
print(f"Numerical columns size: {len(num_cols)}")

Numerical columns size: 24


In [66]:
# Time-based split on the month index:
#   months 0-4 -> train, months 5-6 -> validation, month 7 -> test.
# .copy() gives each split its own data so later column assignments do not
# write into a view of df_base.
month_idx = df_base[index_col]
train_df = df_base[month_idx.between(0, 4)].copy()
val_df   = df_base[month_idx.between(5, 6)].copy()
test_df  = df_base[month_idx == 7].copy()

# Guard against silently losing rows: any row whose month is missing or outside
# 0-7 would otherwise fall into none of the three splits.
assert len(train_df) + len(val_df) + len(test_df) == len(df_base), (
    "Some rows have a month outside 0-7 (or missing) and were not assigned to any split"
)

Encoding categorical columns and scaling numerical columns.

In [67]:
# Fit one LabelEncoder per categorical column on the TRAINING split only and keep
# the fitted encoders in a dictionary so they can be reused later (e.g. inference).
#
# Validation/test values are translated through a label -> code lookup table built
# from the fitted classes. Compared with calling le.transform([x]) once per row,
# this is vectorised and yields the same codes. The same .astype(str) cast used
# for training is applied here, so all three splits are encoded consistently.
# Labels never seen during training are encoded as -1.
# NOTE(review): consumers that index with these codes (e.g. embedding layers)
# need to handle -1 explicitly — confirm downstream.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # LabelEncoder codes are the positions of the labels inside le.classes_.
    code_of_label = {str(label): code for code, label in enumerate(le.classes_)}
    for split_df in (val_df, test_df):
        split_df[col] = (
            split_df[col].astype(str).map(code_of_label).fillna(-1).astype("int64")
        )

    encoders[col] = le

In the dataset, negative values are used to indicate missing values. We will replace them with NaN and then impute them with the median of each column. We will also add indicator columns that flag whether a value was missing.

In [68]:
# Replace every negative value in the numeric columns with NaN, in all three
# splits, so the imputer defined below can fill them and flag them.
# NOTE(review): the mask is applied to ALL numeric columns. The missing-indicator
# columns listed further down include velocity_6h and credit_risk_score, i.e. these
# columns contain negative values in the training split — confirm that negatives
# there are really missing-value sentinels and not legitimate measurements.
for df in [train_df, val_df, test_df]:
    df[num_cols] = df[num_cols].mask(df[num_cols] < 0, np.nan)

In [69]:
# The median is used because it is more robust to outliers; the outliers themselves
# are kept on purpose, since what we want to detect (fraud) is an outlier.
# add_indicator=True appends a 0/1 column for each feature that had missing values.
median_imputer = SimpleImputer(strategy="median", add_indicator=True)
num_pipeline = Pipeline(steps=[
    ("imputer", median_imputer),
    ("scaler", MinMaxScaler()),
])

In [70]:
# Imputation statistics and scaling ranges are learned from the training split only,
# so no information from validation/test leaks into the preprocessing.
train_df_num = num_pipeline.fit_transform(train_df[num_cols])

# The already-fitted pipeline is then applied unchanged to the other two splits.
val_df_num, test_df_num = (
    num_pipeline.transform(split[num_cols]) for split in (val_df, test_df)
)


In [71]:
# Column names of the pipeline output: the original numeric columns followed by
# the "missingindicator_*" columns added by the imputer.
fitted_imputer = num_pipeline.named_steps["imputer"]
all_num_cols = fitted_imputer.get_feature_names_out(num_cols).tolist()
all_num_cols

['income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'missingindicator_prev_address_months_count',
 'missingindicator_current_address_months_count',
 'missingindicator_intended_balcon_amount',
 'missingindicator_velocity_6h',
 'missingindicator_credit_risk_score',
 'missingindicator_bank_months_count',
 'missingindicator_session_length_in_minutes',
 'missingindicator_device_distinct_emails_8w']

In [72]:
# Replace numerical data in DataFrames
def _assemble_split(split_df, numeric_values):
    """Combine the transformed numeric array with the encoded categoricals and target.

    The numeric array is wrapped in a DataFrame that reuses split_df's index so the
    column-wise concat aligns row by row. Only the categorical and target columns
    are carried over from split_df; the month index column is not kept.
    """
    numeric_part = pd.DataFrame(numeric_values, columns=all_num_cols, index=split_df.index)
    carried_over = split_df[categorical_cols + [target_col]]
    return pd.concat([numeric_part, carried_over], axis=1)


train_df = _assemble_split(train_df, train_df_num)
val_df = _assemble_split(val_df, val_df_num)
test_df = _assemble_split(test_df, test_df_num)


In [73]:
# Preview the first rows of the processed training split.
train_df.head()

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,missingindicator_credit_risk_score,missingindicator_bank_months_count,missingindicator_session_length_in_minutes,missingindicator_device_distinct_emails_8w,device_os,employment_status,housing_status,payment_type,source,fraud_bool
0,0.25,0.986507,0.072581,0.058824,0.375,8.8e-05,0.907016,0.157934,0.783455,0.79632,...,0.0,0.0,0.0,0.0,0,1,2,0,0,0
1,0.875,0.617426,0.072581,0.209412,0.125,0.000132,0.287371,0.24735,0.551761,0.53732,...,0.0,0.0,0.0,0.0,2,0,2,3,0,0
2,0.875,0.996708,0.010753,0.032941,0.375,0.000161,0.287371,0.163308,0.267475,0.503709,...,0.0,0.0,0.0,0.0,3,0,2,1,0,0
3,0.625,0.4751,0.016129,0.032941,0.25,9.1e-05,0.287371,0.519779,0.863381,0.661561,...,0.0,0.0,0.0,0.0,0,0,2,1,0,0
4,1.0,0.842307,0.072581,0.068235,0.375,0.074987,0.417437,0.349007,0.454735,0.460912,...,0.0,0.0,0.0,0.0,2,0,2,0,0,0


## Extracting the splits

In [74]:
# Write each processed split to the processed-data directory as a parquet file.
# index=False: the row index is not stored in the files.
for file_name, split_df in (
    ("train.parquet", train_df),
    ("val.parquet", val_df),
    ("test.parquet", test_df),
):
    split_df.to_parquet(paths.PROCESSED_DATA_DIR / file_name, index=False)


## Extra

In [75]:
# Optionally save the fitted encoders and the numeric imputing/scaling pipeline for later use.
# joblib.dump returns the list of written file names; the trailing ';' suppresses that
# echo so the saved notebook output does not expose an absolute local path.
joblib.dump(encoders, paths.PROCESSED_DATA_DIR / "label_encoders.joblib")
joblib.dump(num_pipeline, paths.PROCESSED_DATA_DIR / "num_pipeline.joblib");


['C:\\Users\\gonca\\OneDrive\\Ambiente de Trabalho\\MECD\\Ano_2\\Semestre1\\ACED\\AECD.Feedzai-Benchmarking-optimizers-and-learning-rate-schedulers\\data\\processed\\num_pipeline.joblib']

In [None]:
# To load the splits elsewhere, add this code to the beginning of any script where the data is needed.
# ============================================================
# Load preprocessed data splits and preprocessing objects
# ============================================================
import joblib
import pandas as pd
import torch
from pathlib import Path

from src import paths  # project path constants; makes the snippet self-contained

# Path setup
DATA_DIR = paths.PROCESSED_DATA_DIR
train_df = pd.read_parquet(DATA_DIR / "train.parquet")
val_df   = pd.read_parquet(DATA_DIR / "val.parquet")
test_df  = pd.read_parquet(DATA_DIR / "test.parquet")

# Load preprocessing artifacts (encoders + numeric pipeline).
# joblib files are pickles: only load artifacts produced by this project.
encoders = joblib.load(DATA_DIR / "label_encoders.joblib")
num_pipeline = joblib.load(DATA_DIR / "num_pipeline.joblib")  # <- instead of scaler only

# ============================================================
# Define columns
# ============================================================
categorical_cols = ['device_os', 'employment_status', 'housing_status', 'payment_type', 'source']
target_col = 'fraud_bool'

# The numeric columns after preprocessing (imputer + indicators).
# Note: this rebinds num_cols to the post-processing column names.
non_numeric = {*categorical_cols, target_col}
num_cols = [c for c in train_df.columns if c not in non_numeric]

print(f"Final numerical columns: {len(num_cols)}")

# ============================================================
# Convert to tensors
# ============================================================
def split_to_tensors(split_df, num_cols, categorical_cols, target_col):
    """Convert one processed split into (continuous, categorical, target) tensors.

    Continuous features and the target are float32; categorical codes are int64
    (torch.long).
    """
    x_cont = torch.tensor(split_df[num_cols].values, dtype=torch.float32)
    x_cat = torch.tensor(split_df[categorical_cols].values, dtype=torch.long)
    y = torch.tensor(split_df[target_col].values, dtype=torch.float32)
    return x_cont, x_cat, y


X_train_cont, X_train_cat, y_train = split_to_tensors(train_df, num_cols, categorical_cols, target_col)
X_val_cont, X_val_cat, y_val = split_to_tensors(val_df, num_cols, categorical_cols, target_col)
X_test_cont, X_test_cat, y_test = split_to_tensors(test_df, num_cols, categorical_cols, target_col)


Final numerical columns: 32


In [77]:
# Check the shapes of the training tensors (continuous, categorical, target).
X_train_cont.shape, X_train_cat.shape, y_train.shape

(torch.Size([675666, 32]), torch.Size([675666, 5]), torch.Size([675666]))

In [None]:
# if one wants to run inference on raw data, the following code can be used:
import pandas as pd
import numpy as np
import torch

def preprocess_for_inference(raw_df, encoders, num_pipeline, categorical_cols, target_col=None, index_col=None):
    """
    Preprocess raw input data for inference with a model trained on encoded and scaled features.
    Applies the same transformations as during training:
    - Encodes categorical variables using fitted LabelEncoders (unseen labels -> -1).
    - Replaces negative values in numeric columns with NaN.
    - Applies the fitted numeric pipeline (imputation + scaling + missing indicators).
    - Combines numerical and categorical features.
    - Converts the final arrays to PyTorch tensors.

    The numeric columns are taken from the fitted imputer (``feature_names_in_``)
    whenever it exposes them, so the pipeline always receives exactly the columns it
    was fitted on, in the fitted order. Extra columns present in the raw data (for
    example a column dropped before training, or the index column when ``index_col``
    is not passed) are ignored instead of being fed to the pipeline. If the imputer
    has no recorded feature names, every column that is not categorical or the target
    is treated as numeric, as before.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Input data before preprocessing. It is not modified; a copy is used.
    encoders : dict
        Dictionary of fitted LabelEncoders for categorical columns, keyed by column name.
    num_pipeline : sklearn.pipeline.Pipeline
        Fitted pipeline for numeric data; must contain a step named "imputer".
    categorical_cols : list of str
        Names of categorical columns.
    target_col : str or None
        Optional; target column name, if available.
    index_col : str or None
        Optional; column to drop if present (e.g., time/month index).

    Returns
    -------
    X_cont : torch.Tensor
        Continuous (numeric) features, float32.
    X_cat : torch.Tensor
        Categorical features, int64.
    y_true : torch.Tensor or None
        Target tensor (float32) if the target column is present, otherwise None.

    Raises
    ------
    ValueError
        If a column the pipeline was fitted on is missing from ``raw_df``.
    """

    df = raw_df.copy()

    # Drop index column if specified
    if index_col and index_col in df.columns:
        df = df.drop(columns=[index_col])

    # Encode categorical variables with a label -> code lookup per column.
    # LabelEncoder codes are the positions inside classes_; the .astype(str) cast
    # mirrors how the encoders were fitted on the training split.
    for col, le in encoders.items():
        code_of_label = {str(label): code for code, label in enumerate(le.classes_)}
        df[col] = df[col].astype(str).map(code_of_label).fillna(-1).astype("int64")

    # Identify numeric columns: prefer the exact columns the imputer was fitted on.
    imputer = num_pipeline.named_steps["imputer"]
    fitted_cols = getattr(imputer, "feature_names_in_", None)
    if fitted_cols is not None:
        num_cols = [str(c) for c in fitted_cols]
        absent = [c for c in num_cols if c not in df.columns]
        if absent:
            raise ValueError(f"Input data is missing columns required by the numeric pipeline: {absent}")
    else:
        excluded = set(categorical_cols)
        if target_col:
            excluded.add(target_col)
        num_cols = [c for c in df.columns if c not in excluded]

    # Replace negative values (used as missing-value markers) with NaN
    numeric = df[num_cols]
    numeric = numeric.mask(numeric < 0, np.nan)

    # Apply the fitted numeric pipeline
    num_transformed = num_pipeline.transform(numeric)
    all_num_cols = list(imputer.get_feature_names_out(num_cols))
    df_num = pd.DataFrame(num_transformed, columns=all_num_cols, index=df.index)

    # Combine numerical and categorical features
    processed_df = pd.concat([df_num, df[categorical_cols]], axis=1)

    # Convert to tensors
    X_cont = torch.tensor(processed_df[all_num_cols].values, dtype=torch.float32)
    X_cat = torch.tensor(processed_df[categorical_cols].values, dtype=torch.long)

    # Handle target column if present
    y_true = None
    if target_col and target_col in df.columns:
        y_true = torch.tensor(df[target_col].values, dtype=torch.float32)

    return X_cont, X_cat, y_true


'import joblib, pandas as pd\n\nencoders = joblib.load(paths.PROCESSED_DATA_DIR / "label_encoders.joblib")\nnum_pipeline = joblib.load(paths.PROCESSED_DATA_DIR / "num_pipeline.joblib")\n\n# Preprocess new data\nnew_df = raw_data.copy()\n\n# Encode categoricals\nfor col, le in encoders.items():\n    new_df[col] = new_df[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)\n\n# Handle numerical part (convert negatives to NaN)\nnum_cols = [c for c in new_df.columns if c not in categorical_cols + [target_col, index_col]]\nnew_df[num_cols] = new_df[num_cols].mask(new_df[num_cols] < 0, np.nan)\n\n# Apply fitted pipeline\nnew_df_num = num_pipeline.transform(new_df[num_cols])'