In [8]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [9]:
file_name = "../../data/raw/NeurIPS/Base.csv"

# --- Load Data ---
# Read the raw CSV into `df`. On failure, report the path that was attempted
# and re-raise, rather than calling exit(), which is intended for the
# interactive shell and discards the traceback.
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"ERROR: CSV file not found at '{file_name}'. Please check the 'file_name' variable.")
    raise
except Exception as e:
    print(f"Error loading data: {e}")
    raise
else:
    print("Data loaded successfully.")

Data loaded successfully.


In [10]:
# --- 2. Initial Data Cleaning (from EDA) ---
print("\nDelete column with all values the same")
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['device_fraud_count'], errors='ignore')

print("\nPerforming initial data cleaning (handling -1 as NaN)...")
# Columns where -1 represents a missing value based on domain knowledge
cols_with_minus_one_as_nan = [
    'prev_address_months_count',
    'current_address_months_count',
    'bank_months_count',
    'session_length_in_minutes'
]
# Only touch sentinel columns that exist in this frame; every -1 in them
# becomes NaN so the imputers configured later can fill it.
for col in cols_with_minus_one_as_nan:
    if col in df.columns:
        df[col] = df[col].replace(-1, np.nan)


Delete column with all values the same

Performing initial data cleaning (handling -1 as NaN)...


In [11]:
# --- 3. Define Feature Types (based on domain knowledge) ---
print("\nDefining feature types...")
target = 'fraud_bool'


def _only_present(candidates):
    """Return the entries of `candidates` that are columns of `df`, in their given order."""
    return [name for name in candidates if name in df.columns]


# Numerical columns: these are the ones routed to imputation and scaling.
numerical_features = _only_present([
    'income',
    'name_email_similarity',
    'prev_address_months_count',
    'current_address_months_count',
    'days_since_request',
    'intended_balcon_amount',
    'zip_count_4w',
    'velocity_6h',
    'velocity_24h',
    'velocity_4w',
    'bank_branch_count_8w',
    'date_of_birth_distinct_emails_4w',
    'credit_risk_score',
    'bank_months_count',
    'proposed_credit_limit',
    'session_length_in_minutes',
    'device_distinct_emails_8w',
])

# Categorical columns: routed to imputation and one-hot encoding.
# customer_age and month are listed here on purpose; they are treated as
# discrete categories rather than as continuous values.
categorical_features = _only_present([
    'customer_age', 'payment_type', 'employment_status', 'housing_status',
    'source', 'device_os', 'month',
])

# Flag columns expected to already hold 0/1. They are kept out of both lists
# above so they are neither scaled nor one-hot encoded.
binary_features = _only_present([
    'email_is_free',
    'phone_home_valid',
    'phone_mobile_valid',
    'has_other_cards',
    'foreign_request',
    'keep_alive_session',
])

print(f"Target: {target}")
print(f"Numerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print(f"Binary features ({len(binary_features)}): {binary_features}")


Defining feature types...
Target: fraud_bool
Numerical features (17): ['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w']
Categorical features (7): ['customer_age', 'payment_type', 'employment_status', 'housing_status', 'source', 'device_os', 'month']
Binary features (6): ['email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'keep_alive_session']


In [12]:
# --- 4. Separate Features (X) and Target (y) ---
# Raise instead of calling exit() when the label column is absent, so the
# failure surfaces as a normal traceback at this cell.
if target not in df.columns:
    raise KeyError(f"ERROR: Target column '{target}' not found in the DataFrame.")

# X holds every column except the label; y is the label column alone.
X = df.drop(columns=[target])
y = df[target]
print(f"\nShape of X: {X.shape}, Shape of y: {y.shape}")


Shape of X: (1000000, 30), Shape of y: (1000000,)


In [13]:
# --- 5. Preprocessing Pipelines ---
print("\nSetting up preprocessing pipelines...")

# Numerical branch: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer_median', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time, and the
# encoder is asked for a dense array (sparse_output=False).
categorical_pipeline = Pipeline(steps=[
    ('imputer_mode', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Only names that actually exist in X are handed to a branch.
X_numerical = [col for col in numerical_features if col in X.columns]
X_categorical = [col for col in categorical_features if col in X.columns]

# Binary columns not already claimed by a branch. This list is informational:
# the transformer below does not reference it and relies on `remainder` instead.
_routed_columns = set(X_numerical) | set(X_categorical)
X_binary_passthrough = [
    col for col in binary_features
    if col in X.columns and col not in _routed_columns
]

# Every column of X that appears in neither list is forwarded unchanged
# (remainder='passthrough'); use 'drop' instead to keep only processed columns.
column_routes = [
    ('num', numerical_pipeline, X_numerical),
    ('cat', categorical_pipeline, X_categorical),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')
print("Preprocessor configured.")


Setting up preprocessing pipelines...
Preprocessor configured.


In [14]:
# --- 6. Split Data into Training and Testing sets ---
print("\nSplitting data into training and testing sets (80/20 split)...")

# Stratify on y so both partitions keep the class proportions of the full
# data; the fixed random_state makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Class proportions before and after the split, for a visual sanity check.
share_overall = y.value_counts(normalize=True)
share_train = y_train.value_counts(normalize=True)
share_test = y_test.value_counts(normalize=True)
print(f"Fraud distribution in original y: \n{share_overall}")
print(f"Fraud distribution in y_train: \n{share_train}")
print(f"Fraud distribution in y_test: \n{share_test}")



Splitting data into training and testing sets (80/20 split)...
X_train shape: (800000, 30), y_train shape: (800000,)
X_test shape: (200000, 30), y_test shape: (200000,)
Fraud distribution in original y: 
fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64
Fraud distribution in y_train: 
fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64
Fraud distribution in y_test: 
fraud_bool
0    0.98897
1    0.01103
Name: proportion, dtype: float64


In [15]:
# --- 7. Apply Preprocessing ---
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits.
print("\nApplying preprocessing (fitting on train, transforming train and test)...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild the output column names in the order this code expects the
# ColumnTransformer to emit them: numerical block, one-hot block, remainder.
ohe_feature_names = (
    preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(X_categorical)
)
# Remainder = every input column that was not routed to 'num' or 'cat',
# taken in the column order of X_train.
remainder_cols = [col for col in X_train.columns if col not in X_numerical and col not in X_categorical]
processed_feature_names = X_numerical + list(ohe_feature_names) + remainder_cols

# Later cells use processed_feature_names and the DataFrames below
# unconditionally, so a mismatch must stop the run here rather than be
# swallowed and resurface later as a NameError.
if len(processed_feature_names) != X_train_processed.shape[1]:
    raise ValueError(
        f"Could not reconstruct feature names after OHE: built {len(processed_feature_names)} "
        f"names for {X_train_processed.shape[1]} output columns."
    )

# Labelled copies of the processed arrays, indexed like the original splits.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=processed_feature_names, index=X_train.index)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=processed_feature_names, index=X_test.index)
print("Processed training and testing data (X) are now NumPy arrays.")
print(f"Shape of X_train_processed: {X_train_processed.shape}")
print(f"Shape of X_test_processed: {X_test_processed.shape}")



Applying preprocessing (fitting on train, transforming train and test)...
Processed training and testing data (X) are now NumPy arrays.
Shape of X_train_processed: (800000, 66)
Shape of X_test_processed: (200000, 66)


In [26]:
# Inspect the reconstructed output column names (numerical, one-hot, then remainder columns).
print(processed_feature_names)

['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w', 'customer_age_10', 'customer_age_20', 'customer_age_30', 'customer_age_40', 'customer_age_50', 'customer_age_60', 'customer_age_70', 'customer_age_80', 'customer_age_90', 'payment_type_AA', 'payment_type_AB', 'payment_type_AC', 'payment_type_AD', 'payment_type_AE', 'employment_status_CA', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE', 'employment_status_CF', 'employment_status_CG', 'housing_status_BA', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF', 'housing_status_BG', 'source_INTERNET', 'source_

In [16]:
# --- 8. Resampling the Training Data using SMOTE ---
# SMOTE should only be applied to the training data
print("\nApplying SMOTE to the training data to handle class imbalance...")
# SMOTE's k_neighbors default is 5. It needs at least k_neighbors + 1 samples
# in the minority class, so shrink k when that class is small.
class_counts = y_train.value_counts()
# Plain int so the arithmetic below does not carry a NumPy scalar into SMOTE;
# an empty y_train counts as zero minority samples.
minority_class_count = int(class_counts.min()) if len(class_counts) else 0
k_neighbors_smote = min(5, minority_class_count - 1)

# With 0 or 1 minority samples there is no neighbour to interpolate with.
# (k used to be forced to 1 in that case, which made this guard unreachable.)
if k_neighbors_smote < 1:
    print(f"WARNING: Minority class in training set has only {minority_class_count} sample(s).")
    print("SMOTE cannot be applied effectively or may fail. Consider alternative strategies or more data.")
    print("Proceeding without SMOTE for now.")
    X_train_resampled = X_train_processed
    y_train_resampled = y_train
else:
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors_smote)
    try:
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
        print("SMOTE applied successfully.")
        print(f"Shape of X_train_resampled: {X_train_resampled.shape}")
        print(f"Shape of y_train_resampled: {y_train_resampled.shape}")
        print(f"Fraud distribution in y_train_resampled: \n{pd.Series(y_train_resampled).value_counts(normalize=True)}")
    except ValueError as e:
        # Best-effort fallback: keep the imbalanced data so later cells still
        # have X_train_resampled / y_train_resampled defined.
        print(f"Error during SMOTE: {e}")
        print("This might happen if the minority class count is too low even after adjusting k_neighbors.")
        print("Proceeding with original (imbalanced) training data for now.")
        X_train_resampled = X_train_processed
        y_train_resampled = y_train


Applying SMOTE to the training data to handle class imbalance...
SMOTE applied successfully.
Shape of X_train_resampled: (1582354, 66)
Shape of y_train_resampled: (1582354,)
Fraud distribution in y_train_resampled: 
fraud_bool
0    0.5
1    0.5
Name: proportion, dtype: float64


In [25]:
base_saving_place = "../../data/processed/NeurIPS"
# Create the output folder up front; to_csv does not create missing directories.
Path(base_saving_place).mkdir(parents=True, exist_ok=True)

# NOTE: only the non-resampled splits are persisted here. The SMOTE output
# (X_train_resampled / y_train_resampled) is not written by this cell.
# Every file is written with index=False, so rows of the X_* and y_* files
# correspond by position rather than by index label.
frames_to_save = {
    "X_train_without_resampling.csv": X_train_processed_df,
    "X_test_without_resampling.csv": X_test_processed_df,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
    "X_train.csv": X_train,
    "X_test.csv": X_test,
}
for csv_name, frame in frames_to_save.items():
    frame.to_csv(f"{base_saving_place}/{csv_name}", index=False)
