In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

# Load the resampled dataset; every later cell reads it through `df`.
df = pd.read_csv(filepath_or_buffer="df_resampled.csv")

In [5]:
# Feature engineering: missing-value flags, median imputation, log transforms
# and pairwise interaction features, written to engineered_dataset.csv.
#
# The loaded `df` is only read in this cell, never reassigned or mutated, so
# the cell produces the same result however many times it is re-run. When
# `df` was imputed in place, a second run found no NaNs and silently dropped
# every `*_missing` column. The imputed frame is exposed as `df_imputed`.

# Missing Value Indicators (batch creation)
# ---------------------------
# One 0/1 column per input column holding at least one NaN. These must be
# derived from the un-imputed frame.
cols_with_na = df.columns[df.isna().any().to_numpy()]
missing_df = df[cols_with_na].isna().astype(int).add_suffix("_missing")


# Simple Imputation (median for numeric only)
# Non-numeric columns keep their NaNs. `fillna` returns a new frame here.
# NOTE(review): the medians are computed over the whole frame; if a
# train/test split is applied to this data later, test rows influence the
# imputed values -- confirm this is acceptable.
df_imputed = df.fillna(df.median(numeric_only=True))


# Log Transform Skewed Features (batch creation)
# Candidates are numeric columns with more than 10 distinct values (treated
# as continuous) whose values are all strictly positive. Skewness itself is
# not measured; the name `skewed_cols` is kept for compatibility.
# Selecting numeric dtypes explicitly (rather than "anything that is not
# object") keeps string/datetime columns stored under a non-object dtype
# away from the `> 0` comparison, which is not defined for them.
numeric_cols = df_imputed.select_dtypes(include="number").columns

skewed_cols = [
    col for col in numeric_cols
    if df_imputed[col].nunique() > 10        # continuous
    and (df_imputed[col] > 0).all()          # strictly positive
]

# The explicit index keeps the frame aligned with the data even when no
# column qualifies and the dict is empty.
log_df = pd.DataFrame(
    {f"{col}_log": np.log1p(df_imputed[col]) for col in skewed_cols},
    index=df_imputed.index,
)


# Interaction Features (batch creation)
# Pairs whose columns are not both present in the data are skipped.
interaction_pairs = [
    ("velocity_6h", "velocity_24h"),
    ("device_fraud_count", "device_distinct_emails_8w"),
    ("prev_address_months_count", "current_address_months_count"),
]

interaction_dict = {}

for f1, f2 in interaction_pairs:
    if f1 in df_imputed.columns and f2 in df_imputed.columns:
        left, right = df_imputed[f1], df_imputed[f2]

        interaction_dict[f"{f1}_x_{f2}"] = left * right
        interaction_dict[f"{f1}_plus_{f2}"] = left + right

        # The +1 shift protects against f2 == 0, but f2 == -1 still yields a
        # zero denominator and therefore +/-inf (or NaN for 0/0). Mask those
        # rows to NaN so no infinities are written to the dataset.
        denominator = right + 1
        denominator = denominator.where(denominator != 0)
        interaction_dict[f"{f1}_ratio_{f2}"] = left / denominator

interaction_df = pd.DataFrame(interaction_dict, index=df_imputed.index)


# Combine all engineered features at once
# A single concat avoids fragmenting the frame with column-by-column inserts.
df_engineered = pd.concat(
    [df_imputed, missing_df, log_df, interaction_df], axis=1
)

# Every part shares the input index, so the row count must be unchanged.
assert len(df_engineered) == len(df), "row count changed during feature engineering"


# Save final dataset
df_engineered.to_csv("engineered_dataset.csv", index=False)

print("Feature engineering complete. File saved as engineered_dataset.csv")


Feature engineering complete. File saved as engineered_dataset.csv
