# Feature Engineering for Fraud Detection

## Objective
This notebook creates behavioral, temporal, and geolocation features
to enhance fraud detection performance.

## Feature Engineering Strategy

- Time-based features capture suspicious transaction timing.
- Velocity features detect automated or scripted behavior.
- Country features capture geo-risk patterns.
- No features that leak the target label are introduced.


🌍 IP → Country Merge


In [None]:

# Fraud Detection Feature Engineering Pipeline
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Put the parent directory on sys.path (once) so the `src` package can be
# imported from this notebook.
# NOTE(review): assumes the notebook is executed from a directory one level
# below the repository root - confirm.
PROJECT_ROOT = Path("..").resolve()
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.data_loader import load_fraud_data, load_ip_country_data
from src.preprocessing import clean_fraud_data


# Raw inputs: the fraud transactions and the IP-range -> country lookup.
df = load_fraud_data("../data/raw/Fraud_Data.csv")
ip_df = load_ip_country_data("../data/raw/IpAddress_to_Country.csv")

# Replace the raw frame with its cleaned version.
df = clean_fraud_data(df)

# Preview of the cleaned frame (rendered as the cell output).
df.head()


In [None]:
from src.geo_utils import convert_ip_to_int, merge_ip_country

# Convert the IP column first, then join the result against the IP/country
# lookup table; the merged frame is what the later feature cells work on.
fraud_df = merge_ip_country(convert_ip_to_int(df), ip_df)
print(fraud_df.head())

# Spot-check the address, its integer form and the matched country.
fraud_df[["ip_address", "ip_int", "country"]].head()

Mapping IP addresses to countries enables the detection
of geographically anomalous transactions.


⚙️ Time & Velocity Features

In [None]:
# 🕒 Time-Based Features
from src.feature_engineering import add_time_features, add_transaction_velocity

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Add the time-based and transaction-velocity columns to the merged frame.
fraud_df = add_time_features(fraud_df)
fraud_df = add_transaction_velocity(fraud_df)

# print() is required here: a bare expression in the middle of a cell is
# evaluated and thrown away, so the preview would never be shown.
print(fraud_df[["hour_of_day", "day_of_week", "time_since_signup"]].head())

# 2x2 grid: one panel per engineered time feature.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle("Distribution of Engineered Time-Based Features", fontsize=18)

# Panel 1: time since signup, histogram with KDE on a log-scaled x axis.
# NOTE(review): a log axis cannot represent non-positive values - confirm
# that time_since_signup is strictly positive for every row.
sns.histplot(
    fraud_df["time_since_signup"],
    bins=50,
    log_scale=True,
    ax=axes[0, 0],
    kde=True,
)
axes[0, 0].set_title("Time Since Signup (seconds, log scale)")
axes[0, 0].set_xlabel("Seconds")
axes[0, 0].set_ylabel("Count")

# Panels 2-4: plain count plots that differ only in column, title and label.
count_panels = [
    (axes[0, 1], "hour_of_day", "Transaction Hour of Day", "Hour (0–23)"),
    (axes[1, 0], "day_of_week", "Transaction Day of Week", "Day (0=Mon, 6=Sun)"),
    (
        axes[1, 1],
        "is_weekend",
        "Weekend vs Weekday Transactions",
        "Is Weekend (1 = Yes, 0 = No)",
    ),
]
for ax, column, title, xlabel in count_panels:
    sns.countplot(x=column, data=fraud_df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


Time-based features capture behavioral patterns,
such as fraud occurring late at night or soon after signup.

In [None]:
# -------------------------------------------------------
# Transaction velocity feature handling (safe + clean)
# -------------------------------------------------------

REQUIRED_VELOCITY_COLS = {"transactions_last_1H", "transactions_last_24H"}

# Add velocity features only if missing, so re-running this cell does not
# call add_transaction_velocity a second time on a frame that already has them.
if not REQUIRED_VELOCITY_COLS.issubset(fraud_df.columns):
    fraud_df = add_transaction_velocity(fraud_df)

# Remove any accidental "_x" / "_y" suffixed duplicates from prior merges.
dup_cols = [
    c for c in fraud_df.columns
    if ("transactions_last" in c) and c.endswith(("_x", "_y"))
]
if dup_cols:
    fraud_df = fraud_df.drop(columns=dup_cols)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One log-scaled histogram per velocity window; the two panels differ only in
# the column plotted and the title.
velocity_panels = [
    (
        axes[0],
        "transactions_last_1H",
        "Log-Scaled Distribution: Transactions in Last 1 Hour",
    ),
    (
        axes[1],
        "transactions_last_24H",
        "Log-Scaled Distribution: Transactions in Last 24 Hours",
    ),
]
for ax, column, title in velocity_panels:
    sns.histplot(fraud_df[column], bins=40, log_scale=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Transaction Count (log scale)")
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Sanity check. sorted() fixes the column order: iterating a set of strings
# directly can yield a different order from one interpreter run to the next.
fraud_df[sorted(REQUIRED_VELOCITY_COLS)].head()







Fraud often occurs in bursts.
Velocity features quantify rapid transaction activity,
which is uncommon for legitimate users.

Numerical features will be scaled using StandardScaler
to support distance-based and gradient-based models.
Scaling is not applied to PCA features in the credit card dataset.

Both datasets exhibit severe class imbalance.
Resampling techniques such as SMOTE will be applied
only to training data during modeling to avoid information leakage.


💾 Save Processed Data

In [None]:
# -------------------------------------------------------
# Compare transaction velocity by fraud class (KEY INSIGHT)
# -------------------------------------------------------

# Single-axes figure; the axes object is configured directly instead of going
# through the pyplot "current axes" helpers.
velocity_fig, velocity_ax = plt.subplots(figsize=(8, 5))

sns.boxplot(data=fraud_df, x="class", y="transactions_last_24H", ax=velocity_ax)

# Log y axis, matching the label text below.
velocity_ax.set_yscale("log")
velocity_ax.set_title("Transaction Velocity (Last 24 Hours) by Fraud Class")
velocity_ax.set_xlabel("Fraud Label (0 = Legitimate, 1 = Fraud)")
velocity_ax.set_ylabel("Transactions in Last 24 Hours (log scale)")

plt.show()


In [None]:

# 💾 Save Processed Data
# Write the engineered feature table to disk. The target directory is created
# first because to_csv does not create missing parent directories.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
fraud_df.to_csv(processed_dir / "fraud_data_features.csv", index=False)

## Model-Ready Preprocessing Pipeline


Feature / Target Separation

In [None]:

# Parse both timestamp columns as datetimes.
for ts_col in ("signup_time", "purchase_time"):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Time since signup (hours): gap between signup and purchase.
# NOTE(review): this cell works on `df`, not `fraud_df`, and expresses the gap
# in hours, while the earlier plot labels fraud_df["time_since_signup"] as
# seconds - confirm which frame and unit the model is meant to use.
signup_to_purchase = df["purchase_time"] - df["signup_time"]
df["time_since_signup"] = signup_to_purchase.dt.total_seconds() / 3600

# TODO: optional velocity feature "transactions_last_24h" is not added to `df`
# here, and no column-presence checks are run at this point.


In [None]:
print("DataFrame used for modeling:")
print(df.columns)

# The label lives in the "class" column; every other column is a candidate
# feature.
target_name = "class"
y = df[target_name]
X = df.drop(columns=[target_name])

print(X.columns)
print(y.name)


In [None]:
from sklearn.model_selection import train_test_split

# 20% hold-out, stratified on the label, with a fixed seed for repeatability.
split_kwargs = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


Feature Type Identification

In [None]:
# Column names handed to the numeric branch of the preprocessing pipeline.
num_features = ["purchase_value", "time_since_signup", "age"]

# Column names handed to the categorical branch of the preprocessing pipeline.
cat_features = ["browser", "source", "sex"]


In [None]:
# Fail fast when a column the pipeline expects is absent from the training
# frame, listing every missing name in the order the pipeline declares them.
required_features = num_features + cat_features
missing = [c for c in required_features if c not in X_train.columns]
if missing:
    # Explicit raise rather than `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable this check.
    raise ValueError(f"Missing features: {missing}")


Scaling & Encoding Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


# Numeric branch: a single standard-scaling step.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encoding that returns a dense array and is
# configured with handle_unknown="ignore".
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = Pipeline(steps=[("encoder", onehot)])

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

print(df.columns)

# Ensure datetime types
# NOTE(review): this repeats the datetime / time_since_signup derivation on
# `df`; X and the train/test frames were created from `df` earlier, so this
# appears to affect `df` only - confirm it is still needed.
for ts_col in ("signup_time", "purchase_time"):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Time since signup (in hours)
signup_to_purchase = df["purchase_time"] - df["signup_time"]
df["time_since_signup"] = signup_to_purchase.dt.total_seconds() / 3600

# Summary statistics of the derived column (rendered as the cell output).
df["time_since_signup"].describe()



In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Report the shape of each processed matrix, train first.
for processed in (X_train_processed, X_test_processed):
    print(processed.shape)



Train/Test Split (BEFORE SMOTE)

In [None]:
from sklearn.model_selection import train_test_split

# Same call as the earlier split cell: identical X, y, test fraction,
# stratification and seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Apply Scaling & Encoding

In [None]:
# Diagnostics: what the pipeline expects versus what the splits contain.
print("Numeric features expected by pipeline:", num_features)
print("Columns actually in X_train:", list(X_train.columns))
print("Columns actually in X_test:", list(X_test.columns))

# Fit on the training split, reuse the fitted preprocessor for the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Column indexes of the raw splits, then shapes of the processed matrices.
for frame in (X_train, X_test):
    print(frame.columns)

for matrix in (X_train_processed, X_test_processed):
    print(matrix.shape)


Class Imbalance Handling (SMOTE)

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the processed training split only; the test split is left as is.
# NOTE(review): the processed matrix contains one-hot columns, which SMOTE
# treats like any other numeric column - confirm this is acceptable.
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_processed,
    y_train
)


# Class proportions before and after resampling, plus the resampled shapes.
print("Before SMOTE:")
print(y_train.value_counts(normalize=True))

print("\nAfter SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))
print(X_train_resampled.shape)
print(y_train_resampled.shape)

# Save Processed and Resampled Data
# The output directory is created first so the saves do not fail when it does
# not exist yet.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
np.save(processed_dir / "X_train.npy", X_train_resampled)
np.save(processed_dir / "y_train.npy", y_train_resampled)
np.save(processed_dir / "X_test.npy", X_test_processed)
np.save(processed_dir / "y_test.npy", y_test)
# EDA of Fraud Data
