# Feature Engineering – Fraud_Data.csv

## Task 1.4: Feature Engineering and Data Transformation

**Objective:**  
Create meaningful behavioral, temporal, and geographic features to improve fraud detection performance.


# Load and Prepare Data

In [53]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [54]:
# Load data set
raw_file = "../data/processed/Fraud_Data_eda.csv"
ip_file = "../data/raw/IpAddress_to_Country.csv"
df = pd.read_csv(raw_file)
ip_df = pd.read_csv(ip_file)

In [55]:
# Convert timestamps

df["signup_time"] = pd.to_datetime(df["signup_time"])
df["purchase_time"] = pd.to_datetime(df["purchase_time"])

In [56]:
df.shape

(129146, 17)

# Recreate Time-Based Features (Core Signals)

In [57]:
# Time since signup (hours)
df["time_since_signup"] = (
    df["purchase_time"] - df["signup_time"]
).dt.total_seconds() / 3600

# Hour of day
df["hour_of_day"] = df["purchase_time"].dt.hour

# Day of week
df["day_of_week"] = df["purchase_time"].dt.dayofweek

In [58]:
df.shape

(129146, 18)

Time-Based Features

- **time_since_signup** captures trust maturity.
- **hour_of_day** captures abnormal transaction timing.
- **day_of_week** captures weekly behavioral patterns.


# Transaction Velocity & Frequency (HIGH-VALUE FEATURES)
  This is behavioral fraud detection.

In [59]:
# sort first
df = df.sort_values(["user_id", "purchase_time"])
# Create transaction count per user
df["transaction_count_user"] = df.groupby("user_id").cumcount() + 1

#Transactions in last 24 hours:
df["transactions_last_24h"] = (
    df.groupby("user_id")["purchase_time"]
    .transform(lambda x: x.diff().dt.total_seconds().le(86400).cumsum())
)

In [60]:
df.shape

(129146, 20)

Transaction Velocity Features

- **transaction_count_user** captures repeat behavior.
- **transactions_last_24h** captures burst activity, a common fraud pattern.


# IP to Country

In [61]:
import ipaddress

# Robust ip_int extraction: use numeric ip_address if present, else convert dotted strings
if df['ip_address'].dtype.kind in 'iuf':  # int/uint/float => already numeric
    df['ip_int'] = df['ip_address'].astype(float)
    print("ip_address is numeric; used directly as ip_int.")
else:
    def ip_to_int(ip):
        try:
            return int(ipaddress.ip_address(str(ip)))
        except Exception:
            return np.nan
    df['ip_int'] = df['ip_address'].apply(ip_to_int)
    print("after ip_to_int: ", df.shape, "ip_int nulls:", df['ip_int'].isna().sum())

# Do NOT drop rows here — we'll mark unmatched IPs as 'unknown' after merging to avoid losing data.

# Convert to float to match the potential float64 dtype of df["ip_int"]
# due to NaN values generated by ip_to_int for invalid IPs, which forces
# the column to float even after dropping NaNs if the original array had floats.
ip_df["lower_bound_ip_address"] = ip_df["lower_bound_ip_address"].astype(float)
ip_df["upper_bound_ip_address"] = ip_df["upper_bound_ip_address"].astype(float)

# Identify and drop columns that might have been added by previous merges from ip_df
# This ensures a clean merge operation on re-execution.
cols_to_drop = [col for col in df.columns if 'bound_ip_address' in col or 'country' in col and col != 'country']
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

ip_df = ip_df.sort_values("lower_bound_ip_address")
df = df.sort_values("ip_int")

# Merge and then debug the result to make sure matches occurred
df = pd.merge_asof(
    df,
    ip_df,
    left_on="ip_int",
    right_on="lower_bound_ip_address",
    direction="backward",
    suffixes=('', '_geo') # Specify suffixes to avoid conflicts. '_geo' for ip_df columns.
)

print("after merge_asof:", df.shape)
if 'upper_bound_ip_address' in df.columns:
    print("upper_bound_ip_address nulls:", df['upper_bound_ip_address'].isna().sum())
    mask = df['ip_int'] <= df['upper_bound_ip_address']
    print("mask true:", mask.sum(), "mask false:", (~mask).sum())
    if mask.sum() < len(df):
        print("Sample rows failing the ip_int <= upper_bound check:")
        display(df.loc[~mask, ['ip_address','ip_int','lower_bound_ip_address','upper_bound_ip_address']].head(10))

    # Check numeric coverage between data and IP ranges
    try:
        print("ip_int range:", df['ip_int'].min(), df['ip_int'].max())
        print("ip_df range:", ip_df['lower_bound_ip_address'].min(), ip_df['upper_bound_ip_address'].max())
    except Exception as e:
        print("Error checking numeric ranges:", e)

    # Non-destructive fallback: mark unmatched rows as 'unknown' and keep them
    unmatched = df['upper_bound_ip_address'].isna() | (~mask)
    print("unmatched (upper_nan or > upper):", unmatched.sum())
    if unmatched.any():
        df.loc[unmatched, 'country'] = 'unknown'
        # Optional: clear bound columns for unmatched rows
        df.loc[unmatched, ['lower_bound_ip_address','upper_bound_ip_address']] = np.nan
        print(f"After marking unmatched country='unknown': df.shape {df.shape} (unmatched kept)")
else:
    print("WARNING: 'upper_bound_ip_address' not found in merged DataFrame. Columns:", df.columns.tolist())

# Note: we intentionally do NOT drop rows here to avoid losing data. If you prefer to remove unmatched rows, uncomment the following:
# if 'upper_bound_ip_address' in df.columns:
#     before_filter = len(df)
#     df = df[df['upper_bound_ip_address'].notna() & (df['ip_int'] <= df['upper_bound_ip_address'])]
#     print(f"after filtering ip range: {df.shape} (dropped {before_filter - len(df)} rows)")

ip_address is numeric; used directly as ip_int.
after merge_asof: (129146, 21)
upper_bound_ip_address nulls: 0
mask true: 129146 mask false: 0
ip_int range: 16778864.0719029 3758052531.47708
ip_df range: 16777216.0 3758096383.0
unmatched (upper_nan or > upper): 0


In [62]:
df.shape

(129146, 21)


Country information was derived from IP address ranges and added as a categorical risk feature.


In [63]:
# Drop Irrelevant or Leaky Columns

df = df.drop(columns=[
    "signup_time",
    "purchase_time",
    "ip_address",
    "lower_bound_ip_address",
    "upper_bound_ip_address"
])

In [64]:
df.shape

(129146, 16)


Raw timestamp and IP columns were removed after extracting meaningful features.


In [65]:
# Separate Target Variable and drop identifier columns
id_cols = ['user_id', 'device_id']
print("Dropping identifier columns (if present):", [c for c in id_cols if c in df.columns])

X = df.drop(columns=['class'] + id_cols, errors='ignore')
y = df['class'] if 'class' in df.columns else pd.Series(dtype=int)

# Encode Categorical Features safely
categorical_candidates = ['source', 'browser', 'sex', 'country']
categorical_features = [c for c in categorical_candidates if c in X.columns]

# Select numeric features only
numerical_features = X.select_dtypes(include=['number']).columns.tolist()

print("Categorical features:", categorical_features)
print("Numerical features (count):", len(numerical_features))
print("Non-numeric columns excluded from numeric features:", X.select_dtypes(exclude=['number']).columns.tolist())

# Build transformers only for existing features
transformers = []
if numerical_features:
    transformers.append(("num", StandardScaler(), numerical_features))
if categorical_features:
    transformers.append(("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features))

preprocessor = ColumnTransformer(transformers=transformers)



Dropping identifier columns (if present): ['user_id', 'device_id']
Categorical features: ['source', 'browser', 'sex', 'country']
Numerical features (count): 8
Non-numeric columns excluded from numeric features: ['source', 'browser', 'sex', 'country', 'country_geo']


In [66]:
# Debug: check sizes and target distribution after feature extraction
print("df.shape:", df.shape)
if 'class' in df.columns:
    print("class distribution:\n", df['class'].value_counts(dropna=False))
else:
    print("'class' column missing in DataFrame")

# Prepare X and y for the rest of the pipeline and print shapes for debugging
X = df.drop(columns='class') if 'class' in df.columns else df.copy()
y = df['class'] if 'class' in df.columns else pd.Series(dtype=int)
print("X.shape:", X.shape)
print("y.shape:", y.shape)
print("First few columns of X:", X.columns.tolist()[:20])

df.shape: (129146, 16)
class distribution:
 class
0    116878
1     12268
Name: count, dtype: int64
X.shape: (129146, 15)
y.shape: (129146,)
First few columns of X: ['user_id', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'time_since_signup', 'hour_of_day', 'ip_int', 'country', 'day_of_week', 'transaction_count_user', 'transactions_last_24h', 'country_geo']


Encoding and Scaling

- Numerical features were standardized.
- Categorical features were one-hot encoded.
- This ensures compatibility with linear and tree-based models.


# Save Processed Data

In [68]:
if not X.empty:
    X_processed = preprocessor.fit_transform(X)

    # If X_processed is sparse (e.g., CSR matrix from OneHotEncoder with sparse=True), convert to dense
    try:
        from scipy import sparse
        if sparse.issparse(X_processed):
            print("X_processed is sparse; converting to dense array before saving.")
            X_processed = X_processed.toarray()
    except Exception as e:
        print("scipy not available or conversion error:", e)

    print("X_processed shape:", getattr(X_processed, 'shape', None))

    X_df = pd.DataFrame(X_processed)
    X_df.to_csv(
        "../data/processed/fraud_features.csv",
        index=False
    )

    y.to_csv("../data/processed/fraud_target.csv", index=False)
else:
    print("DataFrame X is empty. Skipping feature processing and saving.")
    print("Please review previous steps to ensure data is not entirely dropped.")

X_processed is sparse; converting to dense array before saving.
X_processed shape: (129146, 199)


#Feature Engineering Summary
  1. Temporal features captured abnormal transaction timing.
  2. Velocity features captured burst and repeat behavior.
  3. Geographic features added contextual risk.
  4. Data was encoded and scaled for modeling.
  5. Final datasets were saved for reuse in modeling