# Geolocation + Feature Engineering
## Step 5: IP to Country merge
### 5.1 Load IP mapping data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [28]:
# Purchase records that the rest of the notebook enriches with features.
fraud_df = pd.read_csv("../data/raw/Fraud_Data.csv")

# Lookup table of IP ranges (lower/upper bound) and the country of each range.
ip_df = pd.read_csv("../data/raw/IpAddress_to_Country.csv")

### 5.2 Convert IPs to integers

In [3]:
# Cast every IP column to 64-bit integers so the range comparisons in the
# lookup step operate on a single, consistent dtype.
for frame, ip_column in (
    (fraud_df, "ip_address"),
    (ip_df, "lower_bound_ip_address"),
    (ip_df, "upper_bound_ip_address"),
):
    frame[ip_column] = frame[ip_column].astype(np.int64)

### 5.3 Range-based IP lookup

In [4]:
def ip_to_country(ip):
    """Return the country for a single IP, or "Unknown" if no range matches.

    Scans every row of ``ip_df`` and returns the country of the first range
    (in table order) whose inclusive bounds contain ``ip``. Each call is a
    full scan of the table, so use ``_ips_to_countries`` for whole columns.
    """
    match = ip_df[
        (ip_df["lower_bound_ip_address"] <= ip) &
        (ip_df["upper_bound_ip_address"] >= ip)
    ]
    if len(match) == 0:
        return "Unknown"
    return match.iloc[0]["country"]


def _ips_to_countries(ips):
    """Vectorised equivalent of ``ips.apply(ip_to_country)``.

    Sorts the ranges once by lower bound and binary-searches every IP, which
    replaces one full table scan per row with a single O(n log m) pass.

    Parameters
    ----------
    ips : pandas.Series
        IP addresses in the same numeric form as the bounds in ``ip_df``.

    Returns
    -------
    pandas.Series
        Country per IP (``"Unknown"`` where no range contains it), indexed
        like ``ips``.
    """
    ranges = ip_df.sort_values("lower_bound_ip_address", kind="stable")
    lower = ranges["lower_bound_ip_address"].to_numpy()
    upper = ranges["upper_bound_ip_address"].to_numpy()
    country = ranges["country"].to_numpy(dtype=object)

    # No ranges at all: nothing can match.
    if len(lower) == 0:
        return pd.Series("Unknown", index=ips.index, dtype=object)

    # The binary search only reproduces "first matching row" when no two
    # ranges overlap; if any do, keep the exact (slow) per-row scan.
    if (lower[1:] <= upper[:-1]).any():
        return ips.apply(ip_to_country)

    values = ips.to_numpy()
    # Index of the last range whose lower bound is <= ip; -1 (clamped to 0)
    # when the IP lies below every range, which the bounds check then rejects.
    pos = np.clip(np.searchsorted(lower, values, side="right") - 1, 0, None)
    inside = (lower[pos] <= values) & (values <= upper[pos])
    return pd.Series(
        np.where(inside, country[pos], "Unknown"),
        index=ips.index,
        dtype=object,
    )


fraud_df["country"] = _ips_to_countries(fraud_df["ip_address"])


### 5.4 Fraud rate by country

In [5]:
# Mean of the binary `class` label per country, i.e. the share of rows
# flagged as fraud, highest first.
fraud_rate_by_country = fraud_df.groupby("country")["class"].mean()
country_fraud = fraud_rate_by_country.sort_values(ascending=False)
country_fraud.head(10)

country
Turkmenistan             1.000000
Namibia                  0.434783
Sri Lanka                0.419355
Luxembourg               0.388889
Virgin Islands (U.S.)    0.333333
Ecuador                  0.264151
Tunisia                  0.262712
Peru                     0.260504
Bolivia                  0.245283
Kuwait                   0.233333
Name: class, dtype: float64

## Step 6: Feature engineering
### 6.1 Time-based features

In [9]:
# Parse both timestamps. Unparseable values become NaT instead of raising,
# and a proper datetime dtype is what makes the .dt accessor usable below.
fraud_df["signup_time"] = pd.to_datetime(
    fraud_df["signup_time"], errors="coerce"
)
fraud_df["purchase_time"] = pd.to_datetime(
    fraud_df["purchase_time"], errors="coerce"
)

# Drop rows with invalid timestamps. The explicit copy makes the result an
# independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
fraud_df = fraud_df.dropna(subset=["signup_time", "purchase_time"]).copy()

# Hour of day (0–23)
fraud_df["hour_of_day"] = fraud_df["purchase_time"].dt.hour

# Day of week (0=Monday, 6=Sunday)
fraud_df["day_of_week"] = fraud_df["purchase_time"].dt.dayofweek

# Time since signup in hours
fraud_df["time_since_signup"] = (
    fraud_df["purchase_time"] - fraud_df["signup_time"]
).dt.total_seconds() / 3600

# Clamp negative durations (purchase recorded before signup) to 0.
# The rows themselves are kept, not removed.
fraud_df["time_since_signup"] = fraud_df["time_since_signup"].clip(lower=0)


### 6.2 Transaction velocity features

In [11]:
# Ensure datetime
fraud_df["purchase_time"] = pd.to_datetime(fraud_df["purchase_time"])

# Time-based rolling windows need each user's purchases in chronological order
fraud_df = fraud_df.sort_values(["user_id", "purchase_time"])


def _transactions_in_window(frame, window):
    """Count each user's purchases in the trailing ``window`` ending at each row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``user_id`` and a datetime ``purchase_time`` column,
        sorted by ``purchase_time`` within each user.
    window : str
        Pandas offset alias for the window length, e.g. ``"1h"``.

    Returns
    -------
    pandas.Series
        Number of purchases (current one included) per row, indexed like
        ``frame``.
    """
    # Count an explicit indicator column. Selecting the `on` column from the
    # rolling result would return the timestamps themselves, not a count, and
    # a frame-wide count() would also run over non-numeric columns.
    events = frame[["user_id", "purchase_time"]].assign(_tx=1)
    counts = (
        events.groupby("user_id")
        .rolling(window, on="purchase_time")["_tx"]
        .sum()
    )
    # Drop the user_id level so the result aligns with the frame's own index.
    return counts.reset_index(level=0, drop=True)


# Transactions in last 1 hour
fraud_df["transactions_1h"] = _transactions_in_window(fraud_df, "1h")

# Transactions in last 24 hours
fraud_df["transactions_24h"] = _transactions_in_window(fraud_df, "24h")


### 6.3 Time since last transaction

In [12]:
# Gap between each purchase and the same user's previous purchase row.
purchase_gaps = fraud_df.groupby("user_id")["purchase_time"].diff()

# Express the gap in seconds; a user's first row has no predecessor and is
# filled with 0.
fraud_df["time_since_last_tx"] = purchase_gaps.dt.total_seconds().fillna(0)

### 6.4 Drop raw timestamps

In [13]:
# The raw timestamps have been turned into derived features above, so the
# original columns are removed from the frame.
fraud_df = fraud_df.drop(["signup_time", "purchase_time"], axis="columns")

## Step 7: Encoding and scaling

In [14]:
from sklearn.preprocessing import StandardScaler

### 7.1 One-hot encoding


In [15]:
# Columns to expand into indicator (dummy) columns.
categorical_cols = ["browser", "source", "sex", "country"]

# drop_first=True omits the first level of every encoded column.
fraud_df = pd.get_dummies(
    data=fraud_df,
    columns=categorical_cols,
    drop_first=True,
)

### 7.2 Scale numerical features

In [25]:
# Show the current column names as a plain Python list.
print(list(fraud_df.columns))


['user_id', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class', 'hour_of_day', 'day_of_week', 'time_since_signup', 'time_since_last_tx']


In [29]:

# Make sure both timestamp columns hold datetimes.
# NOTE(review): this needs `signup_time` and `purchase_time` to still be in
# the frame, which is only true if the raw data was reloaded after the
# "Drop raw timestamps" step — confirm the intended cell order.
for time_column in ("signup_time", "purchase_time"):
    fraud_df[time_column] = pd.to_datetime(fraud_df[time_column])


In [32]:
# Order each user's purchases chronologically and use the purchase time as
# the index, which is what the time-based rolling windows operate on.
fraud_df = (
    fraud_df
    .sort_values(["user_id", "purchase_time"])
    .set_index("purchase_time")
)

# Per-user rolling transaction counts for a 1-hour and a 24-hour window.
for window, feature in (("1h", "transactions_1h"), ("24h", "transactions_24h")):
    per_user_counts = (
        fraud_df.groupby("user_id")["user_id"].rolling(window).count()
    )
    # Drop the user_id level so only the purchase_time index remains and the
    # result lines up with the frame.
    fraud_df[feature] = per_user_counts.reset_index(level=0, drop=True)

# Turn purchase_time back into a regular column.
fraud_df = fraud_df.reset_index()


### 7.3 Save processed data

In [33]:
# Write the engineered dataset without the index column. The target directory
# is created first because to_csv does not create missing parent directories.
processed_path = Path("../data/processed/fraud_data_processed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
fraud_df.to_csv(processed_path, index=False)