In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Load the data you cleaned in the EDA notebook
fraud_df = pd.read_csv('../data/processed/fraud_data_with_country.csv')

In [3]:
print(fraud_df.columns)

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class',
       'country'],
      dtype='object')


In [4]:
# 1. Time-based features [cite: 111]
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

In [5]:
# Calculate time difference in seconds [cite: 114]
fraud_df['time_since_signup'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds()

In [6]:
# Calculate time difference in seconds [cite: 114]
fraud_df['time_since_signup'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds()

In [7]:
# 2. Transaction frequency and velocity [cite: 110]
# Count how many times the same device was used
fraud_df['device_usage_count'] = fraud_df.groupby('device_id')['device_id'].transform('count')

In [8]:
# Count how many times the same IP was used
fraud_df['ip_usage_count'] = fraud_df.groupby('ip_address')['ip_address'].transform('count')

Data Transformation


In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# 1. Handle Categorical Variables (One-Hot Encoding)
# This prevents "ValueError: could not convert string to float"
categorical_cols = ['source', 'browser', 'sex', 'country']
# We use existing_cols to avoid KeyError if you've already dropped some
existing_cols = [c for c in categorical_cols if c in fraud_df.columns]

if existing_cols:
    fraud_df = pd.get_dummies(fraud_df, columns=existing_cols, drop_first=True)

# 2. Handle Missing Values (Imputation)
# Especially important for 'country' if some IPs didn't map correctly
fraud_df = fraud_df.fillna(0)

# 3. Normalization (Scaling)
# We scale numeric features so that 'purchase_value' doesn't overpower 'hour_of_day'
numeric_features = ['purchase_value', 'time_since_signup', 'device_usage_count', 'ip_usage_count']
scaler = MinMaxScaler()
fraud_df[numeric_features] = scaler.fit_transform(fraud_df[numeric_features])

# 4. Save the ENTIRE DataFrame
# This ensures Task 2 (Modeling) has all the data it needs
output_path = '../data/processed/fraud_features_final.csv'
fraud_df.to_csv(output_path, index=False)

print(f"✅ Success! Saved {fraud_df.shape[1]} columns to {output_path}")
print(f"Sample of columns now available: {list(fraud_df.columns[:10])}")

✅ Success! Saved 199 columns to ../data/processed/fraud_features_final.csv
Sample of columns now available: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'age', 'ip_address', 'class', 'time_since_signup', 'device_usage_count']


Handling Class Imbalance

In [10]:
# Separate features from target
# Added errors='ignore' so it won't crash if 'country' was already encoded
X = fraud_df.drop(['class', 'user_id', 'signup_time', 'purchase_time', 'device_id', 'country', 'ip_address'], axis=1, errors='ignore')

y = fraud_df['class']

print("Features (X) columns:", X.columns.tolist())

Features (X) columns: ['purchase_value', 'age', 'time_since_signup', 'device_usage_count', 'ip_usage_count', 'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_M', 'country_Albania', 'country_Algeria', 'country_Angola', 'country_Antigua and Barbuda', 'country_Argentina', 'country_Armenia', 'country_Australia', 'country_Austria', 'country_Azerbaijan', 'country_Bahamas', 'country_Bahrain', 'country_Bangladesh', 'country_Barbados', 'country_Belarus', 'country_Belgium', 'country_Belize', 'country_Benin', 'country_Bermuda', 'country_Bhutan', 'country_Bolivia', 'country_Bonaire; Sint Eustatius; Saba', 'country_Bosnia and Herzegowina', 'country_Botswana', 'country_Brazil', 'country_British Indian Ocean Territory', 'country_Brunei Darussalam', 'country_Bulgaria', 'country_Burkina Faso', 'country_Burundi', 'country_Cambodia', 'country_Cameroon', 'country_Canada', 'country_Cape Verde', 'country_Cayman Islands', 'country_Chile', 'country_China'

In [11]:
# Assuming 'fraud_df' is your main dataframe containing all features and the 'class' column

# 1. Verify the columns we have created
print("Columns to be saved:", fraud_df.columns.tolist())

# 2. Define the output path
# We save it to 'data/processed' so the modeling notebook can find it
output_path = '../data/processed/fraud_features_final.csv'

# 3. Save the entire DataFrame
# index=False prevents pandas from adding an extra 'Unnamed: 0' column
fraud_df.to_csv(output_path, index=False)

print(f"Successfully saved {fraud_df.shape[0]} rows and {fraud_df.shape[1]} columns to {output_path}")

Columns to be saved: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'age', 'ip_address', 'class', 'time_since_signup', 'device_usage_count', 'ip_usage_count', 'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_M', 'country_Albania', 'country_Algeria', 'country_Angola', 'country_Antigua and Barbuda', 'country_Argentina', 'country_Armenia', 'country_Australia', 'country_Austria', 'country_Azerbaijan', 'country_Bahamas', 'country_Bahrain', 'country_Bangladesh', 'country_Barbados', 'country_Belarus', 'country_Belgium', 'country_Belize', 'country_Benin', 'country_Bermuda', 'country_Bhutan', 'country_Bolivia', 'country_Bonaire; Sint Eustatius; Saba', 'country_Bosnia and Herzegowina', 'country_Botswana', 'country_Brazil', 'country_British Indian Ocean Territory', 'country_Brunei Darussalam', 'country_Bulgaria', 'country_Burkina Faso', 'country_Burundi', 'country_Cambodia', 'country_Cameroon', 'country_Canada', 'c