# Fraud Detection Model Development

**Objective:** Build production-ready anomaly-detection models that flag fraudulent transactions.

## Data Loading

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's global generator so any later random draws are repeatable.
np.random.seed(42)

# Read the raw transactions and compute the share of rows flagged as fraud.
df = pd.read_csv('transactions.csv')
fraud_rate = df['is_fraud'].mean()

print(f"Dataset shape: {df.shape}")
print(f"\nFraud rate: {fraud_rate:.2%}")

# Show the first rows as the cell's rich output.
df.head()

Dataset shape: (100000, 15)

Fraud rate: 2.00%


Unnamed: 0,transaction_id,customer_id,card_number,timestamp,amount,merchant_id,merchant_category,merchant_lat,merchant_long,is_fraud,fraud_type,hour,day_of_week,month,distance_from_home
0,TXN_00000000,CUST_00861,CARD_18969,2025-08-20T00:23:18Z,3117.77,MERCHANT_255,electronics,22.789,88.6956,1,account_takeover,0,2,8,36.03
1,TXN_00000001,CUST_03773,CARD_96975,2025-09-13T21:39:57Z,840.86,MERCHANT_212,electronics,13.0835,77.4841,0,none,21,5,9,5.07
2,TXN_00000002,CUST_03093,CARD_39004,2025-09-05T12:25:30Z,4070.5,MERCHANT_577,luxury_goods,18.6067,72.4481,1,merchant_collusion,12,4,9,15.6
3,TXN_00000003,CUST_00467,CARD_83329,2025-06-20T12:58:11Z,1176.75,MERCHANT_472,grocery,28.3219,76.7205,0,none,12,4,6,8.62
4,TXN_00000004,CUST_04427,CARD_75134,2025-07-28T19:38:56Z,306.8,MERCHANT_486,gas,28.316,76.6884,0,none,19,0,7,15.82


## Feature Engineering

In [2]:
from sklearn.preprocessing import LabelEncoder

def engineer_features(data, category_encoder=None, high_amount_threshold=None):
    """Derive the model features from a raw transactions frame.

    The input is copied first, so ``data`` itself is never modified.

    Two of the features depend on statistics of the data set rather than on
    the row alone: the integer code of ``merchant_category`` and the cut-off
    that defines a "high" amount. By default both are derived from ``data``
    (the original behaviour). Pass ``category_encoder`` and
    ``high_amount_threshold`` to reuse values obtained from the training data,
    so that a test split or a single transaction is encoded the same way.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``amount``, ``merchant_category``, ``hour``,
        ``day_of_week`` and ``distance_from_home``.
    category_encoder : object, optional
        Already-fitted encoder exposing ``transform(values)``; used to produce
        ``merchant_category_encoded``. When ``None`` a fresh ``LabelEncoder``
        is fitted on ``data['merchant_category']``.
    high_amount_threshold : float, optional
        Amounts strictly above this value count as "high" for
        ``high_amount_late_night``. When ``None`` the 90th percentile of
        ``data['amount']`` is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with eleven engineered columns appended.
    """
    df_features = data.copy()

    # 1. Amount features
    df_features['amount_log'] = np.log1p(df_features['amount'])
    df_features['amount_squared'] = df_features['amount'] ** 2

    # 2. Categorical features
    df_features['is_high_risk_category'] = df_features['merchant_category'].isin(
        ['jewelry', 'luxury_goods']
    ).astype(int)

    # Encode merchant category. Refitting on every call makes the codes depend
    # on which categories happen to be present, so a caller-supplied fitted
    # encoder is preferred whenever the frame is not the training data.
    if category_encoder is None:
        encoded_category = LabelEncoder().fit_transform(df_features['merchant_category'])
    else:
        encoded_category = category_encoder.transform(df_features['merchant_category'])
    df_features['merchant_category_encoded'] = encoded_category

    # 3. Temporal features
    # Late night wraps around midnight: 23:00 through 04:59.
    df_features['is_late_night'] = ((df_features['hour'] >= 23) | (df_features['hour'] <= 4)).astype(int)
    df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)
    df_features['is_business_hours'] = ((df_features['hour'] >= 9) & (df_features['hour'] <= 17)).astype(int)

    # 4. Distance features
    df_features['distance_risk'] = (df_features['distance_from_home'] > 50).astype(int)
    df_features['distance_log'] = np.log1p(df_features['distance_from_home'])

    # 5. Interaction features
    df_features['amount_distance_interaction'] = df_features['amount'] * df_features['distance_from_home']

    # With a one-row frame the 90th percentile equals the row's own amount, so
    # the strict ">" below can never be true; an explicit threshold avoids that.
    if high_amount_threshold is None:
        high_amount_threshold = df_features['amount'].quantile(0.9)
    df_features['high_amount_late_night'] = (
        (df_features['amount'] > high_amount_threshold) &
        (df_features['is_late_night'] == 1)
    ).astype(int)

    return df_features

# NOTE(review): features are engineered on the full dataset, ahead of the
# train/test split performed later, so the statistics engineer_features derives
# from its input (category codes, 90th-percentile amount cut-off) are computed
# with the test rows included.
df_engineered = engineer_features(df)
print("Feature engineering completed")
print(f"Original features: {df.shape[1]}")
print(f"Engineered features: {df_engineered.shape[1]}")
print("\nNew features created:")
new_features = set(df_engineered.columns) - set(df.columns)
# A set of strings iterates in hash order, which differs between kernel
# sessions; list the new columns in the order they appear in the frame so the
# printed output is reproducible.
print([col for col in df_engineered.columns if col in new_features])

Feature engineering completed
Original features: 15
Engineered features: 26

New features created:
['distance_log', 'amount_squared', 'is_weekend', 'merchant_category_encoded', 'amount_distance_interaction', 'high_amount_late_night', 'amount_log', 'is_business_hours', 'is_high_risk_category', 'is_late_night', 'distance_risk']


In [3]:
from sklearn.model_selection import train_test_split

# Columns fed to the models, one group per kind of signal.
feature_columns = [
    # transaction amount and its transforms
    'amount',
    'amount_log',
    'amount_squared',
    # merchant category
    'merchant_category_encoded',
    'is_high_risk_category',
    # merchant location
    'merchant_lat',
    'merchant_long',
    # raw time parts
    'hour',
    'day_of_week',
    'month',
    # derived time flags
    'is_late_night',
    'is_weekend',
    'is_business_hours',
    # distance from the customer's home
    'distance_from_home',
    'distance_log',
    'distance_risk',
    # interactions
    'amount_distance_interaction',
    'high_amount_late_night',
]

y = df_engineered['is_fraud']
X = df_engineered[feature_columns]

# Hold out 20% of the rows; stratifying on the label keeps the fraud rate the
# same in both parts, and the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Training fraud rate: {y_train.mean():.2%}")
print(f"Test fraud rate: {y_test.mean():.2%}")

Training set: 80000 samples
Test set: 20000 samples
Training fraud rate: 2.00%
Test fraud rate: 2.00%


In [5]:
from sklearn.preprocessing import StandardScaler
import joblib

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep only the scaled training rows whose label is 0 (non-fraud). The mask is
# applied positionally to the NumPy array returned by the scaler.
X_train_normal = X_train_scaled[y_train == 0]
print(f"\nNormal transactions for training: {X_train_normal.shape[0]}")

# Writing into a directory that does not exist raises FileNotFoundError, so
# create models/ first; this makes the cell work on a fresh checkout.
scaler_path = Path('models/scaler.pkl')
scaler_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, scaler_path)
print("Scaler saved successfully")


Normal transactions for training: 78400
Scaler saved successfully
