In [None]:
# ============================================
# Load Fraud Detection Data from Amazon S3
# ============================================

# 1. Import required libraries
import boto3   # AWS SDK for Python - allows interaction with S3
import pandas as pd  # Data handling library

# 2. Where the dataset lives in S3, and where to put it locally
bucket = "ml-rvce-us-east-1"                # S3 bucket holding the raw data
key = "fraud-detection/raw/creditcard.csv"  # object key of the CSV inside that bucket
local_path = "creditcard.csv"               # local copy, relative to the working directory

# 3. S3 client (credentials/region are resolved by boto3's default chain)
s3 = boto3.client("s3")

# 4. Copy the object from S3 to the local file.
#    Keyword arguments make the (bucket, key, destination) roles explicit.
s3.download_file(Bucket=bucket, Key=key, Filename=local_path)

# 5. Parse the downloaded CSV into a DataFrame
df = pd.read_csv(local_path)

# 6. Preview the first 5 rows
df.head()




In [7]:
# ============================================
# Preprocessing: Feature Engineering
# ============================================

import numpy as np
from sklearn.preprocessing import StandardScaler

# 1. Standardise the "Amount" column
# --------------------------------------------
# - StandardScaler rescales the column to mean 0 / standard deviation 1.
# - The scaler needs 2-D input, hence the one-column DataFrame (double brackets).
# - NOTE(review): the scaler is fit on every row of df, before the train/test
#   split performed in a later cell, so held-out rows contribute to the
#   scaling statistics. Confirm this is acceptable for the evaluation.
scaler = StandardScaler()
amount_2d = df[['Amount']]
df['Amount_Scaled'] = scaler.fit_transform(amount_2d)

# 2. Derive an "Hour" feature from the Time column
# --------------------------------------------
# - 'Time' is described as seconds elapsed since the first transaction.
# - Floor-dividing by 3600 gives whole hours since the start; taking that
#   modulo 24 folds it into the range 0–23.
# - NOTE(review): this equals the clock hour only if the first transaction
#   happened at midnight — otherwise it is an hour-of-day with unknown offset.
df['Hour'] = df['Time'].floordiv(3600).mod(24)



In [8]:
# ============================================
# Feature Engineering: Behavior-Based Features
# ============================================

# 1. Typical transaction size for each hour bucket
# -------------------------------------------------
# - Group the 'Amount' values by the row's 'Hour' and take the group mean.
# - transform('mean') broadcasts each group's mean back onto its member rows,
#   so the result has one value per row of df.
df['MeanAmountByHour'] = df['Amount'].groupby(df['Hour']).transform('mean')

# 2. How large is this transaction relative to its hour's average?
# ---------------------------------------------------------------------
# - ratio > 1 → larger than the hour's mean; ratio < 1 → smaller.
# - Example: hour mean = $20 and transaction = $2000 → ratio ≈ 100.
# - The 1e-6 added to the denominator guards against division by zero.
df['Amount_vs_HourlyMean'] = df['Amount'].div(df['MeanAmountByHour'] + 1e-6)



In [None]:
# ============================================
# Train-Test Split and Save to S3
# ============================================

from sklearn.model_selection import train_test_split

# 1. Separate features (X) and target (y)
# ---------------------------------------
# - 'Class' is the target variable (0 = normal, 1 = fraud)
# - X keeps every other column of df (original columns plus the engineered ones)
X = df.drop(columns=['Class'])
y = df['Class']

# 2. Split into training and testing sets
# ---------------------------------------
# - test_size=0.2 → 20% for testing, 80% for training
# - stratify=y → preserves fraud/non-fraud ratio in both sets
# - random_state=42 → reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Recombine features + target for saving
# -----------------------------------------
# - concat along axis=1 with the target listed first → 'Class' is the first column
# - Both frames are built the same way so train and test share one column layout.
train = pd.concat([y_train, X_train], axis=1)
test = pd.concat([y_test, X_test], axis=1)

# NOTE(review): the cell title mentions saving to S3, but no write/upload
# happens here — `train` and `test` only exist in memory at this point.


In [10]:
import joblib
from xgboost import XGBClassifier

# Fit the classifier on the training split.
# - scale_pos_weight=10 up-weights the positive (fraud) class.
# - random_state=42 keeps the fit reproducible.
xgb_params = {"scale_pos_weight": 10, "random_state": 42}
model = XGBClassifier(**xgb_params).fit(X_train, y_train)

# Persist the fitted model next to the notebook.
# joblib.dump returns the list of files it wrote, which the cell displays.
model_path = "fraud_model.pkl"
joblib.dump(model, model_path)


['fraud_model.pkl']