# Machine Learning Analysis - Fraud Detection (R)

This notebook trains and compares three fraud-detection classifiers in R: logistic regression, random forest, and XGBoost.


In [None]:
# Load libraries
library(tidyverse)
library(caret)
library(randomForest)
library(xgboost)
library(pROC)
library(ROCR)

# Load data
# Read the transaction table from a path relative to the notebook's working
# directory; text columns are kept as character vectors, not factors.
df <- read.csv(
  file = "../../data/fraud_data.csv",
  stringsAsFactors = FALSE
)

# Print the row and column counts, then the mean of the isFraud column.
cat("Data loaded:", nrow(df), ncol(df), "\n")
cat("Fraud rate:", mean(df$isFraud), "\n")


In [None]:
# Prepare data
# Candidate predictors; only the ones actually present in df are kept, so the
# cell still runs on a data file that lacks some of these columns.
key_features <- c("TransactionAmt", "card1", "card2", "card3", "card5",
                  "addr1", "addr2", "dist1", "dist2")
key_features <- key_features[key_features %in% colnames(df)]

# Select features (single-bracket indexing keeps X a data frame) and target
X <- df[key_features]
y <- df$isFraud

# Handle missing values: fill each column's NAs with that column's own median.
# Assigning the vector of medians through X[is.na(X)] would hand the medians
# out in sequence across the NA cells (or stop with "'value' is the wrong
# length"), so the imputation is done one column at a time instead.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split performed later in this notebook.
X[] <- lapply(X, function(column) {
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
})

cat("Features selected:", length(key_features), "\n")
cat("X shape:", dim(X), "\n")


In [None]:
# Split data
# Fixed seed so the 80/20 partition is reproducible.
set.seed(42)

# Pass the label as a factor: createDataPartition() samples within the levels
# of a factor, which keeps the fraud rate balanced between the two splits.
# For a numeric y it bins by percentiles instead, which does not separate the
# two classes when one of them is rare.
trainIndex <- createDataPartition(factor(y), p = 0.8, list = FALSE)

# drop = FALSE keeps the feature sets as data frames even when only a single
# feature column was selected.
X_train <- X[trainIndex, , drop = FALSE]
X_test <- X[-trainIndex, , drop = FALSE]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

cat("Train set:", dim(X_train), "Fraud rate:", mean(y_train), "\n")
cat("Test set:", dim(X_test), "Fraud rate:", mean(y_test), "\n")


In [None]:
# Model 1: Logistic Regression
cat("Training Logistic Regression...\n")
# Fit a binomial GLM of isFraud on every selected feature.
lr_model <- glm(isFraud ~ ., data = cbind(X_train, isFraud = y_train),
                family = binomial)
# type = "response" returns predicted probabilities rather than log-odds.
lr_pred <- predict(lr_model, newdata = X_test, type = "response")
# Hard labels at a 0.5 probability cut-off.
lr_pred_class <- ifelse(lr_pred > 0.5, 1, 0)

cat("Logistic Regression trained!\n")
# Fix the class order (0 = control, 1 = case) and the direction explicitly:
# left to auto-detect, roc() may flip the comparison for a model that ranks
# worse than chance and report an optimistic AUC, and it prints a message
# about the choices it made.
lr_roc <- roc(y_test, lr_pred, levels = c(0, 1), direction = "<", quiet = TRUE)
cat("AUC-ROC:", auc(lr_roc), "\n")
cat("Accuracy:", mean(lr_pred_class == y_test), "\n")


In [None]:
# Model 2: Random Forest
cat("Training Random Forest...\n")
# The response is converted to a factor so randomForest fits a classifier
# rather than a regression forest.
rf_model <- randomForest(as.factor(isFraud) ~ .,
                         data = cbind(X_train, isFraud = y_train),
                         ntree = 100, mtry = sqrt(ncol(X_train)))
# Column 2 of the probability matrix belongs to the second factor level
# (class 1 when the labels are 0/1).
rf_pred <- predict(rf_model, newdata = X_test, type = "prob")[, 2]
rf_pred_class <- predict(rf_model, newdata = X_test)

cat("Random Forest trained!\n")
# Explicit levels/direction: avoids roc()'s auto-detection, which can flip the
# comparison for a worse-than-chance model and prints informational messages.
rf_roc <- roc(y_test, rf_pred, levels = c(0, 1), direction = "<", quiet = TRUE)
cat("AUC-ROC:", auc(rf_roc), "\n")
# Compare labels as character strings: `==` on two factors stops with an error
# when their level sets differ (e.g. a test split containing only one class).
cat("Accuracy:",
    mean(as.character(rf_pred_class) == as.character(y_test)), "\n")


In [None]:
# Model 3: XGBoost
cat("Training XGBoost...\n")
# xgboost consumes numeric matrices wrapped in its own DMatrix container.
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)

# Train through xgb.train(), the package's core training entry point, which
# takes a DMatrix plus a parameter list.
# NOTE(review): the xgboost() convenience wrapper no longer accepts
# `data = <DMatrix>` in recent package releases (it was redesigned around
# x/y arguments) -- confirm against the installed version.
xgb_model <- xgb.train(
  params = list(
    max_depth = 6,
    eta = 0.1,
    objective = "binary:logistic",
    eval_metric = "auc"
  ),
  data = dtrain,
  nrounds = 100,
  verbose = 0
)
# With the binary:logistic objective, predict() returns probabilities.
xgb_pred <- predict(xgb_model, dtest)
# Hard labels at a 0.5 probability cut-off.
xgb_pred_class <- ifelse(xgb_pred > 0.5, 1, 0)

cat("XGBoost trained!\n")
# Explicit levels/direction: avoids roc()'s auto-detection, which can flip the
# comparison for a worse-than-chance model and prints informational messages.
xgb_roc <- roc(y_test, xgb_pred, levels = c(0, 1), direction = "<",
               quiet = TRUE)
cat("AUC-ROC:", auc(xgb_roc), "\n")
cat("Accuracy:", mean(xgb_pred_class == y_test), "\n")


In [None]:
# Model comparison
# Predicted fraud probabilities on the test set, keyed by display name.
models <- list(
  "Logistic Regression" = lr_pred,
  "Random Forest" = rf_pred,
  "XGBoost" = xgb_pred
)

# ROC curves, built the same way for every model. Levels and direction are
# fixed so roc() never flips the comparison for an individual model.
roc_curves <- lapply(models, function(pred) {
  roc(y_test, pred, levels = c(0, 1), direction = "<", quiet = TRUE)
})

# One colour per model, shared by the curves and the legend so the legend
# keys always match the lines they describe. The first curve is red and the
# remaining ones take palette entries 3, 4, ...
curve_cols <- c("red", palette()[seq_along(models)[-1] + 1L])

# Plot ROC curves: the first call opens the plot, the rest are overlaid.
plot(roc_curves[[1]], main = "ROC Curve Comparison", col = curve_cols[1])
# seq_along(...)[-1] is empty when there is only one curve, unlike 2:length().
for (i in seq_along(roc_curves)[-1]) {
  lines(roc_curves[[i]], col = curve_cols[i])
}
legend("bottomright", legend = names(models), col = curve_cols, lwd = 2)

# Summary table of test-set AUC per model; vapply guarantees a numeric column.
comparison <- data.frame(
  Model = names(models),
  AUC_ROC = vapply(roc_curves, function(r) as.numeric(auc(r)), numeric(1))
)
print(comparison)
cat("\nML Analysis Complete!\n")
