The data for this project comes from IBM's TabFormer work (implemented in Python). It is a good dataset of credit card activity for learning to tell fraudulent transactions apart from valid ones. https://ibm.ent.box.com/v/tabformer-data

I will be using a clustering algorithm to try to group fraudulent activity together and pinpoint activity that may be suspicious.

In [0]:
# Attach the packages used by the notebook. suppressMessages() hides the
# start-up and masking messages so the cell output only shows the two cli
# status lines. glue is called as glue::glue() on the first line because the
# package has not been attached yet at that point.
#
# Usage in the visible cells:
#   glue      - log-message interpolation
#   magrittr  - %>% pipe
#   dplyr     - select/mutate/summarise/join verbs
#   dbplyr    - window_order()
#   sparklyr  - Spark connection, sdf_* and ml_* / ft_* functions
#   ggplot2   - plots
#   caret     - confusionMatrix()
#   pROC      - roc() / coords()
# NOTE(review): readr, dbscan and Rtsne are not referenced in the cells shown
# here; confirm whether they are still needed.
cli::cli_alert(glue::glue("{Sys.time()} - Loading packages"))
suppressMessages(library(glue))
suppressMessages(library(readr))
suppressMessages(library(magrittr))
suppressMessages(library(dplyr))
suppressMessages(library(dbscan))
suppressMessages(library(Rtsne))
suppressMessages(library(ggplot2))
suppressMessages(library(sparklyr))
suppressMessages(library(dbplyr))
suppressMessages(library(caret))
suppressMessages(library(pROC))
cli::cli_alert_success(glue("{Sys.time()} - Packages Loaded"))

In [0]:
# Open the Spark connection that every later cell uses through `sc`.
# method = "databricks" selects sparklyr's Databricks connection mode, so this
# cell is expected to run inside a Databricks notebook/cluster.
cli::cli_alert(glue("{Sys.time()} - Connecting to spark"))
sc <- spark_connect(method = "databricks")
cli::cli_alert_success(glue("{Sys.time()} - Connected to spark"))

In [0]:
# Read the transaction CSV from the hard-coded volume path into Spark.
# `card_activity` is a remote table reference, not a local data frame; later
# dplyr verbs on it are executed by Spark.
# NOTE(review): the file path is passed positionally right after `sc`, which
# is the slot sparklyr documents as `name` (with `path` defaulting to it) -
# confirm this is intended rather than `path = ...`.
cli::cli_alert(glue("{Sys.time()} - Loading Data"))
card_activity <- spark_read_csv(sc, "/Volumes/rando/default/rando_vols/card_transaction.v1(1).csv")
cli::cli_alert_success(glue("{Sys.time()} - Data Loaded"))

In [0]:
# Sanity check of the import: show the first rows, then the column names
# exactly as they are exposed on the Spark table.
card_activity %>%
  head() %>%
  print()

card_activity %>%
  colnames() %>%
  print()

In [0]:
# ---- Data cleaning ----
# Produces `card_activity_cleaned`: the raw table without the Errors column,
# with Amount as a number and Use_Chip / Is_Fraud recoded to numeric codes.
cli::cli_alert(glue("{Sys.time()} - Data cleaning started"))

card_activity_cleaned <- card_activity %>%
  select(-Errors) %>%
  # Strip the dollar sign from Amount before casting it to double.
  mutate(Amount = as.double(regexp_replace(Amount, "\\\\$", ""))) %>%
  mutate(
    # Transaction channel as a numeric code. There is no default branch, so
    # any other value becomes NA and the row is removed by na.omit() below.
    Use_Chip = case_when(
      Use_Chip == "Swipe Transaction" ~ 1,
      Use_Chip == "Online Transaction" ~ 2,
      Use_Chip == "Chip Transaction" ~ 3
    ),
    # "No" is coded 0; every other value falls through to 1.
    Is_Fraud = case_when(
      Is_Fraud == "No" ~ 0,
      TRUE ~ 1
    )
  ) %>%
  # Drop every row that still has a missing value in any remaining column.
  na.omit()

cli::cli_alert_success(glue("{Sys.time()} - Data cleaning finished"))
# ---- Feature engineering: lookup tables ----
# Builds the small per-city, per-user, per-merchant and per-MCC tables that
# are joined onto the transactions afterwards.
cli::cli_alert(glue("{Sys.time()} - Feature engineering started"))

# Integer code per distinct merchant city (1, 2, ... in alphabetical order).
card_activity_cities <- card_activity_cleaned %>%
  select(Merchant_City) %>%
  distinct() %>%
  window_order(Merchant_City) %>%
  mutate(city_row_num = row_number())

# Per-user spending profile used to normalise individual transaction amounts.
user_stats <- card_activity_cleaned %>%
  group_by(User) %>%
  summarize(
    avg_amount = mean(Amount, na.rm = TRUE),
    med_amount = percentile(Amount, 0.5),
    max_amount = max(Amount, na.rm = TRUE),
    min_amount = min(Amount, na.rm = TRUE),
    sd_amount = sd(Amount, na.rm = TRUE)
  )

# "Home" state = the state in which a user has the most transactions.
# The ranking is expressed with window_order() so the ORDER BY of the window
# function is explicit, and Merchant_State is added as a tie-breaker so users
# with equally frequent states get the same home state on every run.
home_states <- card_activity_cleaned %>%
  group_by(User, Merchant_State) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(User) %>%
  window_order(desc(count), Merchant_State) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(User, home_state = Merchant_State)

# NOTE(review): both risk tables below are computed from the Is_Fraud label
# over every row of card_activity_cleaned, i.e. before the train/test split
# made later in the notebook. Rates used as model features therefore contain
# label information from rows that end up in the test set (target leakage).
# Consider computing them on the training split only.

# Transaction volume and observed fraud rate per merchant.
merchant_risk <- card_activity_cleaned %>%
  group_by(Merchant_Name) %>%
  summarise(
    merchant_txn_count = n(),
    merchant_fraud_rate = sum(Is_Fraud) / n()
  )

# Transaction volume and observed fraud rate per merchant category code.
mcc_risk <- card_activity_cleaned %>%
  group_by(MCC) %>%
  summarise(
    mcc_txn_count = n(),
    mcc_fraud_rate = sum(Is_Fraud) / n()
  )

# ---- Feature engineering: per-transaction features ----
# Adds time-based flags, joins the lookup tables built above and derives
# amount-relative features. The result is materialised in Spark under the
# name "card_activity_updated" so later cells do not recompute the joins.
card_activity_updated <- card_activity_cleaned %>%
  mutate(
    hour = hour(Time),
    # The weekday must come from the calendar date, not from `Time`: a
    # time-of-day value carries no date, so dayofweek(Time) cannot describe
    # the day the transaction happened.
    # Assumes the table has integer Year / Month / Day columns (TabFormer
    # schema) - confirm against the printed column names.
    day_of_week = dayofweek(make_date(Year, Month, Day)),
    # Spark's dayofweek(): 1 = Sunday, 7 = Saturday.
    is_weekend = ifelse(
      dayofweek(make_date(Year, Month, Day)) %in% c(1, 7), 1, 0
    ),
    # Night = 21:00 through 06:59.
    is_night = ifelse(hour(Time) >= 21 | hour(Time) <= 6, 1, 0)
  ) %>%
  left_join(card_activity_cities, by = c("Merchant_City")) %>%
  left_join(user_stats, by = c("User")) %>%
  left_join(home_states, by = c("User")) %>%
  left_join(merchant_risk, by = c("Merchant_Name")) %>%
  left_join(mcc_risk, by = c("MCC")) %>%
  mutate(
    # How unusual is this amount relative to the user's own history?
    amount_to_avg_ratio = Amount / avg_amount,
    amount_to_max_ratio = Amount / max_amount,
    amount_min_avg = Amount - avg_amount,
    amount_zscore = (Amount - avg_amount) / sd_amount,
    # 1 when the merchant is outside the user's most frequent state.
    diff_state = ifelse(Merchant_State != home_state, 1, 0)
  ) %>%
  compute("card_activity_updated")

cli::cli_alert_success(glue("{Sys.time()} - Feature engineering finished"))


In [0]:
# ---- Model preparation ----
# Builds a class-rebalanced sample (all fraud rows plus roughly
# `non_fraud_ratio` times as many non-fraud rows), assembles the numeric
# columns into a feature vector, fits an exploratory PCA and creates the
# 70/30 train/test split.
cli::cli_alert(glue("{Sys.time()} - Starting model prep"))

# set.seed() only seeds R's own random number generator. sdf_sample() draws
# inside Spark, so it needs the seed passed explicitly to be reproducible.
sampling_seed <- 123
set.seed(sampling_seed)

# Target number of non-fraud rows per fraud row in the modelling sample.
non_fraud_ratio <- 10

fraud_cases <- card_activity_updated %>%
  filter(Is_Fraud == 1)

non_fraud_cases <- card_activity_updated %>%
  filter(Is_Fraud == 0)

# Derive the sampling fraction from the rows that actually survived cleaning
# instead of hard-coding counts taken from the raw file.
n_fraud <- sdf_nrow(fraud_cases)
n_non_fraud <- sdf_nrow(non_fraud_cases)
stopifnot(n_fraud > 0, n_non_fraud > 0)
sample_fraction <- min(1, (n_fraud * non_fraud_ratio) / n_non_fraud)

non_fraud_sample <- non_fraud_cases %>%
  sdf_sample(
    fraction = sample_fraction,
    replacement = FALSE,
    seed = sampling_seed
  )

final_sample <- fraud_cases %>%
  sdf_bind_rows(non_fraud_sample)

# Every numeric column except the label is used as a model input.
feature_columns <- final_sample %>%
  select(where(is.numeric), -c(Is_Fraud)) %>%
  colnames()

# Single vector column consumed by the classifier formula in the next cell.
assembled <- final_sample %>%
  ft_vector_assembler(
    input_cols = feature_columns,
    output_col = "model_features"
  )

# Exploratory PCA on the same inputs; its output is only inspected here and
# is not fed into the classifier.
pca_pipeline <- ml_pipeline(sc) %>%
  ft_vector_assembler(
    input_cols = feature_columns,
    output_col = "features"
  ) %>%
  ft_pca(
    input_col = "features",
    output_col = "pca_features",
    k = 15
  )

pca_model <- ml_fit(pca_pipeline, final_sample)
pca_stage <- ml_stage(pca_model, "pca")
explained_var <- pca_stage$explained_variance
# Cumulative share relative to the variance captured by the k retained
# components, so the last element is always 1.
# NOTE(review): if the share of the *total* variance is wanted, confirm
# whether explained_variance is already a proportion and drop the division.
total_var <- cumsum(explained_var) / sum(explained_var)

# An explicit seed keeps the split independent of how many R random draws
# happened earlier in the session.
splits <- assembled %>%
  sdf_random_split(training = 0.7, testing = 0.3, seed = sampling_seed)

cli::cli_alert_success(glue("{Sys.time()} - Model prep completed"))

In [0]:
# ---- Training, testing, evaluation ----
# Fits a 100-tree random forest on the training split, scores the test split,
# reports the area under the ROC curve and builds a caret confusion matrix.
cli::cli_alert(glue("{Sys.time()} - Training, Testing, Evaluation"))

rf_model <- ml_random_forest_classifier(
  splits$training,
  formula = Is_Fraud ~ model_features,
  num_trees = 100,
  seed = 123 # make the bootstrap / feature sub-sampling reproducible
)

predictions <- ml_predict(rf_model, splits$testing)

# The evaluator's arguments are `label_col` and `raw_prediction_col`. The
# previous `labeel_col` (typo) and `prediction_col` are not parameters of this
# function, so the intended columns were never actually passed. AUC has to be
# computed from the continuous raw score, not from the hard 0/1 prediction.
eval <- ml_binary_classification_evaluator(
  predictions,
  label_col = "Is_Fraud",
  raw_prediction_col = "rawPrediction",
  metric_name = "areaUnderROC"
)
# The newline is emitted by cat() itself rather than through the template.
cat(glue("Area under ROC: {eval}"), "\n", sep = "")

# Bring only the two columns needed for the confusion matrix back into R.
predict_local <- predictions %>%
  select(Is_Fraud, prediction) %>%
  collect()

# caret expects factors with identical levels for truth and prediction.
class_levels <- c(0, 1)
class_labels <- c("Non-Fraud", "Fraud")
predict_local$Is_Fraud <- factor(
  predict_local$Is_Fraud,
  levels = class_levels,
  labels = class_labels
)
predict_local$prediction <- factor(
  predict_local$prediction,
  levels = class_levels,
  labels = class_labels
)

conf_matrix <- confusionMatrix(
  predict_local$prediction,
  predict_local$Is_Fraud,
  positive = "Fraud"
)

print(conf_matrix)

cli::cli_alert_success(glue("{Sys.time()} - Model completed"))

In [0]:
# ---- Confusion matrix heat map ----
# Turns the caret confusion table into a tile plot and saves it as a PNG.
cli::cli_alert(glue("{Sys.time()} - Creating confusion matrix"))
conf_data <- as.data.frame(conf_matrix$table)
colnames(conf_data) <- c("Predicted", "Reference", "Freq")

conf_plot <- ggplot(
  data = conf_data,
  aes(x = Reference, y = Predicted, fill = Freq)
) +
  geom_tile() +
  geom_text(aes(label = sprintf("%d", Freq)), vjust = 1) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal() +
  labs(title = "Confusion Matrix") +
  theme(plot.title = element_text(hjust = 0.5))

# Assigning the plot suppresses auto-printing, so show it explicitly.
print(conf_plot)

# Pass the plot explicitly instead of relying on "the last plot", and use a
# white background like the feature-importance figure: theme_minimal() draws
# no background of its own.
ggsave("confusion_matrix.png", plot = conf_plot, bg = "white")

In [0]:
# ---- Feature importance ----
# Plots the ten most important features of the fitted random forest and saves
# the figure as a PNG.
cli::cli_alert(glue("{Sys.time()} - Plotting feature importance"))
importance <- ml_feature_importances(rf_model)

# ml_feature_importances() returns a data frame (feature, importance) for a
# model fitted through the formula interface and a bare numeric vector
# otherwise. unlist() on the data frame would mix feature labels and values
# into one vector, giving NA importances and wrong labels, so the two shapes
# are handled separately.
if (is.data.frame(importance)) {
  imp_values <- stats::setNames(
    as.numeric(importance$importance),
    as.character(importance$feature)
  )
} else {
  imp_values <- unlist(importance)
  if (is.null(names(imp_values))) {
    # Unnamed vector: fall back to positional labels so the plot has an axis.
    names(imp_values) <- paste0("feature_", seq_along(imp_values))
  }
}

importance_df <- data.frame(
  feature = names(imp_values),
  importance = as.numeric(imp_values)
)
importance_df <- importance_df %>% arrange(desc(importance))

# head() returns fewer rows when there are fewer than ten features, whereas
# [1:10, ] would pad the data with NA rows.
top_features <- head(importance_df, 10)

importance_plot <- ggplot(
  top_features,
  aes(x = reorder(feature, importance), y = importance)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Feature Importance",
    x = "Features",
    y = "Importance"
  )

# Assigning the plot suppresses auto-printing, so show it explicitly.
print(importance_plot)

ggsave("feature_import.png", plot = importance_plot, bg = "white")

In [0]:
# ---- False positive rate vs. decision threshold ----
# Builds a ROC object from the predicted fraud probability and plots how the
# false positive rate changes with the probability threshold.
cli::cli_alert(glue("{Sys.time()} - Plotting FPR"))

# False positive rate at the model's default decision threshold.
fpr <- 1 - conf_matrix$byClass["Specificity"]
#cat(glue("False Positive Rate: {fpr}\n"))

prob_local <- predictions %>%
  select(Is_Fraud, probability) %>%
  collect()

# `probability` is collected as a list column with one entry per row; the
# second element is the probability of class 1 (fraud). vapply() guarantees a
# plain numeric vector, and [[2]] works for both atomic and list entries.
prob_local$prob_fraud <- vapply(
  prob_local$probability,
  function(p) as.numeric(p[[2]]),
  numeric(1)
)

# State the class order and direction explicitly (controls = 0, cases = 1,
# higher score = more likely fraud) so pROC does not auto-detect them.
roc_object <- roc(
  response = prob_local$Is_Fraud,
  predictor = prob_local$prob_fraud,
  levels = c(0, 1),
  direction = "<"
)
roc_coords <- coords(roc_object, "all")
thresholds <- roc_coords$threshold
fpr_values <- 1 - roc_coords$specificity
threshold_data <- data.frame(
  Threshold = thresholds,
  FPR = fpr_values,
  TPR = roc_coords$sensitivity
)
# pROC adds -Inf / Inf sentinel thresholds at both ends; they are not real
# probability cut-offs, so keep them off the x axis.
threshold_data <- threshold_data[is.finite(threshold_data$Threshold), ]

fpr_plot <- ggplot(threshold_data, aes(x = Threshold, y = FPR)) +
  geom_line(color = "red", linewidth = 1.2) +
  theme_minimal() +
  labs(
    title = "False Positive Rate vs Threshold",
    x = "Probability Threshold",
    y = "False Positive Rate"
  )

# Assigning the plot suppresses auto-printing, so show it explicitly.
print(fpr_plot)

# Explicit plot and white background, matching the feature-importance figure.
ggsave("fp_rate.png", plot = fpr_plot, bg = "white")