# Exploratory Data Analysis (EDA) - Fraud Detection Dataset (R)

This notebook performs comprehensive exploratory data analysis on the fraud detection dataset using R.


In [1]:
# Load libraries
library(tidyverse)
library(data.table)
library(ggplot2)
library(corrplot)
library(VIM)
library(naniar)

# Set options
options(warn = -1)
set.seed(42)

cat("Libraries loaded successfully!\n")


"package 'ggplot2' was built under R version 4.5.2"
"package 'stringr' was built under R version 4.5.2"
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.0     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: 'data.table'


The following objects are mask

ERROR: Error in library(naniar): there is no package called 'naniar'


In [None]:
# Load data
df <- read.csv("../../data/fraud_data.csv", stringsAsFactors = FALSE)
cat("Data loaded:", dim(df), "\n")
cat("Memory usage:", object.size(df) / 1024^2, "MB\n")


In [None]:
# Basic information
str(df)
summary(df)
head(df, 5)


In [None]:
# Target variable distribution
target_counts <- table(df$isFraud)
target_percentages <- prop.table(target_counts) * 100

cat("Target Distribution:\n")
print(target_counts)
cat("\nPercentages:\n")
print(target_percentages)

# Visualization
par(mfrow = c(1, 2))
barplot(target_counts, main = "Fraud Distribution (Count)", 
        col = c("#3498db", "#e74c3c"), names.arg = c("Legitimate", "Fraud"))
barplot(target_percentages, main = "Fraud Distribution (Percentage)", 
        col = c("#3498db", "#e74c3c"), names.arg = c("Legitimate", "Fraud"),
        ylab = "Percentage (%)")

fraud_rate <- mean(df$isFraud)
cat("\nFraud Rate:", fraud_rate, "(", fraud_rate * 100, "%)\n")
cat("Class Imbalance Ratio:", target_counts[1] / target_counts[2], ":1\n")


In [None]:
# Missing values analysis
missing_data <- df %>%
  summarise_all(~sum(is.na(.))) %>%
  gather(key = "Column", value = "Missing_Count") %>%
  filter(Missing_Count > 0) %>%
  arrange(desc(Missing_Count))

missing_data$Missing_Percentage <- (missing_data$Missing_Count / nrow(df)) * 100

cat("Missing Values Analysis:\n")
cat("Total columns with missing values:", nrow(missing_data), "\n")
cat("Total missing values:", sum(missing_data$Missing_Count), "\n")

if(nrow(missing_data) > 0) {
  print(head(missing_data, 20))
  
  # Visualization
  ggplot(head(missing_data, 30), aes(x = reorder(Column, Missing_Percentage), y = Missing_Percentage)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = "Top 30 Columns with Missing Values", 
         x = "Columns", y = "Missing Percentage (%)") +
    theme_minimal()
  ggsave("../../outputs/figures/missing_values_r.png", width = 12, height = 8, dpi = 300)
}


In [None]:
# Transaction Amount Analysis
if("TransactionAmt" %in% colnames(df)) {
  cat("Transaction Amount Analysis:\n")
  cat("Mean:", mean(df$TransactionAmt, na.rm = TRUE), "\n")
  cat("Median:", median(df$TransactionAmt, na.rm = TRUE), "\n")
  cat("Std:", sd(df$TransactionAmt, na.rm = TRUE), "\n")
  
  # Visualization
  p1 <- ggplot(df, aes(x = TransactionAmt)) +
    geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
    xlim(0, quantile(df$TransactionAmt, 0.99, na.rm = TRUE)) +
    labs(title = "Transaction Amount Distribution", x = "Transaction Amount", y = "Frequency") +
    theme_minimal()
  
  p2 <- ggplot(df, aes(x = as.factor(isFraud), y = TransactionAmt)) +
    geom_boxplot() +
    scale_y_log10() +
    labs(title = "Transaction Amount by Fraud Status", x = "Fraud Status", y = "Transaction Amount") +
    theme_minimal()
  
  # Combine plots
  library(gridExtra)
  grid.arrange(p1, p2, ncol = 2)
  ggsave("../../outputs/figures/transaction_amount_analysis_r.png", width = 16, height = 6, dpi = 300)
  
  # Statistics by fraud status
  fraud_stats <- df %>%
    group_by(isFraud) %>%
    summarise(
      Count = n(),
      Mean = mean(TransactionAmt, na.rm = TRUE),
      Median = median(TransactionAmt, na.rm = TRUE),
      Std = sd(TransactionAmt, na.rm = TRUE)
    )
  print(fraud_stats)
}
