# Statistical Analysis - Fraud Detection Dataset (R)

This notebook performs comprehensive statistical analysis including:

1. **Descriptive Statistics** - Mean, median, mode, standard deviation, variance, skewness, kurtosis
2. **Inferential Statistics** - Hypothesis testing (Mann-Whitney U test, chi-square test) and t-based confidence intervals for the mean
3. **Exploratory Statistics** - Correlation analysis, feature relationships, statistical tests


In [None]:
# Load libraries
library(tidyverse)
library(car)
library(psych)
library(corrplot)
library(ggplot2)
library(gridExtra)
library(dplyr)

# Set options
# Show warnings at the point where they occur (warn = 1) instead of
# discarding them (warn = -1): chisq.test(), wilcox.test() and cor(), all
# used further down, rely on warnings to flag unreliable approximations.
options(warn = 1)
set.seed(42)

cat("Libraries loaded successfully!\n")

# Load data
# The path is relative to the working directory of the R session.
data_path <- "../../data/fraud_data.csv"
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}
df <- read.csv(data_path, stringsAsFactors = FALSE)
cat("Data loaded:", dim(df), "\n")

# The analysis cells below group and subset by isFraud, so fail early with
# a clear message when the target column is absent.
if (!"isFraud" %in% colnames(df)) {
  stop("Column 'isFraud' not found in ", data_path, call. = FALSE)
}
cat("Target variable distribution:\n")
print(table(df$isFraud))


In [None]:
# Descriptive statistics
# Candidate columns to summarise; only those actually present in df are kept
# (order follows the candidate list).
key_features <- intersect(
  c("TransactionAmt", "card1", "card2", "card3", "card5"),
  colnames(df)
)

if (length(key_features) > 0) {
  # Overall per-column summary table produced by psych::describe()
  desc_stats <- psych::describe(df[key_features])
  print(desc_stats)

  # TransactionAmt summary computed separately for each isFraud value;
  # missing amounts are ignored in every statistic (na.rm = TRUE).
  if ("TransactionAmt" %in% key_features) {
    fraud_stats <- summarise(
      group_by(df, isFraud),
      Mean = mean(TransactionAmt, na.rm = TRUE),
      Median = median(TransactionAmt, na.rm = TRUE),
      Std = sd(TransactionAmt, na.rm = TRUE),
      Skewness = psych::skew(TransactionAmt, na.rm = TRUE),
      Kurtosis = psych::kurtosi(TransactionAmt, na.rm = TRUE)
    )
    print(fraud_stats)
  }
}


In [None]:
# Inferential statistics: Mann-Whitney U test on TransactionAmt between the
# two isFraud classes, plus t-based confidence intervals for each class mean
if ("TransactionAmt" %in% colnames(df)) {
  fraud_amt <- df$TransactionAmt[df$isFraud == 1]
  legit_amt <- df$TransactionAmt[df$isFraud == 0]

  # Usable (non-missing) observations per class. wilcox.test() and t.test()
  # stop with an error when a sample is too small, which would abort the
  # whole cell, so each analysis is guarded separately.
  n_fraud <- sum(!is.na(fraud_amt))
  n_legit <- sum(!is.na(legit_amt))

  # Mann-Whitney U test (non-parametric); needs at least one value per class
  if (n_fraud > 0 && n_legit > 0) {
    test_result <- wilcox.test(fraud_amt, legit_amt, alternative = "two.sided")
    cat("Mann-Whitney U Test:\n")
    print(test_result)
  } else {
    cat("Mann-Whitney U Test skipped: a class has no TransactionAmt values\n")
  }

  # Confidence intervals for the mean, taken from one-sample t.test() at its
  # default 95% level; t.test() needs at least two values per sample
  if (n_fraud > 1 && n_legit > 1) {
    fraud_ci <- t.test(fraud_amt)$conf.int
    legit_ci <- t.test(legit_amt)$conf.int
    cat("\n95% Confidence Intervals:\n")
    cat("Fraud transactions:", fraud_ci, "\n")
    cat("Legitimate transactions:", legit_ci, "\n")
  } else {
    cat("\nConfidence intervals skipped: each class needs at least 2 values\n")
  }
}


## 2. Inferential Statistics


In [None]:
# Chi-square test on the ProductCD x isFraud contingency table
if ("ProductCD" %in% names(df)) {
  # Cross-tabulate: rows are ProductCD values, columns are isFraud values
  contingency_table <- table(df$ProductCD, df$isFraud)

  # Run the test first, then report the result followed by the raw counts
  chi_test <- chisq.test(contingency_table)

  cat("Chi-square Test: ProductCD and Fraud\n")
  print(chi_test)
  print(contingency_table)
}


In [None]:
# Correlation analysis
# Candidate columns; only those present in df are kept.
key_features <- c(
  "TransactionAmt", "card1", "card2", "card3", "card5", "isFraud"
)
key_features <- key_features[key_features %in% colnames(df)]

# cor() stops with an error on non-numeric input, so only the numeric
# candidates are passed to it; anything skipped is reported.
is_numeric_col <- vapply(df[key_features], is.numeric, logical(1))
corr_features <- key_features[is_numeric_col]
if (any(!is_numeric_col)) {
  cat(
    "Skipping non-numeric columns:",
    paste(key_features[!is_numeric_col], collapse = ", "),
    "\n"
  )
}

if (length(corr_features) > 1) {
  # Rows with a missing value in any selected column are dropped
  # (use = "complete.obs") before the correlations are computed.
  corr_matrix <- cor(df[corr_features], use = "complete.obs")

  # Visualization: upper triangle, variables reordered by hierarchical
  # clustering
  corrplot(corr_matrix, method = "color", type = "upper",
           order = "hclust", tl.cex = 0.8, tl.col = "black")

  # Correlation with fraud, largest first (includes isFraud with itself)
  if ("isFraud" %in% corr_features) {
    fraud_corr <- corr_matrix[, "isFraud"]
    fraud_corr <- sort(fraud_corr, decreasing = TRUE)
    cat("Correlation with Fraud:\n")
    print(fraud_corr)
  }
}


## 3. Exploratory Statistics
