# Descriptive, Inferential, and Exploratory Statistical Analysis

## Cancer Incidence Data Analysis (R)

This notebook covers:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis

In [None]:
# Load libraries
library(readr)
library(dplyr)
library(ggplot2)
library(psych)
library(e1071)

# Source project helper scripts (data loading and descriptive statistics)
source('../../scripts/r/data_loader.R')
source('../../scripts/r/descriptive_stats.R')

In [None]:
# Read the raw incidence CSV, then pass it through the project's cleaning
# helper; both helpers are expected to come from the scripts sourced earlier.
df <- load_data('../../data/incd.csv')
df_clean <- clean_data(df)

# Report the dimensions of the cleaned data
# (row count, then column count, for a two-dimensional table).
cat('Dataset shape:', nrow(df_clean), ncol(df_clean), '\n')

## Descriptive Statistics

In [None]:
# Calculate descriptive statistics
# Calls the project helper calculate_descriptive_stats() on the cleaned data
# and keeps whatever it returns in `stats`.
# NOTE(review): the helper is presumably defined in the sourced
# descriptive_stats.R; its return structure and whether it prints anything
# are not visible from this notebook -- confirm before relying on `stats`.
stats <- calculate_descriptive_stats(df_clean)

## Inferential Statistics

In [None]:
# Inferential statistics for Incidence_Rate.
# Missing values are dropped once, up front, so that the normality test, the
# standard error, and the t quantile all use the same observation count.
# (Counting NA rows in n while mean()/sd() ignore them would understate the
# standard error and overstate the degrees of freedom.)
rates <- df_clean$Incidence_Rate
rates <- rates[!is.na(rates)]
n_rates <- length(rates)

# Normality tests
# shapiro.test() accepts at most 5000 values, so a random subsample is tested
# when there are more. Indexing through sample.int() avoids sample()'s special
# handling of a length-one numeric input. The draw is random: call set.seed()
# beforehand if the output must be reproducible.
shapiro_test <- shapiro.test(rates[sample.int(n_rates, min(5000, n_rates))])
cat('Shapiro-Wilk Test:', shapiro_test$statistic, 'p-value:', shapiro_test$p.value, '\n')

# Confidence intervals
# t-based 95% interval: mean +/- t(0.975, n - 1) * sd / sqrt(n),
# where n is the number of non-missing observations.
mean_rate <- mean(rates)
se <- sd(rates) / sqrt(n_rates)
ci_95 <- mean_rate + c(-1, 1) * qt(0.975, n_rates - 1) * se
cat('95% CI for mean:', ci_95, '\n')

## Exploratory Visualizations

In [None]:
# Distribution plots
# Histogram of Incidence_Rate (50 bins) with a dashed red line at the mean.
# na.rm = TRUE matches the mean used for the confidence interval; without it
# a single missing value makes the mean NA and no reference line can be placed.
ggplot(df_clean, aes(x = Incidence_Rate)) +
  geom_histogram(bins = 50, fill = 'steelblue', alpha = 0.7) +
  geom_vline(
    aes(xintercept = mean(Incidence_Rate, na.rm = TRUE)),
    color = 'red', linetype = 'dashed'
  ) +
  labs(title = 'Distribution of Incidence Rate')

# Box plot of the same variable, drawn on the y axis.
ggplot(df_clean, aes(y = Incidence_Rate)) +
  geom_boxplot(fill = 'steelblue', alpha = 0.7) +
  labs(title = 'Box Plot of Incidence Rate')