# R — Raw dataset exploratory analysis

This notebook reads the project's SPSS `.sav` dataset and performs quick exploratory analysis: dimensions, glimpse, summary, missingness, and a simple histogram using ggplot2.

Required packages: haven, dplyr, ggplot2, janitor

If a package is missing, install with:

install.packages(c('haven','dplyr','ggplot2','janitor'))


In [None]:
# Load libraries
library(haven)
library(dplyr)
library(ggplot2)
library(janitor)

# Where the raw SPSS file is expected to live, relative to the notebook
default_path <- "../6ix_Pac_Data_Stats520-Final-Project/data/raw/2025_ED_852_HERI_data.sav"
cat("Default dataset path:", default_path, "\n")

# Load the dataset when the path resolves; otherwise leave `df` as NULL so the
# rest of the notebook can detect that nothing was read.
path_found <- file.exists(default_path)
df <- if (!path_found) {
  cat("Default path not found. Please set `default_path` to the correct file path.\n")
  NULL
} else {
  loaded <- read_sav(default_path)
  cat("Loaded dataset successfully\n")
  loaded
}

# Quick EDA if loaded: dimensions, structure, summary statistics, the columns
# with the most missing values, and a histogram of the first numeric column.
# Runs only when `df` holds a data set (it is NULL when loading failed).
if (!is.null(df)) {
  cat('Dimensions: ', dim(df), '\n')
  # glimpse() prints its own output and returns `df` invisibly; wrapping it in
  # print() would dump the entire data set a second time.
  dplyr::glimpse(df)
  cat('\nSummary:\n')
  print(summary(df))
  cat('\nMissingness (count per column):\n')
  # Count NAs once, sort descending, and show at most the 20 worst columns.
  # head() stops at the last available column, whereas indexing with 1:20
  # pads the result with NA entries when there are fewer than 20 columns.
  na_counts <- colSums(is.na(df))
  print(utils::head(na_counts[order(-na_counts)], 20))

  # plot histogram of first numeric column
  # vapply() always yields a logical vector, even for a zero-column data frame
  # (sapply() would return an empty list there and break the subscript).
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  if (length(num_cols) > 0) {
    hist_col <- num_cols[1]
    # aes_string() is deprecated; the `.data` pronoun looks the column up by
    # name and also copes with non-syntactic column names. The axis label is
    # set explicitly so it shows the column name rather than the expression.
    p <- ggplot(as.data.frame(df), aes(x = .data[[hist_col]])) +
      geom_histogram(bins = 30, fill = 'steelblue', color = 'white') +
      theme_minimal() +
      labs(x = hist_col) +
      ggtitle(paste('Histogram of', hist_col))
    print(p)
  } else {
    cat('No numeric columns detected for plotting.\n')
  }
} else {
  cat('No data frame available. Notebook will stop here.\n')
}
