# Create Staging Tables for Analysis

This notebook creates optimized staging tables from the enriched master dataset.

**What this does:**
- Reads the enriched master Parquet file (created by `create_enriched_master.ipynb`)
- Creates specialized staging tables for each analysis
- Outputs as **Parquet files** (small, fast) by default
- Optionally outputs as **CSV files** (large, but compatible with Excel/plotting software)

**Prerequisites:** Run `create_enriched_master.ipynb` first to create the enriched master file.

**Expected runtime:** 3-10 minutes depending on your machine.

## Configuration

In [None]:
# ===== USER CONFIGURATION =====
# Edit the values in this cell, then run it. The cell prints the active
# settings and stops early if the enriched master file cannot be found.

# Set to TRUE if you also want CSV versions (larger files, slower, but more compatible)
CREATE_CSV_FILES <- FALSE  # Change to TRUE if you want CSV versions

# Data paths
ENRICHED_MASTER_PATH <- '../data/sevis_f1_enriched_master.parquet'  # Enriched master file
STAGING_OUTPUT_DIR <- '../data/staging'  # Where to save staging tables

# Years to include (recommended: 2010-2022 for reliable data)
# This text is pasted verbatim into the WHERE clause of the staging queries.
YEAR_FILTER <- "Year BETWEEN 2010 AND 2022"  # Change if you want different years

# Fail fast on a mistyped flag (e.g. NA or "TRUE") instead of failing later
# inside an `if` while a table is being written.
stopifnot(
  is.logical(CREATE_CSV_FILES),
  length(CREATE_CSV_FILES) == 1L,
  !is.na(CREATE_CSV_FILES)
)

cat("Configuration:\n")
cat(sprintf("  Enriched master: %s\n", ENRICHED_MASTER_PATH))
cat(sprintf("  Output directory: %s\n", STAGING_OUTPUT_DIR))
cat("  Create Parquet: YES (default)\n")
# Plain if/else: the flag is a single value, so vectorised ifelse() is not needed
cat(sprintf("  Create CSV: %s\n", if (CREATE_CSV_FILES) "YES" else "NO"))
cat(sprintf("  Year filter: %s\n", YEAR_FILTER))

# Check that enriched master exists
if (!file.exists(ENRICHED_MASTER_PATH)) {
  stop(
    sprintf(
      "Enriched master file not found: %s\nPlease run create_enriched_master.ipynb first.",
      ENRICHED_MASTER_PATH
    ),
    call. = FALSE
  )
}
cat("\n✓ Enriched master file found\n")

## Setup

In [None]:
library(duckdb)
library(DBI)

# Make sure the output folder exists before any table is written;
# `recursive = TRUE` also creates missing parent folders.
if (!dir.exists(STAGING_OUTPUT_DIR)) dir.create(STAGING_OUTPUT_DIR, recursive = TRUE)

# Open the DuckDB connection used by every query in this notebook
con <- DBI::dbConnect(duckdb::duckdb())

# Report the resolved (absolute) output location and confirm the connection
staging_dir_msg <- sprintf("✓ Staging directory: %s\n", normalizePath(STAGING_OUTPUT_DIR))
cat(staging_dir_msg, "✓ DuckDB connection established\n", sep = "")
rm(staging_dir_msg)

## Helper Function

In [None]:
# Format a byte count as a human-readable string such as "1.5 KB".
#
# The size is divided by 1024 until it drops below 1024 or the largest unit
# (TB) is reached, then printed with one decimal place.
#
# Args:
#   size_bytes: Numeric vector of sizes in bytes. NA entries (for example the
#               size `file.info()` reports for a missing file) are allowed.
#
# Returns:
#   A character vector the same length as `size_bytes`; NA inputs give
#   NA_character_ instead of raising an error.
format_file_size <- function(size_bytes) {
  units <- c("B", "KB", "MB", "GB", "TB")

  format_one <- function(size) {
    if (is.na(size)) {
      return(NA_character_)
    }
    unit_index <- 1L
    # Stop at the last unit so very large values stay expressed in TB
    while (size >= 1024 && unit_index < length(units)) {
      size <- size / 1024
      unit_index <- unit_index + 1L
    }
    sprintf("%.1f %s", size, units[unit_index])
  }

  vapply(as.numeric(size_bytes), format_one, character(1))
}

# Run `query` in DuckDB and save its result as a staging table on disk.
#
# Writes <STAGING_OUTPUT_DIR>/<table_name>.parquet (ZSTD-compressed) and, when
# CREATE_CSV_FILES is TRUE, a CSV copy produced from that Parquet file.
# Progress, file sizes and timings are printed along the way.
#
# Relies on objects defined in earlier cells: `con` (the DuckDB connection),
# STAGING_OUTPUT_DIR and CREATE_CSV_FILES.
#
# Args:
#   table_name:  Base file name (without extension) for the output files.
#   query:       SQL SELECT statement producing the table's rows.
#   description: One-line purpose; only printed in the banner.
#
# Returns:
#   A list with `table`, `rows` (row count read back from the Parquet file),
#   `parquet_size` and `csv_size` in bytes. `csv_size` is NULL when no CSV
#   was written.
create_staging_table <- function(table_name, query, description) {
  # Double any single quote so a path containing ' cannot end the SQL literal
  sql_quote <- function(path) gsub("'", "''", path, fixed = TRUE)
  banner <- strrep("=", 60)

  cat(sprintf("\n%s\n", banner))
  cat(sprintf("Creating: %s\n", table_name))
  cat(sprintf("Purpose: %s\n", description))
  cat(sprintf("%s\n", banner))

  parquet_path <- file.path(STAGING_OUTPUT_DIR, paste0(table_name, ".parquet"))
  csv_path <- file.path(STAGING_OUTPUT_DIR, paste0(table_name, ".csv"))

  # Create Parquet file
  cat("\n[1/2] Creating Parquet file...\n")
  start_time <- Sys.time()

  dbExecute(con, sprintf("
    COPY (
      %s
    ) TO '%s' (FORMAT PARQUET, COMPRESSION ZSTD)
  ", query, sql_quote(parquet_path)))

  parquet_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  parquet_size <- file.info(parquet_path)$size

  cat(sprintf("  ✓ Parquet created: %s\n", basename(parquet_path)))
  cat(sprintf("  ✓ Size: %s\n", format_file_size(parquet_size)))
  cat(sprintf("  ✓ Time: %.1f seconds\n", parquet_time))

  # Optionally create CSV file (read back from the Parquet file just written,
  # so the query itself only runs once)
  csv_size <- NULL
  if (isTRUE(CREATE_CSV_FILES)) {
    cat("\n[2/2] Creating CSV file...\n")
    start_time <- Sys.time()

    dbExecute(con, sprintf("
      COPY (
        SELECT * FROM '%s'
      ) TO '%s' (HEADER, DELIMITER ',')
    ", sql_quote(parquet_path), sql_quote(csv_path)))

    csv_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
    csv_size <- file.info(csv_path)$size

    cat(sprintf("  ✓ CSV created: %s\n", basename(csv_path)))
    cat(sprintf("  ✓ Size: %s\n", format_file_size(csv_size)))
    cat(sprintf("  ✓ Time: %.1f seconds\n", csv_time))
    cat(sprintf("  ℹ Size comparison: CSV is %.1fx larger than Parquet\n", csv_size / parquet_size))
  } else {
    cat("\n[2/2] Skipping CSV creation (CREATE_CSV_FILES = FALSE)\n")
    cat("  ℹ To create CSV files, set CREATE_CSV_FILES = TRUE at the top of this notebook\n")
  }

  # Get row count
  row_count <- dbGetQuery(
    con,
    sprintf("SELECT COUNT(*) as count FROM '%s'", sql_quote(parquet_path))
  )$count
  # scientific = FALSE keeps round counts readable ("1,000,000", not "1e+06")
  cat(sprintf(
    "\n✓ %s complete: %s rows\n",
    table_name,
    format(row_count, big.mark = ",", scientific = FALSE)
  ))

  list(
    table = table_name,
    rows = row_count,
    parquet_size = parquet_size,
    csv_size = csv_size
  )
}

## 1. Graduate Cohort OPT Analysis Staging Table

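Creates a staging table with one row per graduating student per fiscal year (students who did not graduate in that year are filtered out), showing:
- The `is_graduate` flag (always 1 in this table, because only graduates are kept)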
- Whether they used OPT within 180 days of graduation
- Demographic and program information

In [None]:
# Build the SQL for the graduate-cohort staging table. ENRICHED_MASTER_PATH and
# YEAR_FILTER are substituted into the read_parquet() call and the WHERE clause.
#
# Query outline:
#   base                - selects the needed columns; derives the fiscal-year
#                         window (Oct 1 of Year - 1 through Sep 30 of Year),
#                         lower-cases/trims the employment text columns and
#                         casts the date columns with TRY_CAST.
#   person_year_summary - one row per (fiscal_year, SEVIS_ID). Descriptive
#                         columns are taken with ANY_VALUE, so the value kept
#                         is arbitrary if a person's rows disagree.
#                         is_graduate = 1 when any row has a program end date
#                         inside the fiscal-year window.
#                         used_opt = 1 when any such row is also an OPT row of
#                         type post-completion/stem whose authorization starts
#                         after the program start and lies within 180 days
#                         (before or after) of the program end date.
#   final SELECT        - keeps only rows with is_graduate = 1.
#
# NOTE(review): LMA_NAME is taken from CAMPUS_LMA on OPT rows, not from
# EMPLOYER_LMA - confirm this is intended.
query_grad_cohort <- sprintf("
WITH base AS (
  SELECT
    Year AS fiscal_year,
    MAKE_DATE(Year - 1, 10, 1) AS fy_start,
    MAKE_DATE(Year, 9, 30) AS fy_end,

    -- Demographics and program info
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,

    -- Employment columns
    LOWER(TRIM(Employment_Description)) AS emp_desc,
    LOWER(TRIM(Employment_OPT_Type)) AS opt_type,

    -- Date columns
    TRY_CAST(Program_End_Date AS DATE) AS ped,
    TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
    TRY_CAST(Program_Start_Date AS DATE) AS psd,

    SEVIS_ID
  FROM read_parquet('%s')
  WHERE %s
),

person_year_summary AS (
  SELECT
    fiscal_year,
    SEVIS_ID,

    -- Use ANY_VALUE to get demographic info
    ANY_VALUE(Campus_State) AS Campus_State,
    ANY_VALUE(CAMPUS_LMA) AS CAMPUS_LMA,
    ANY_VALUE(Student_Edu_Level_Desc) AS Student_Edu_Level_Desc,
    ANY_VALUE(IS_STEM) AS IS_STEM,
    ANY_VALUE(NSF_SUBJ_FIELD_BROAD) AS NSF_SUBJ_FIELD_BROAD,
    ANY_VALUE(Country_of_Birth) AS Country_of_Birth,
    ANY_VALUE(School_Name) AS School_Name,
    ANY_VALUE(CASE WHEN emp_desc = 'opt' THEN CAMPUS_LMA END) AS LMA_NAME,

    -- Graduation flag
    CASE WHEN COUNT(CASE WHEN ped IS NOT NULL AND ped BETWEEN fy_start AND fy_end THEN 1 END) > 0
         THEN 1 ELSE 0 END AS is_graduate,

    -- OPT flag
    CASE WHEN COUNT(CASE WHEN ped IS NOT NULL
                          AND ped BETWEEN fy_start AND fy_end
                          AND emp_desc = 'opt'
                          AND opt_type IN ('post-completion','stem')
                          AND TRY_CAST(auth_start_date AS DATE) > TRY_CAST(psd AS DATE)
                          AND ABS(DATE_DIFF('day', auth_start_date, ped)) <= 180
                     THEN 1 END) > 0
         THEN 1 ELSE 0 END AS used_opt
  FROM base
  GROUP BY fiscal_year, SEVIS_ID, fy_start, fy_end
)

SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  LMA_NAME,
  SEVIS_ID,
  is_graduate,
  used_opt
FROM person_year_summary
WHERE is_graduate = 1
ORDER BY fiscal_year, SEVIS_ID
", ENRICHED_MASTER_PATH, YEAR_FILTER)

# Run the query and write grad_cohort_opt.parquet (plus a CSV when enabled);
# result_1 holds the returned summary list (table name, row count, file sizes).
result_1 <- create_staging_table(
  table_name = 'grad_cohort_opt',
  query = query_grad_cohort,
  description = 'Graduates by fiscal year with OPT usage within 180 days'
)

## 2. OPT Magnitude Staging Table

Creates a staging table showing all OPT authorizations active during each fiscal year.

In [None]:
# Build the SQL for the OPT-magnitude staging table. ENRICHED_MASTER_PATH and
# YEAR_FILTER are substituted into the read_parquet() call and the WHERE clause.
#
# Query outline:
#   prep         - selects the needed columns, derives the fiscal-year window
#                  (Oct 1 of Year - 1 through Sep 30 of Year) and casts the
#                  date columns with TRY_CAST. The employment text columns are
#                  lower-cased and trimmed here, matching the other staging
#                  queries, so values such as 'OPT ' or 'STEM' are not
#                  silently dropped by the filter below.
#   final SELECT - keeps post-completion/STEM OPT rows whose authorization
#                  period overlaps the fiscal year and starts after the
#                  program start date.
query_opt_magnitude <- sprintf("
WITH prep AS (
  SELECT
    Year AS fiscal_year,
    MAKE_DATE(Year - 1, 10, 1) AS fy_start,
    MAKE_DATE(Year, 9, 30) AS fy_end,
    LOWER(TRIM(Employment_OPT_Type)) AS opt_type,
    Employer_State,
    EMPLOYER_LMA,
    LOWER(TRIM(Employment_Description)) AS emp_desc,
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID,
    TRY_CAST(Program_Start_Date AS DATE) AS psd,
    TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
    TRY_CAST(Authorization_End_Date AS DATE) AS auth_end_date
  FROM read_parquet('%s')
  WHERE %s
)
SELECT
  fiscal_year,
  Employer_State,
  EMPLOYER_LMA,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  SEVIS_ID
FROM prep
WHERE
      emp_desc = 'opt'
  AND opt_type IN ('stem', 'post-completion')
  AND auth_start_date IS NOT NULL
  AND auth_end_date >= fy_start
  AND auth_start_date <= fy_end
  AND auth_start_date > psd
", ENRICHED_MASTER_PATH, YEAR_FILTER)

# Run the query and write opt_magnitude.parquet (plus a CSV when enabled);
# result_2 holds the returned summary list (table name, row count, file sizes).
result_2 <- create_staging_table(
  table_name = 'opt_magnitude',
  query = query_opt_magnitude,
  description = 'All OPT authorizations active during each fiscal year'
)

## 3. Status Changes Staging Table

Creates a staging table that shows, for each fiscal year and demographic/program group, the number of graduates and how many of them requested a status change (as a count and as a fraction).

In [None]:
# Build the SQL for the status-changes staging table. ENRICHED_MASTER_PATH and
# YEAR_FILTER are substituted into the read_parquet() call and the WHERE clause.
#
# Query outline:
#   eligible     - rows whose program end date (TRY_CAST to DATE) falls inside
#                  the fiscal-year window (Oct 1 of Year - 1 through Sep 30 of
#                  Year) and that pass YEAR_FILTER. Campus_State is trimmed.
#   final SELECT - aggregates by fiscal year and the seven descriptive columns:
#                    completed_count     = distinct SEVIS_IDs in the group
#                    completed_req_count = distinct SEVIS_IDs having a
#                                          non-NULL Requested_Status
#                    completed_req_frac  = the ratio of the two (NULLIF guards
#                                          the division against a zero count)
query_status_changes <- sprintf("
WITH eligible AS (
  SELECT
    Year AS fiscal_year,
    TRIM(Campus_State) AS Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    Requested_Status,
    SEVIS_ID
  FROM read_parquet('%s')
  WHERE TRY_CAST(Program_End_Date AS DATE)
        BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
    AND %s
)
SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  COUNT(DISTINCT SEVIS_ID) AS completed_count,
  COUNT(DISTINCT CASE WHEN Requested_Status IS NOT NULL THEN SEVIS_ID END) AS completed_req_count,
  CAST(COUNT(DISTINCT CASE WHEN Requested_Status IS NOT NULL THEN SEVIS_ID END) AS DOUBLE) /
    NULLIF(COUNT(DISTINCT SEVIS_ID), 0) AS completed_req_frac
FROM eligible
GROUP BY
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name
ORDER BY
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name
", ENRICHED_MASTER_PATH, YEAR_FILTER)

# Run the query and write status_changes.parquet (plus a CSV when enabled);
# result_3 holds the returned summary list (table name, row count, file sizes).
result_3 <- create_staging_table(
  table_name = 'status_changes',
  query = query_status_changes,
  description = 'Graduates by fiscal year with status change requests'
)

## 4. Geographic Retention - Raw Metrics Staging Table

Creates foundational metrics (A-G) for geographic retention analysis.

In [None]:
# Build the SQL for the raw geographic-retention metrics (A-G).
# ENRICHED_MASTER_PATH and YEAR_FILTER are substituted into the read_parquet()
# call and the WHERE clause.
#
# Query outline:
#   base       - selects the needed columns, derives the fiscal-year window,
#                normalises state/LMA/employment text and casts the dates.
#   grads      - distinct graduates: program end date inside the fiscal-year
#                window and a non-NULL campus state.
#   grad_flags - one row per grads row with three boolean flags:
#                  is_opt_user        - has a post-completion/STEM OPT row with
#                                       an authorization starting after the
#                                       program start and a non-blank employer
#                                       state
#                  has_same_state_opt - has such an OPT row whose employer
#                                       state equals the campus state
#                  has_same_lma_opt   - has such an OPT row whose employer LMA
#                                       equals the campus LMA
#   agg        - distinct-person counts A-D per group.
#   final      - adds the rates E-G (NULL when the denominator is 0).
#
# The flags replace a chain of LEFT JOIN ... USING (all grouping columns).
# Those equality joins never match rows whose CAMPUS_LMA, NSF_SUBJ_FIELD_BROAD
# or another grouping column is NULL, so such groups were reported with zero
# OPT users even when they had some.
query_geo_retention_raw <- sprintf("
WITH base AS (
  SELECT
    Year AS fiscal_year,
    MAKE_DATE(Year - 1, 10, 1) AS fy_start,
    MAKE_DATE(Year, 9, 30) AS fy_end,
    UPPER(TRIM(Campus_State)) AS Campus_State,
    TRIM(CAMPUS_LMA) AS CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID,
    TRY_CAST(Program_End_Date AS DATE) AS ped,
    TRY_CAST(Program_Start_Date AS DATE) AS psd,
    TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
    LOWER(TRIM(Employment_Description)) AS emp_desc,
    LOWER(TRIM(Employment_OPT_Type)) AS opt_type,
    UPPER(TRIM(Employer_State)) AS Employer_State,
    TRIM(EMPLOYER_LMA) AS Employer_LMA
  FROM read_parquet('%s')
  WHERE %s
),

grads AS (
  SELECT DISTINCT
    fiscal_year, Campus_State, CAMPUS_LMA,
    Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth, School_Name, SEVIS_ID
  FROM base
  WHERE ped IS NOT NULL
    AND ped BETWEEN fy_start AND fy_end
    AND Campus_State IS NOT NULL
),

grad_flags AS (
  SELECT
    g.*,
    EXISTS (
      SELECT 1 FROM base b
      WHERE b.SEVIS_ID = g.SEVIS_ID
        AND b.emp_desc = 'opt'
        AND b.opt_type IN ('post-completion', 'stem')
        AND b.auth_start_date IS NOT NULL
        AND b.auth_start_date > b.psd
        AND b.Employer_State IS NOT NULL
        AND TRIM(b.Employer_State) <> ''
    ) AS is_opt_user,
    EXISTS (
      SELECT 1 FROM base b
      WHERE b.SEVIS_ID = g.SEVIS_ID
        AND b.emp_desc = 'opt'
        AND b.opt_type IN ('post-completion', 'stem')
        AND b.Employer_State = g.Campus_State
    ) AS has_same_state_opt,
    -- A NULL campus LMA never equals an employer LMA, so it yields FALSE
    EXISTS (
      SELECT 1 FROM base b
      WHERE b.SEVIS_ID = g.SEVIS_ID
        AND b.emp_desc = 'opt'
        AND b.opt_type IN ('post-completion', 'stem')
        AND b.Employer_LMA = g.CAMPUS_LMA
    ) AS has_same_lma_opt
  FROM grads g
),

agg AS (
  SELECT
    fiscal_year, Campus_State, CAMPUS_LMA,
    Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth, School_Name,
    COUNT(DISTINCT SEVIS_ID) AS A_total_grads,
    COUNT(DISTINCT CASE WHEN is_opt_user THEN SEVIS_ID END) AS B_opt_users,
    COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_state_opt THEN SEVIS_ID END)
      AS C_opt_same_state,
    COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_lma_opt THEN SEVIS_ID END)
      AS D_opt_same_lma
  FROM grad_flags
  GROUP BY fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc,
           IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name
)

SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  A_total_grads,
  B_opt_users,
  C_opt_same_state,
  D_opt_same_lma,
  CAST(B_opt_users AS DOUBLE) / NULLIF(A_total_grads, 0) AS E_opt_rate,
  CAST(C_opt_same_state AS DOUBLE) / NULLIF(B_opt_users, 0) AS F_state_retention_rate,
  CAST(D_opt_same_lma AS DOUBLE) / NULLIF(B_opt_users, 0) AS G_lma_retention_rate
FROM agg
ORDER BY fiscal_year, Campus_State, CAMPUS_LMA
", ENRICHED_MASTER_PATH, YEAR_FILTER)

# Run the query and write geographic_retention_raw_metrics.parquet (plus a CSV
# when enabled); result_4 holds the returned summary list.
result_4 <- create_staging_table(
  table_name = 'geographic_retention_raw_metrics',
  query = query_geo_retention_raw,
  description = 'Raw metrics (A-G) for geographic retention analysis'
)

## 5. Geographic Retention - Final Staging Table

Creates the final geographic retention table with computed retention rates at three levels.

In [None]:
# Build the SQL for the final geographic-retention table from the raw-metrics
# staging file written in the previous step.
#
# The input path is derived from STAGING_OUTPUT_DIR, so this step keeps working
# when the output directory is changed in the configuration cell. Single quotes
# in the path are doubled so they cannot end the SQL string literal.
#
# Query outline:
#   raw        - all rows of geographic_retention_raw_metrics.parquet.
#   aggregated - sums the A-D counts per fiscal year and descriptive columns.
#   final      - adds four ratios: opt_rate, state_retention, lma_retention and
#                national_retention (same-state OPT users over all graduates);
#                NULLIF turns a zero denominator into a NULL rate.
query_geo_retention_final <- sprintf("
WITH raw AS (
  SELECT * FROM '%s'
),

aggregated AS (
  SELECT
    fiscal_year,
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SUM(A_total_grads) AS total_grads,
    SUM(B_opt_users) AS opt_users,
    SUM(C_opt_same_state) AS opt_same_state,
    SUM(D_opt_same_lma) AS opt_same_lma
  FROM raw
  GROUP BY fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc,
           IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name
)

SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  total_grads,
  opt_users,
  opt_same_state,
  opt_same_lma,
  CAST(opt_users AS DOUBLE) / NULLIF(total_grads, 0) AS opt_rate,
  CAST(opt_same_state AS DOUBLE) / NULLIF(opt_users, 0) AS state_retention,
  CAST(opt_same_lma AS DOUBLE) / NULLIF(opt_users, 0) AS lma_retention,
  CAST(opt_same_state AS DOUBLE) / NULLIF(total_grads, 0) AS national_retention
FROM aggregated
ORDER BY fiscal_year, Campus_State, CAMPUS_LMA
", gsub(
  "'", "''",
  file.path(STAGING_OUTPUT_DIR, "geographic_retention_raw_metrics.parquet"),
  fixed = TRUE
))

# Run the query and write geographic_retention.parquet (plus a CSV when
# enabled); result_5 holds the returned summary list.
result_5 <- create_staging_table(
  table_name = 'geographic_retention',
  query = query_geo_retention_final,
  description = 'Geographic retention rates at national, state, and LMA levels'
)

## 6. State Retention - Flow Analysis Staging Table

Creates a staging table showing graduate flows from campus states to employer states.

In [None]:
# Build the SQL for the state-retention flow table. The enriched master file is
# read twice (graduate cohort and OPT rows), so ENRICHED_MASTER_PATH is passed
# to sprintf() twice; YEAR_FILTER is applied to the graduate cohort only.
#
# Query outline:
#   grad_cohort  - distinct graduates: program end date inside the fiscal-year
#                  window (Oct 1 of Year - 1 through Sep 30 of Year), non-NULL
#                  campus state, passing YEAR_FILTER.
#   opt_rows_geo - post-completion/STEM OPT rows with an authorization starting
#                  after the program start and a usable employer state (not
#                  blank and not a placeholder such as 'n/a' or 'unknown').
#   cohort_opt   - graduates having at least one such OPT row.
#   flows_person - distinct (graduate, employer state, employer LMA) rows.
#
# The placeholder pattern no longer ends in an empty alternative; blank
# employer states are already removed by the TRIM(...) <> '' test, so the
# result is unchanged.
query_state_retention <- sprintf("
WITH grad_cohort AS (
  SELECT DISTINCT
    Year AS fiscal_year,
    UPPER(TRIM(Campus_State)) AS Campus_State,
    TRIM(CAMPUS_LMA) AS CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID,
    TRY_CAST(Program_End_Date AS DATE) AS ped
  FROM read_parquet('%s')
  WHERE TRY_CAST(Program_End_Date AS DATE)
        BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
    AND Campus_State IS NOT NULL
    AND %s
),

opt_rows_geo AS (
  SELECT
    SEVIS_ID,
    TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
    UPPER(TRIM(Employer_State)) AS Employer_State,
    TRIM(EMPLOYER_LMA) AS Employer_LMA
  FROM read_parquet('%s')
  WHERE TRY_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
    AND LOWER(TRIM(Employment_Description)) = 'opt'
    AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion','stem')
    AND TRY_CAST(Authorization_Start_Date AS DATE) > TRY_CAST(Program_Start_Date AS DATE)
    AND Employer_State IS NOT NULL
    AND TRIM(Employer_State) <> ''
    AND NOT REGEXP_MATCHES(LOWER(TRIM(Employer_State)), '^(n/?a|none|unknown|not applicable|null)$')
),

cohort_opt AS (
  SELECT DISTINCT g.*
  FROM grad_cohort g
  WHERE EXISTS (
    SELECT 1 FROM opt_rows_geo o
    WHERE o.SEVIS_ID = g.SEVIS_ID
  )
),

flows_person AS (
  SELECT DISTINCT
    g.fiscal_year,
    g.Campus_State,
    g.CAMPUS_LMA,
    g.Student_Edu_Level_Desc,
    g.IS_STEM,
    g.NSF_SUBJ_FIELD_BROAD,
    g.Country_of_Birth,
    g.School_Name,
    g.SEVIS_ID,
    o.Employer_State,
    o.Employer_LMA
  FROM cohort_opt g
  JOIN opt_rows_geo o ON o.SEVIS_ID = g.SEVIS_ID
)

SELECT * FROM flows_person
", ENRICHED_MASTER_PATH, YEAR_FILTER, ENRICHED_MASTER_PATH)

# Run the query and write state_retention.parquet (plus a CSV when enabled);
# result_6 holds the returned summary list (table name, row count, file sizes).
result_6 <- create_staging_table(
  table_name = 'state_retention',
  query = query_state_retention,
  description = 'Graduate flows from campus states to employer states'
)

## 7. State Retention Nationally - Staging Table

Creates a staging table showing state-level retention rates across all US states.

In [None]:
# Build the SQL for the national state-retention table. The enriched master
# file is read twice (graduate cohort and OPT rows), so ENRICHED_MASTER_PATH is
# passed to sprintf() twice; YEAR_FILTER is applied to the graduate cohort only.
#
# Query outline:
#   grad_cohort_all       - distinct graduates: program end date inside the
#                           fiscal-year window, non-NULL campus state, passing
#                           YEAR_FILTER.
#   opt_rows_geo          - post-completion/STEM OPT rows with an authorization
#                           starting after the program start and a usable
#                           employer state (not blank, not a placeholder).
#   grad_cohort_opt_state - graduates having at least one such OPT row.
#   denom                 - distinct OPT users per group.
#   numer_same_state      - distinct OPT users per group with an OPT row in
#                           their campus state.
#   final SELECT          - denominator, numerator and their ratio per group.
#
# Groups with no same-state OPT user have no numerator row. Both the count and
# the rate coalesce that missing value to 0, so such a group reports a 0.0 rate
# rather than a NULL that downstream averages would skip.
query_state_retention_nationally <- sprintf("
WITH grad_cohort_all AS (
  SELECT DISTINCT
    Year AS fiscal_year,
    UPPER(TRIM(Campus_State)) AS Campus_State,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    SEVIS_ID
  FROM read_parquet('%s')
  WHERE TRY_CAST(Program_End_Date AS DATE)
        BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
    AND Campus_State IS NOT NULL
    AND %s
),

opt_rows_geo AS (
  SELECT
    SEVIS_ID,
    TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
    UPPER(TRIM(Employer_State)) AS employer_state
  FROM read_parquet('%s')
  WHERE TRY_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
    AND LOWER(TRIM(Employment_Description)) = 'opt'
    AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion','stem')
    AND TRY_CAST(Authorization_Start_Date AS DATE) > TRY_CAST(Program_Start_Date AS DATE)
    AND Employer_State IS NOT NULL
    AND TRIM(Employer_State) <> ''
    AND NOT REGEXP_MATCHES(LOWER(TRIM(Employer_State)), '^(n/?a|none|unknown|not applicable|null)$')
),

grad_cohort_opt_state AS (
  SELECT DISTINCT
    g.fiscal_year, g.Campus_State,
    g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth,
    g.SEVIS_ID
  FROM grad_cohort_all g
  WHERE EXISTS (
    SELECT 1 FROM opt_rows_geo o WHERE o.SEVIS_ID = g.SEVIS_ID
  )
),

denom AS (
  SELECT
    fiscal_year, Campus_State,
    Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth,
    COUNT(DISTINCT SEVIS_ID) AS num_opt_users_with_valid_state
  FROM grad_cohort_opt_state
  GROUP BY fiscal_year, Campus_State, Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth
),

numer_same_state AS (
  SELECT
    g.fiscal_year, g.Campus_State,
    g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth,
    COUNT(DISTINCT g.SEVIS_ID) AS num_worked_in_campus_state
  FROM grad_cohort_opt_state g
  WHERE EXISTS (
    SELECT 1
    FROM opt_rows_geo o
    WHERE o.SEVIS_ID = g.SEVIS_ID
      AND o.employer_state = g.Campus_State
  )
  GROUP BY g.fiscal_year, g.Campus_State, g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth
)

SELECT
  d.fiscal_year,
  d.Campus_State,
  d.Student_Edu_Level_Desc,
  d.IS_STEM,
  d.NSF_SUBJ_FIELD_BROAD,
  d.Country_of_Birth,
  d.num_opt_users_with_valid_state,
  COALESCE(n.num_worked_in_campus_state, 0) AS num_worked_in_campus_state,
  CAST(COALESCE(n.num_worked_in_campus_state, 0) AS DOUBLE)
    / NULLIF(d.num_opt_users_with_valid_state, 0)
    AS state_retention_among_opt_users_with_valid_state
FROM denom d
LEFT JOIN numer_same_state n USING (
  fiscal_year, Campus_State,
  Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth
)
", ENRICHED_MASTER_PATH, YEAR_FILTER, ENRICHED_MASTER_PATH)

# Run the query and write state_retention_nationally.parquet (plus a CSV when
# enabled); result_7 holds the returned summary list.
result_7 <- create_staging_table(
  table_name = 'state_retention_nationally',
  query = query_state_retention_nationally,
  description = 'State-level retention rates across all US states'
)

## 8. Subject Fields Analysis (IPEDS Data)

**Note:** The subject fields analysis requires IPEDS (Integrated Postsecondary Education Data System) data, which is a separate data source from SEVIS. 

This staging table is **not created here** because it requires:
- IPEDS completions data by institution and major
- Matching IPEDS institutions to SEVIS schools
- CIP code mappings to NSF fields

If you have IPEDS data available, you can create this staging table separately. The analysis in the `subject_fields.ipynb` notebook expects a table with columns:
- `ipeds_academic_year` (INT)
- `INSTITUTION_NAME` (STRING)
- `SEVIS_mapping` (STRING) - degree level mapping to SEVIS categories
- `NSF_SUBJ_FIELD_BROAD` (STRING)
- `NONRES_TOTAL` (INT) - non-resident degree completions
- `GRAND_TOTAL` (INT) - total degree completions

For more information on obtaining IPEDS data, see: https://nces.ed.gov/ipeds/

## Summary

In [None]:
# Closing banner: an 80-character rule above and below the headline
cat(sprintf("\n%s\n", strrep("=", 80)))
cat("STAGING TABLES CREATION COMPLETE\n")
cat(sprintf("%s\n", strrep("=", 80)))

# Show where the staging files were written, as an absolute path
cat(sprintf("\nOutput directory: %s\n", normalizePath(STAGING_OUTPUT_DIR)))

# Pointers to the follow-up notebooks
cat(
  "\n✓ You can now use the individual analysis notebooks!\n",
  "✓ Each notebook will read from the staging tables you just created.\n",
  sep = ""
)

# Close the DuckDB connection and shut the database instance down
DBI::dbDisconnect(con, shutdown = TRUE)

## Next Steps

1. **For analysis**: Open any of the `01_*`, `02_*`, etc. notebooks to analyze the data
2. **For custom queries**: You can query the staging tables directly:
   ```r
   library(duckdb)
   con <- dbConnect(duckdb::duckdb())
   result <- dbGetQuery(con, "
       SELECT * FROM '../data/staging/grad_cohort_opt.parquet'
       WHERE Campus_State = 'california'
   ")
   ```
3. **For CSV files**: If you need CSV versions for Excel or other software, re-run this notebook with `CREATE_CSV_FILES <- TRUE`