# Create Staging Tables for Analysis

This notebook creates optimized staging tables from the enriched master dataset.

**What this does:**
- Reads the enriched master Parquet file (created by `create_enriched_master.ipynb`)
- Creates specialized staging tables for each analysis
- Outputs as **Parquet files** to `../data/dta/`

**Prerequisites:** Run `create_enriched_master.ipynb` first to create the enriched master file.

**Expected runtime:** 5-15 minutes depending on your machine.

In [None]:
import duckdb
import os

# Open a fresh in-memory DuckDB session; the staging queries in the cells
# below all execute through this `con` handle.
con = duckdb.connect(":memory:")
print("✓ Connected to DuckDB")

## Configuration

In [None]:
# Fiscal-year window applied by every staging query (both ends inclusive,
# since the queries filter with `Year BETWEEN YEAR_MIN AND YEAR_MAX`).
YEAR_MIN, YEAR_MAX = 2010, 2022

# Input: the enriched master Parquet file. Output: folder for staging tables.
ENRICHED_MASTER_PATH = '../data/sevis_f1_enriched_master.parquet'
DTA_OUTPUT_DIR = '../data/dta'

# Ensure the output folder exists; harmless if it is already there.
os.makedirs(DTA_OUTPUT_DIR, exist_ok=True)

# Echo the effective settings so a reader can confirm them at a glance.
config_report = [
    "✓ Configuration loaded",
    f"  Input: {ENRICHED_MASTER_PATH}",
    f"  Output: {DTA_OUTPUT_DIR}/",
    f"  Year range: {YEAR_MIN}-{YEAR_MAX}",
]
print("\n".join(config_report))

## 1. Graduate Cohort OPT Table

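Creates one row per graduate (`SEVIS_ID`) and fiscal year, with a `used_opt` flag indicating whether a post-completion or STEM OPT authorization started within 180 days (before or after) of the program end date.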

In [None]:
print("Creating grad_cohort_opt_plot1.parquet...")

# Builds grad_cohort_opt_plot1.parquet: one row per (fiscal_year, SEVIS_ID)
# for students whose program end date falls inside that fiscal year.
#
# Query outline:
#   base                - rows of the enriched master with Year in
#                         [YEAR_MIN, YEAR_MAX]; derives the fiscal-year window
#                         (Oct 1 of Year-1 through Sep 30 of Year), lower-cases
#                         and trims the employment fields, and TRY_CASTs the
#                         date columns to DATE.
#   person_year_summary - collapses base to one row per fiscal_year/SEVIS_ID.
#                         is_graduate = 1 when any row has a program end date
#                         inside the fiscal-year window.
#                         used_opt = 1 when any such row is also an 'opt' record
#                         of type 'post-completion' or 'stem' whose
#                         authorization starts after the program start date and
#                         within 180 days (either direction, because of ABS) of
#                         the program end date.
#   final SELECT        - keeps only is_graduate = 1, so that column is always
#                         1 in the output file.
#
# NOTE(review): descriptive columns are picked with ANY_VALUE, so if a student
# has differing values within one fiscal year the chosen value is arbitrary.
# NOTE(review): LMA_NAME is CAMPUS_LMA taken from an 'opt' row (not an employer
# LMA) - confirm this is the intended source column.
# NOTE(review): the TRY_CASTs inside used_opt re-cast auth_start_date / psd,
# which base has already cast to DATE; they look redundant.
con.execute(f"""
    COPY (
      WITH base AS (
        SELECT
          Year AS fiscal_year,
          MAKE_DATE(Year - 1, 10, 1) AS fy_start,
          MAKE_DATE(Year, 9, 30) AS fy_end,
          Campus_State,
          CAMPUS_LMA,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          School_Name,
          LOWER(TRIM(Employment_Description)) AS emp_desc,
          LOWER(TRIM(Employment_OPT_Type)) AS opt_type,
          TRY_CAST(Program_End_Date AS DATE) AS ped,
          TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
          TRY_CAST(Program_Start_Date AS DATE) AS psd,
          SEVIS_ID
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      ),
      
      person_year_summary AS (
        SELECT
          fiscal_year,
          SEVIS_ID,
          ANY_VALUE(Campus_State) AS Campus_State,
          ANY_VALUE(CAMPUS_LMA) AS CAMPUS_LMA,
          ANY_VALUE(Student_Edu_Level_Desc) AS Student_Edu_Level_Desc,
          ANY_VALUE(IS_STEM) AS IS_STEM,
          ANY_VALUE(NSF_SUBJ_FIELD_BROAD) AS NSF_SUBJ_FIELD_BROAD,
          ANY_VALUE(Country_of_Birth) AS Country_of_Birth,
          ANY_VALUE(School_Name) AS School_Name,
          ANY_VALUE(CASE WHEN emp_desc = 'opt' THEN CAMPUS_LMA END) AS LMA_NAME,
          
          CASE WHEN COUNT(CASE WHEN ped IS NOT NULL AND ped BETWEEN fy_start AND fy_end THEN 1 END) > 0
               THEN 1 ELSE 0 END AS is_graduate,
          
          CASE WHEN COUNT(CASE WHEN ped IS NOT NULL
                                AND ped BETWEEN fy_start AND fy_end
                                AND emp_desc = 'opt'
                                AND opt_type IN ('post-completion','stem')
                                AND TRY_CAST(auth_start_date AS DATE) > TRY_CAST(psd AS DATE)
                                AND ABS(DATE_DIFF('day', auth_start_date, ped)) <= 180
                           THEN 1 END) > 0
               THEN 1 ELSE 0 END AS used_opt
        FROM base
        GROUP BY fiscal_year, SEVIS_ID, fy_start, fy_end
      )
      
      SELECT
        fiscal_year,
        Campus_State,
        CAMPUS_LMA,
        Student_Edu_Level_Desc,
        IS_STEM,
        NSF_SUBJ_FIELD_BROAD,
        Country_of_Birth,
        School_Name,
        LMA_NAME,
        SEVIS_ID,
        is_graduate,
        used_opt
      FROM person_year_summary
      WHERE is_graduate = 1
      ORDER BY fiscal_year, SEVIS_ID
    ) TO '{DTA_OUTPUT_DIR}/grad_cohort_opt_plot1.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: grad_cohort_opt_plot1.parquet")

## 2. OPT Magnitude Table

Creates a table showing all OPT authorizations active during each fiscal year.

In [None]:
print("Creating opt_magnitude_plot2.parquet...")

# Builds opt_magnitude_plot2.parquet: one row per enriched-master record that
# represents a post-completion or STEM OPT authorization overlapping the
# record's fiscal year (Oct 1 of Year-1 through Sep 30 of Year).
#
# A record is kept when:
#   * it is an 'opt' employment record of type 'stem' or 'post-completion',
#   * the authorization start date parses to a DATE,
#   * the authorization interval overlaps the fiscal year
#     (auth_end_date >= fy_start AND auth_start_date <= fy_end), and
#   * the authorization starts after the program start date.
#
# Fix: the employment fields are lower-cased and trimmed before comparison,
# as in the other staging queries in this notebook. Comparing the raw columns
# to lowercase literals would silently drop values that differ only in case
# or surrounding whitespace.
con.execute(f"""
    COPY (
      WITH prep AS (
        SELECT
          Year AS fiscal_year,
          MAKE_DATE(Year - 1, 10, 1) AS fy_start,
          MAKE_DATE(Year, 9, 30) AS fy_end,
          -- Normalized copies used only for filtering below.
          LOWER(TRIM(Employment_Description)) AS emp_desc,
          LOWER(TRIM(Employment_OPT_Type)) AS opt_type,
          Employer_State,
          EMPLOYER_LMA,
          Campus_State,
          CAMPUS_LMA,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          School_Name,
          SEVIS_ID,
          TRY_CAST(Program_Start_Date AS DATE) AS psd,
          TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
          TRY_CAST(Authorization_End_Date AS DATE) AS auth_end_date
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      )

      SELECT
        fiscal_year,
        Employer_State,
        EMPLOYER_LMA,
        Campus_State,
        CAMPUS_LMA,
        Student_Edu_Level_Desc,
        IS_STEM,
        NSF_SUBJ_FIELD_BROAD,
        Country_of_Birth,
        School_Name,
        SEVIS_ID
      FROM prep
      WHERE emp_desc = 'opt'
        AND opt_type IN ('stem', 'post-completion')
        AND auth_start_date IS NOT NULL
        -- Interval-overlap test against the fiscal year; rows with a NULL
        -- auth_end_date do not satisfy it and are excluded.
        AND auth_end_date >= fy_start
        AND auth_start_date <= fy_end
        AND auth_start_date > psd
    ) TO '{DTA_OUTPUT_DIR}/opt_magnitude_plot2.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: opt_magnitude_plot2.parquet")

## 3. Status Changes Table

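Creates a table counting graduates per fiscal year and cohort group, along with how many of them have a recorded status change request (`Requested_Status`) and the resulting share.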

In [None]:
print("Creating status_changes.parquet...")

# Builds status_changes.parquet: aggregated counts per fiscal year and cohort
# group (campus state/LMA, education level, STEM flag, NSF field, country of
# birth, school).
#
# Query outline:
#   eligible     - rows whose Program_End_Date (TRY_CAST to DATE) falls inside
#                  the row's fiscal year (Oct 1 of Year-1 through Sep 30 of
#                  Year) and whose Year lies in [YEAR_MIN, YEAR_MAX].
#   final SELECT - per group:
#                    completed_count     = distinct SEVIS_IDs,
#                    completed_req_count = distinct SEVIS_IDs having at least
#                                          one row with a non-NULL
#                                          Requested_Status,
#                    completed_req_frac  = the ratio of the two; NULLIF guards
#                                          the division against a zero
#                                          denominator.
#
# NOTE(review): Campus_State is only TRIMmed here, whereas the retention
# queries in this notebook also UPPER() it - confirm the state codes line up
# if these tables are compared downstream.
con.execute(f"""
    COPY (
      WITH eligible AS (
        SELECT
          Year AS fiscal_year,
          TRIM(Campus_State) AS Campus_State,
          CAMPUS_LMA,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          School_Name,
          Requested_Status,
          SEVIS_ID
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE TRY_CAST(Program_End_Date AS DATE)
              BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
          AND Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      )
      
      SELECT
        fiscal_year,
        Campus_State,
        CAMPUS_LMA,
        Student_Edu_Level_Desc,
        IS_STEM,
        NSF_SUBJ_FIELD_BROAD,
        Country_of_Birth,
        School_Name,
        COUNT(DISTINCT SEVIS_ID) AS completed_count,
        COUNT(DISTINCT CASE WHEN Requested_Status IS NOT NULL THEN SEVIS_ID END) AS completed_req_count,
        CAST(COUNT(DISTINCT CASE WHEN Requested_Status IS NOT NULL THEN SEVIS_ID END) AS DOUBLE) /
          NULLIF(COUNT(DISTINCT SEVIS_ID), 0) AS completed_req_frac
      FROM eligible
      GROUP BY fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc,
               IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name
      ORDER BY fiscal_year, Campus_State
    ) TO '{DTA_OUTPUT_DIR}/status_changes.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: status_changes.parquet")

## 4. Geographic Retention Table

Creates a table of graduate counts, OPT user counts, and the share of OPT users employed in the same state or the same LMA as their campus, by fiscal year and cohort group.

In [None]:
print("Creating geographic_retention.parquet...")

# Builds geographic_retention.parquet: per fiscal year and cohort group,
#   A_total_grads          distinct graduates,
#   B_opt_users            graduates with a qualifying OPT record,
#   C_opt_same_state       OPT users with an OPT employer in the campus state,
#   D_opt_same_lma         OPT users with an OPT employer in the campus LMA,
#   E_opt_rate             B / A,
#   F_state_retention_rate C / B,
#   G_lma_retention_rate   D / B.
#
# Query outline:
#   base       - enriched master rows with Year in [YEAR_MIN, YEAR_MAX], with
#                normalized state/LMA/employment fields and DATE-cast dates.
#   grads      - distinct graduate rows: program end date inside the fiscal
#                year and a non-NULL campus state.
#   grad_flags - grads plus three boolean flags computed with EXISTS lookups
#                against base (matched on SEVIS_ID only).
#   final      - conditional distinct counts over the flags.
#
# Fix: the flags replace three `LEFT JOIN ... USING (<all group columns>)`
# joins. Those joins compared nullable columns (CAMPUS_LMA, IS_STEM,
# NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name) with `=`, which never
# matches NULL, so any group with a NULL in one of those columns always
# reported 0 OPT users and 0 retention. Carrying the flags on the graduate row
# itself avoids the NULL-unsafe join; GROUP BY still keeps NULL keys together.
#
# NOTE(review): as in the earlier version of this query, the same-state and
# same-LMA lookups do not repeat the "authorization starts after program
# start" check used for is_opt_user - confirm whether they should.
con.execute(f"""
    COPY (
      WITH base AS (
        SELECT
          Year AS fiscal_year,
          MAKE_DATE(Year - 1, 10, 1) AS fy_start,
          MAKE_DATE(Year, 9, 30) AS fy_end,
          UPPER(TRIM(Campus_State)) AS Campus_State,
          TRIM(CAMPUS_LMA) AS CAMPUS_LMA,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          School_Name,
          SEVIS_ID,
          TRY_CAST(Program_End_Date AS DATE) AS ped,
          TRY_CAST(Program_Start_Date AS DATE) AS psd,
          TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
          LOWER(TRIM(Employment_Description)) AS emp_desc,
          LOWER(TRIM(Employment_OPT_Type)) AS opt_type,
          UPPER(TRIM(Employer_State)) AS Employer_State,
          TRIM(EMPLOYER_LMA) AS Employer_LMA
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      ),

      grads AS (
        SELECT DISTINCT
          fiscal_year, Campus_State, CAMPUS_LMA,
          Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth, School_Name, SEVIS_ID
        FROM base
        WHERE ped IS NOT NULL
          AND ped BETWEEN fy_start AND fy_end
          AND Campus_State IS NOT NULL
      ),

      grad_flags AS (
        SELECT
          g.*,

          -- Has at least one qualifying OPT record with a usable employer state.
          EXISTS (
            SELECT 1 FROM base b
            WHERE b.SEVIS_ID = g.SEVIS_ID
              AND b.emp_desc = 'opt'
              AND b.opt_type IN ('post-completion', 'stem')
              AND b.auth_start_date IS NOT NULL
              AND b.auth_start_date > b.psd
              AND b.Employer_State IS NOT NULL
              AND b.Employer_State <> ''   -- already trimmed in base
          ) AS is_opt_user,

          -- Has an OPT record whose employer state equals the campus state.
          EXISTS (
            SELECT 1 FROM base b
            WHERE b.SEVIS_ID = g.SEVIS_ID
              AND b.emp_desc = 'opt'
              AND b.opt_type IN ('post-completion', 'stem')
              AND b.Employer_State = g.Campus_State
          ) AS has_same_state_opt,

          -- Has an OPT record whose employer LMA equals the campus LMA.
          -- A NULL campus LMA can never match, so the flag is FALSE then.
          (g.CAMPUS_LMA IS NOT NULL AND EXISTS (
            SELECT 1 FROM base b
            WHERE b.SEVIS_ID = g.SEVIS_ID
              AND b.emp_desc = 'opt'
              AND b.opt_type IN ('post-completion', 'stem')
              AND b.Employer_LMA = g.CAMPUS_LMA
          )) AS has_same_lma_opt
        FROM grads g
      )

      SELECT
        fiscal_year, Campus_State, CAMPUS_LMA,
        Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD,
        Country_of_Birth, School_Name,
        COUNT(DISTINCT SEVIS_ID) AS A_total_grads,
        COUNT(DISTINCT CASE WHEN is_opt_user THEN SEVIS_ID END) AS B_opt_users,
        COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_state_opt THEN SEVIS_ID END) AS C_opt_same_state,
        COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_lma_opt THEN SEVIS_ID END) AS D_opt_same_lma,
        -- NULLIF turns a zero denominator into NULL instead of a division error.
        CAST(COUNT(DISTINCT CASE WHEN is_opt_user THEN SEVIS_ID END) AS DOUBLE)
          / NULLIF(COUNT(DISTINCT SEVIS_ID), 0) AS E_opt_rate,
        CAST(COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_state_opt THEN SEVIS_ID END) AS DOUBLE)
          / NULLIF(COUNT(DISTINCT CASE WHEN is_opt_user THEN SEVIS_ID END), 0) AS F_state_retention_rate,
        CAST(COUNT(DISTINCT CASE WHEN is_opt_user AND has_same_lma_opt THEN SEVIS_ID END) AS DOUBLE)
          / NULLIF(COUNT(DISTINCT CASE WHEN is_opt_user THEN SEVIS_ID END), 0) AS G_lma_retention_rate
      FROM grad_flags
      GROUP BY fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc,
               IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name
      ORDER BY fiscal_year, Campus_State
    ) TO '{DTA_OUTPUT_DIR}/geographic_retention.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: geographic_retention.parquet")

## 5. State Retention Table

Creates a table showing graduate flows from campus states to employer states.

In [None]:
print("Creating state_retention.parquet...")

# Builds state_retention.parquet: one distinct row per graduate cohort record
# and OPT employer location (Employer_State, Employer_LMA).
#
# Query outline:
#   grad_cohort  - distinct graduate rows: Program_End_Date inside the fiscal
#                  year, non-NULL campus state, Year in [YEAR_MIN, YEAR_MAX].
#   opt_rows_geo - OPT records ('post-completion' or 'stem') whose
#                  authorization start parses to a DATE and is after the
#                  program start, with a usable employer state (not NULL, not
#                  blank, not a placeholder such as 'n/a', 'none', 'unknown').
#                  Note this CTE is not restricted to the year range.
#   cohort_opt   - graduates having at least one row in opt_rows_geo.
#   final SELECT - joins those graduates to every matching OPT location.
#
# Fix: the placeholder regex previously ended in `|null|)`, i.e. it carried a
# stray empty alternative. Blank strings are already rejected by the
# `TRIM(Employer_State) <> ''` test, so results are unchanged; the pattern now
# matches the one used by the national state-retention query.
con.execute(f"""
    COPY (
      WITH grad_cohort AS (
        SELECT DISTINCT
          Year AS fiscal_year,
          UPPER(TRIM(Campus_State)) AS Campus_State,
          TRIM(CAMPUS_LMA) AS CAMPUS_LMA,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          School_Name,
          SEVIS_ID
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE TRY_CAST(Program_End_Date AS DATE)
              BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
          AND Campus_State IS NOT NULL
          AND Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      ),

      opt_rows_geo AS (
        SELECT
          SEVIS_ID,
          TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
          UPPER(TRIM(Employer_State)) AS Employer_State,
          TRIM(EMPLOYER_LMA) AS Employer_LMA
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE TRY_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
          AND LOWER(TRIM(Employment_Description)) = 'opt'
          AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion','stem')
          AND TRY_CAST(Authorization_Start_Date AS DATE) > TRY_CAST(Program_Start_Date AS DATE)
          AND Employer_State IS NOT NULL
          AND TRIM(Employer_State) <> ''
          AND NOT REGEXP_MATCHES(LOWER(TRIM(Employer_State)), '^(n/?a|none|unknown|not applicable|null)$')
      ),

      cohort_opt AS (
        SELECT DISTINCT g.*
        FROM grad_cohort g
        WHERE EXISTS (
          SELECT 1 FROM opt_rows_geo o WHERE o.SEVIS_ID = g.SEVIS_ID
        )
      )

      SELECT DISTINCT
        g.fiscal_year,
        g.Campus_State,
        g.CAMPUS_LMA,
        g.Student_Edu_Level_Desc,
        g.IS_STEM,
        g.NSF_SUBJ_FIELD_BROAD,
        g.Country_of_Birth,
        g.School_Name,
        g.SEVIS_ID,
        o.Employer_State,
        o.Employer_LMA
      FROM cohort_opt g
      JOIN opt_rows_geo o ON o.SEVIS_ID = g.SEVIS_ID
    ) TO '{DTA_OUTPUT_DIR}/state_retention.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: state_retention.parquet")

## 6. State Retention Nationally Table

Creates a table showing state-level retention rates across all US states.

In [None]:
print("Creating state_retention_nationally.parquet...")

# Builds state_retention_nationally.parquet: per fiscal year, campus state and
# cohort group (education level, STEM flag, NSF field, country of birth),
#   num_opt_users_with_valid_state  graduates with a qualifying OPT record
#                                   that has a usable employer state,
#   num_worked_in_campus_state      those with an OPT employer in the campus
#                                   state,
#   state_retention_among_opt_users_with_valid_state  the ratio of the two.
#
# Query outline:
#   grad_cohort_all       - distinct graduate rows (program end date inside
#                           the fiscal year, non-NULL campus state).
#   opt_rows_geo          - qualifying OPT records with a usable employer
#                           state (placeholder values filtered by regex).
#   grad_cohort_opt_state - graduates with at least one row in opt_rows_geo.
#   denom / numer_same_state - distinct-student counts per group.
#
# Fix: groups with no same-state worker are absent from numer_same_state, so
# the LEFT JOIN yields NULL for them. The count was already COALESCEd to 0 but
# the rate divided the raw NULL, giving a NULL rate next to a count of 0. The
# numerator is now COALESCEd in the rate too, so such groups report 0.0.
con.execute(f"""
    COPY (
      WITH grad_cohort_all AS (
        SELECT DISTINCT
          Year AS fiscal_year,
          UPPER(TRIM(Campus_State)) AS Campus_State,
          Student_Edu_Level_Desc,
          IS_STEM,
          NSF_SUBJ_FIELD_BROAD,
          Country_of_Birth,
          SEVIS_ID
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE TRY_CAST(Program_End_Date AS DATE)
              BETWEEN MAKE_DATE(Year - 1, 10, 1) AND MAKE_DATE(Year, 9, 30)
          AND Campus_State IS NOT NULL
          AND Year BETWEEN {YEAR_MIN} AND {YEAR_MAX}
      ),

      opt_rows_geo AS (
        SELECT
          SEVIS_ID,
          TRY_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
          UPPER(TRIM(Employer_State)) AS employer_state
        FROM read_parquet('{ENRICHED_MASTER_PATH}')
        WHERE TRY_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
          AND LOWER(TRIM(Employment_Description)) = 'opt'
          AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion','stem')
          AND TRY_CAST(Authorization_Start_Date AS DATE) > TRY_CAST(Program_Start_Date AS DATE)
          AND Employer_State IS NOT NULL
          AND TRIM(Employer_State) <> ''
          AND NOT REGEXP_MATCHES(LOWER(TRIM(Employer_State)), '^(n/?a|none|unknown|not applicable|null)$')
      ),

      grad_cohort_opt_state AS (
        SELECT DISTINCT
          g.fiscal_year, g.Campus_State,
          g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth,
          g.SEVIS_ID
        FROM grad_cohort_all g
        WHERE EXISTS (
          SELECT 1 FROM opt_rows_geo o WHERE o.SEVIS_ID = g.SEVIS_ID
        )
      ),

      denom AS (
        SELECT
          fiscal_year, Campus_State,
          Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth,
          COUNT(DISTINCT SEVIS_ID) AS num_opt_users_with_valid_state
        FROM grad_cohort_opt_state
        GROUP BY fiscal_year, Campus_State, Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth
      ),

      numer_same_state AS (
        SELECT
          g.fiscal_year, g.Campus_State,
          g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth,
          COUNT(DISTINCT g.SEVIS_ID) AS num_worked_in_campus_state
        FROM grad_cohort_opt_state g
        WHERE EXISTS (
          SELECT 1
          FROM opt_rows_geo o
          WHERE o.SEVIS_ID = g.SEVIS_ID
            AND o.employer_state = g.Campus_State
        )
        GROUP BY g.fiscal_year, g.Campus_State, g.Student_Edu_Level_Desc, g.IS_STEM, g.NSF_SUBJ_FIELD_BROAD, g.Country_of_Birth
      )

      SELECT
        d.fiscal_year,
        d.Campus_State,
        d.Student_Edu_Level_Desc,
        d.IS_STEM,
        d.NSF_SUBJ_FIELD_BROAD,
        d.Country_of_Birth,
        d.num_opt_users_with_valid_state,
        COALESCE(n.num_worked_in_campus_state, 0) AS num_worked_in_campus_state,
        -- COALESCE keeps the rate consistent with the count column above:
        -- no same-state workers means 0.0, not NULL.
        CAST(COALESCE(n.num_worked_in_campus_state, 0) AS DOUBLE) / NULLIF(d.num_opt_users_with_valid_state, 0)
          AS state_retention_among_opt_users_with_valid_state
      FROM denom d
      LEFT JOIN numer_same_state n USING (
        fiscal_year, Campus_State,
        Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth
      )
    ) TO '{DTA_OUTPUT_DIR}/state_retention_nationally.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

print(f"✓ Created: state_retention_nationally.parquet")

## Summary

In [None]:
# Final report: list every expected staging table with its size on disk, and
# call out any that are missing instead of silently skipping them.
print("\n" + "="*60)
print("STAGING TABLES CREATION COMPLETE")
print("="*60)
print(f"\nOutput directory: {os.path.abspath(DTA_OUTPUT_DIR)}")
print("\nCreated tables:")

# File names written by the COPY statements in sections 1-6.
tables = [
    'grad_cohort_opt_plot1.parquet',
    'opt_magnitude_plot2.parquet',
    'status_changes.parquet',
    'geographic_retention.parquet',
    'state_retention.parquet',
    'state_retention_nationally.parquet'
]

missing_tables = []
for table in tables:
    path = os.path.join(DTA_OUTPUT_DIR, table)
    if os.path.exists(path):
        size = os.path.getsize(path) / (1024**2)  # MB
        print(f"  ✓ {table:40s} {size:>10.2f} MB")
    else:
        # A skipped or failed cell leaves no file; make that visible here.
        missing_tables.append(table)
        print(f"  ✗ {table:40s} {'MISSING':>10s}")

if missing_tables:
    print(f"\n⚠ {len(missing_tables)} staging table(s) not found - "
          "re-run the corresponding cells above before running the analysis notebooks.")
else:
    print("\n✓ You can now run the analysis notebooks!")
    print("✓ Each notebook will read from the staging tables you just created.")

In [None]:
# Close connection
# `con` is the DuckDB connection opened in the first code cell. Run this cell
# last: the staging cells above call `con.execute`, so they need `con` to
# still be open.
con.close()
print("\n✓ DuckDB connection closed")