# Geographic Retention Analysis - Plot 5

This notebook creates a comprehensive geographic retention analysis system for F-1 international students transitioning to OPT employment. The analysis tracks three levels of retention: national (OPT participation), state-level, and metropolitan area-level.

## System Architecture

### **3-Step Process:**
1. **Raw Metrics Table**: Creates foundational A-F metrics with demographic breakdown
2. **Computed Lines Table**: Calculates the 3 retention rate lines with full demographics  
3. **Parameterized Query**: User-friendly filtering for specific populations

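### **Retention Rate Formulas:**
Counts are taken per fiscal-year graduation cohort: `D` = all graduates, `C` = graduates with any qualifying OPT record, `B` = OPT participants with a known employer state, `A` = OPT participants employed in their campus state, `F` = OPT participants with a known employer metro area (LMA), `E` = OPT participants employed in their campus LMA.

- **Line 1 (National)**: `C/D` = Share of graduates who do any OPT
- **Line 2 (State)**: `(A/B) × (C/D)` = Share of graduates retained in the campus state
- **Line 3 (Metro)**: `(E/F) × (C/D)` = Share of graduates retained in the campus metro area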

### **Key Features:**
- **Time-Aware Geographic Mapping**: ZIP → County FIPS → LMA using HUD crosswalk data
- **Demographic Filtering**: By country, STEM status, field of study, degree level
- **Data Quality Controls**: Excludes territories, handles missing data, filters broken dates
- **Person-Level Deduplication**: Uses MAX() aggregation to handle multiple OPT records

In [None]:
%%bigquery
-- Step 1 of 3: raw A-F retention counts.
-- Output grain: one row per (fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc,
-- IS_STEM, NSF_SUBJ_FIELD_BROAD, Country_of_Birth, School_Name).
-- No ORDER BY: a BigQuery table does not keep the row order of its defining query, so sorting
-- here would only add a global sort to the job.
CREATE OR REPLACE TABLE `sevis-beta.sevis_staging.geographic_retention_raw_metrics` AS
WITH
-- Campus states kept in the cohort: the 50 states plus DC, spelled in lower case because the
-- comparison below is an exact string match. Any other Campus_State value is excluded.
us_states AS (
  SELECT state FROM UNNEST([
    'alabama','alaska','arizona','arkansas','california','colorado','connecticut','delaware',
    'florida','georgia','hawaii','idaho','illinois','indiana','iowa','kansas','kentucky',
    'louisiana','maine','maryland','massachusetts','michigan','minnesota','mississippi',
    'missouri','montana','nebraska','nevada','new hampshire','new jersey','new mexico',
    'new york','north carolina','north dakota','ohio','oklahoma','oregon','pennsylvania',
    'rhode island','south carolina','south dakota','tennessee','texas','utah','vermont',
    'virginia','washington','west virginia','wisconsin','wyoming','district of columbia'
  ]) AS state
),

-- Source rows with typed dates and the fiscal-year window (Oct 1 of Year-1 through Sep 30 of Year).
-- Rows without a program end date, or with one that fails the cast or falls after 2030, are dropped.
base AS (
  SELECT
    `Year`                                      AS fiscal_year,
    DATE(`Year` - 1, 10, 1)                     AS fy_start,
    DATE(`Year`, 9, 30)                         AS fy_end,

    Campus_State,
    CAMPUS_LMA,
    Employer_State,
    EMPLOYER_LMA,

    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,

    Employment_Description,
    Employment_OPT_Type,

    SAFE_CAST(Program_End_Date         AS DATE) AS ped,
    SAFE_CAST(Program_Start_Date       AS DATE) AS psd,
    SAFE_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,

    SEVIS_ID
  FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master`
  WHERE Program_End_Date IS NOT NULL
    AND EXTRACT(YEAR FROM SAFE_CAST(Program_End_Date AS DATE)) <= 2030
),

-- Graduates whose program end date falls inside the row's fiscal year, on a US-state campus.
-- DISTINCT collapses the repeated source rows of one student to a single row per demographic
-- cell *before* the join, so the join below no longer multiplies grad rows by OPT rows.
graduation_cohorts AS (
  SELECT DISTINCT
    fiscal_year,
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID
  FROM base
  WHERE ped BETWEEN fy_start AND fy_end
    AND Campus_State IN (SELECT state FROM us_states)
),

-- Distinct employer locations of every qualifying OPT record (post-completion or STEM OPT whose
-- authorization starts after the program start date), for any student, in any year.
-- is_opt_record is a non-NULL marker so a LEFT JOIN match can be detected even when both
-- employer fields are NULL.
opt_participants AS (
  SELECT DISTINCT
    SEVIS_ID,
    Employer_State,
    EMPLOYER_LMA,
    TRUE AS is_opt_record
  FROM base
  WHERE Employment_Description = 'opt'
    AND Employment_OPT_Type IN ('post-completion', 'stem')
    AND auth_start_date IS NOT NULL
    AND auth_start_date > psd
),

-- One row per (graduate cell, OPT employer location) with 0/1 flags.
-- A comparison involving NULL yields NULL, which IF() treats as false, so the same_* flags are 0
-- whenever either side is missing.
-- NOTE(review): the utility query at the end of this notebook filters CAMPUS_LMA != 'missing data';
-- if that sentinel string also occurs in EMPLOYER_LMA / Employer_State it is counted as a valid
-- location here and can match a campus with the same sentinel -- confirm against the source data.
cohort_opt_joined AS (
  SELECT
    g.fiscal_year,
    g.Campus_State,
    g.CAMPUS_LMA,
    g.Student_Edu_Level_Desc,
    g.IS_STEM,
    g.NSF_SUBJ_FIELD_BROAD,
    g.Country_of_Birth,
    g.School_Name,
    g.SEVIS_ID,

    IF(o.is_opt_record  IS NOT NULL, 1, 0)      AS did_opt,
    IF(o.Employer_State IS NOT NULL, 1, 0)      AS has_employer_state,
    IF(o.EMPLOYER_LMA   IS NOT NULL, 1, 0)      AS has_employer_lma,
    IF(o.Employer_State = g.Campus_State, 1, 0) AS same_state,
    IF(o.EMPLOYER_LMA   = g.CAMPUS_LMA,   1, 0) AS same_lma
  FROM graduation_cohorts AS g
  LEFT JOIN opt_participants AS o USING (SEVIS_ID)
),

-- Collapse to one row per student within each cell: a flag is 1 if ANY of the student's OPT
-- records satisfies it.
person_level AS (
  SELECT
    fiscal_year,
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID,
    MAX(did_opt)            AS did_opt,
    MAX(has_employer_state) AS has_employer_state,
    MAX(has_employer_lma)   AS has_employer_lma,
    MAX(same_state)         AS same_state,
    MAX(same_lma)           AS same_lma
  FROM cohort_opt_joined
  GROUP BY
    fiscal_year,
    Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID
)

-- Distinct-student counts per cell. D = graduates, C = any OPT, B / F = OPT with a non-NULL
-- employer state / LMA, A / E = OPT in the campus state / LMA.
SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,
  COUNT(DISTINCT SEVIS_ID)                                                   AS D_total_grads,
  COUNT(DISTINCT IF(did_opt = 1, SEVIS_ID, NULL))                            AS C_opt_any,
  COUNT(DISTINCT IF(did_opt = 1 AND has_employer_state = 1, SEVIS_ID, NULL)) AS B_opt_emp_state_valid,
  COUNT(DISTINCT IF(did_opt = 1 AND has_employer_lma   = 1, SEVIS_ID, NULL)) AS F_opt_emp_lma_valid,
  COUNT(DISTINCT IF(same_state = 1, SEVIS_ID, NULL))                         AS A_same_state,
  COUNT(DISTINCT IF(same_lma   = 1, SEVIS_ID, NULL))                         AS E_same_lma
FROM person_level
GROUP BY
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name;
-- Deliberately no HAVING D_total_grads > 0: we may want to see when / if there are no grads
-- for a sub-population.


Query is running:   0%|          |

In [None]:
%%bigquery
-- Step 2 of 3: the three retention lines per demographic cell, plus the supporting counts.
-- Reads geographic_retention_raw_metrics directly: that table is already one row per
-- (fiscal_year, Campus_State, CAMPUS_LMA, Student_Edu_Level_Desc, IS_STEM, NSF_SUBJ_FIELD_BROAD,
-- Country_of_Birth, School_Name), so re-grouping by those same keys would sum single rows.
-- No ORDER BY: a BigQuery table does not keep the row order of its defining query.
CREATE OR REPLACE TABLE `sevis-beta.sevis_staging.geographic_retention_plot5` AS
SELECT
  fiscal_year,
  Campus_State,
  CAMPUS_LMA,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  School_Name,

  -- Line 1: C / D. SAFE_DIVIDE yields NULL (not an error) when D is 0.
  SAFE_DIVIDE(C_opt_any, D_total_grads) AS line1_retained_nationally,

  -- Line 2: (A / B) * (C / D); reported as 0 when no OPT participant has an employer state.
  CASE
    WHEN B_opt_emp_state_valid > 0 THEN
      SAFE_DIVIDE(A_same_state, B_opt_emp_state_valid) * SAFE_DIVIDE(C_opt_any, D_total_grads)
    ELSE 0
  END AS line2_retained_in_state,

  -- Line 3: (E / F) * (C / D); reported as 0 when no OPT participant has an employer LMA.
  CASE
    WHEN F_opt_emp_lma_valid > 0 THEN
      SAFE_DIVIDE(E_same_lma, F_opt_emp_lma_valid) * SAFE_DIVIDE(C_opt_any, D_total_grads)
    ELSE 0
  END AS line3_retained_in_lma,

  -- Supporting counts; the parameterized query re-aggregates these rather than the ratios.
  D_total_grads,
  C_opt_any,
  A_same_state,
  B_opt_emp_state_valid,
  E_same_lma,
  F_opt_emp_lma_valid,

  -- Optional diagnostics. opt_rate duplicates line 1; the *_among_opt_users ratios divide by all
  -- OPT participants (C) instead of only those with a known employer location (B / F).
  SAFE_DIVIDE(C_opt_any, D_total_grads) AS opt_rate,
  SAFE_DIVIDE(A_same_state, C_opt_any)  AS state_retention_among_opt_users,
  SAFE_DIVIDE(E_same_lma, C_opt_any)    AS lma_retention_among_opt_users
FROM `sevis-beta.sevis_staging.geographic_retention_raw_metrics`;
-- Deliberately no WHERE D_total_grads > 0: maybe we want to see when sub-populations don't have
-- any opt participants among their grads.


Query is running:   0%|          |

In [None]:
%%bigquery
-- Step 3 of 3: retention lines per fiscal year for a user-selected population.
-- Every filter is optional: leave a variable NULL to skip that filter.
DECLARE fys             ARRAY<INT64>;
DECLARE birth_countries ARRAY<STRING>;
DECLARE campus_states   ARRAY<STRING>;
DECLARE campus_lmas     ARRAY<STRING>;
DECLARE edu_levels      ARRAY<STRING>;
-- Must NOT be named is_stem: inside a query a column takes precedence over a script variable of
-- the same name, so `IS_STEM = is_stem` would compare the column with itself and never filter.
-- Assumes the IS_STEM column is BOOL -- TODO confirm against the table schema.
DECLARE stem_filter     BOOL;
DECLARE nsf_fields      ARRAY<STRING>;
DECLARE schools         ARRAY<STRING>;

SET fys             = [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022];
SET birth_countries = NULL;
SET campus_states   = NULL;  -- e.g. ['california']; values are lower-case state names
SET campus_lmas     = NULL;  -- e.g. ['Washington-Arlington-Alexandria, DC-VA-MD-WV Metropolitan Statistical Area']
-- Other CAMPUS_LMA values that can be used in campus_lmas:
--   'New York-Newark-Jersey City, NY-NJ Metropolitan Statistical Area',
--   'Los Angeles-Long Beach-Anaheim, CA Metropolitan Statistical Area',
--   'Chicago-Naperville-Elgin, IL-IN Metropolitan Statistical Area',
--   'San Francisco-Oakland-Fremont, CA Metropolitan Statistical Area',
--   'Washington-Arlington-Alexandria, DC-VA-MD-WV Metropolitan Statistical Area',
--   'Dallas-Fort Worth-Arlington, TX Metropolitan Statistical Area',
--   'Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Metropolitan Statistical Area',
--   'Miami-Fort Lauderdale-West Palm Beach, FL Metropolitan Statistical Area',
--   'Atlanta-Sandy Springs-Roswell, GA Metropolitan Statistical Area',
--   'San Jose-Sunnyvale-Santa Clara, CA Metropolitan Statistical Area',
--   'San Diego-Chula Vista-Carlsbad, CA Metropolitan Statistical Area',
--   'Chico, CA Metropolitan Statistical Area'
SET edu_levels      = NULL;  -- e.g. ['doctorate']
SET stem_filter     = NULL;  -- TRUE / FALSE to keep only STEM / non-STEM rows
SET nsf_fields      = NULL;  -- e.g. ['Engineering']
SET schools         = ['a f international school of languages inc'];  -- e.g. ['athenaeum of ohio'], ['loras college']

WITH
-- One row per requested fiscal year, so years with no matching data still appear in the output.
years AS (
  SELECT fiscal_year
  FROM UNNEST(
    CASE
      WHEN fys IS NOT NULL THEN fys
      ELSE GENERATE_ARRAY(2010, 2022)
    END
  ) AS fiscal_year
),

-- Sum the raw counts over every demographic cell that passes the filters.
-- Counts are summed (not the per-cell ratios) so the lines are computed on the pooled population.
agg AS (
  SELECT
    fiscal_year,
    SUM(A_same_state)          AS A_same_state,
    SUM(B_opt_emp_state_valid) AS B_opt_emp_state_valid,
    SUM(E_same_lma)            AS E_same_lma,
    SUM(F_opt_emp_lma_valid)   AS F_opt_emp_lma_valid,
    SUM(C_opt_any)             AS C_opt_any,
    SUM(D_total_grads)         AS D_total_grads
  FROM `sevis-beta.sevis_staging.geographic_retention_plot5`
  WHERE
        (fys             IS NULL OR fiscal_year            IN UNNEST(fys))
    AND (campus_states   IS NULL OR Campus_State           IN UNNEST(campus_states))
    AND (campus_lmas     IS NULL OR CAMPUS_LMA             IN UNNEST(campus_lmas))
    AND (edu_levels      IS NULL OR Student_Edu_Level_Desc IN UNNEST(edu_levels))
    AND (stem_filter     IS NULL OR IS_STEM                =  stem_filter)
    AND (birth_countries IS NULL OR Country_of_Birth       IN UNNEST(birth_countries))
    AND (nsf_fields      IS NULL OR NSF_SUBJ_FIELD_BROAD   IN UNNEST(nsf_fields))
    AND (schools         IS NULL OR School_Name            IN UNNEST(schools))
  GROUP BY fiscal_year
),

-- Years with no matching rows get zero counts instead of NULLs.
filled AS (
  SELECT
    y.fiscal_year,
    COALESCE(a.A_same_state,          0) AS A_same_state,
    COALESCE(a.B_opt_emp_state_valid, 0) AS B_opt_emp_state_valid,
    COALESCE(a.E_same_lma,            0) AS E_same_lma,
    COALESCE(a.F_opt_emp_lma_valid,   0) AS F_opt_emp_lma_valid,
    COALESCE(a.C_opt_any,             0) AS C_opt_any,
    COALESCE(a.D_total_grads,         0) AS D_total_grads
  FROM years AS y
  LEFT JOIN agg AS a USING (fiscal_year)
)

-- Line 1 is NULL when there are no graduates (D = 0). Lines 2 and 3 are forced to 0 whenever the
-- OPT rate C / D is NULL or 0; otherwise they are NULL if their own denominator (B or F) is 0.
SELECT
  fiscal_year,
  SAFE_DIVIDE(C_opt_any, D_total_grads) AS line1_retained_nationally,
  CASE
    WHEN IFNULL(SAFE_DIVIDE(C_opt_any, D_total_grads), 0) = 0 THEN 0
    ELSE SAFE_DIVIDE(A_same_state, B_opt_emp_state_valid)
         * SAFE_DIVIDE(C_opt_any, D_total_grads)
  END AS line2_retained_in_state,
  CASE
    WHEN IFNULL(SAFE_DIVIDE(C_opt_any, D_total_grads), 0) = 0 THEN 0
    ELSE SAFE_DIVIDE(E_same_lma, F_opt_emp_lma_valid)
         * SAFE_DIVIDE(C_opt_any, D_total_grads)
  END AS line3_retained_in_lma
FROM filled
ORDER BY fiscal_year;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fiscal_year,line1_retained_nationally,line2_retained_in_state,line3_retained_in_lma
0,2010,,0.0,0.0
1,2011,,0.0,0.0
2,2012,,0.0,0.0
3,2013,,0.0,0.0
4,2014,,0.0,0.0
5,2015,,0.0,0.0
6,2016,0.0,0.0,0.0
7,2017,0.0,0.0,0.0
8,2018,,0.0,0.0
9,2019,,0.0,0.0


In [None]:
# %%bigquery
# -- ================================================================================
# -- ADDITIONAL UTILITY QUERY: GET AVAILABLE LMAs FOR A STATE
# -- ================================================================================
# -- Use this query to see what LMA values are available for filtering
# -- Execute separately to explore available metro areas

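# -- Example: list every LMA with its graduate totals; to restrict to one state (e.g. California),
# -- add the commented-out Campus_State condition below to the WHERE clause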
# SELECT DISTINCT
#   CAMPUS_LMA,
#   COUNT(DISTINCT fiscal_year) as years_with_data,
#   SUM(D_total_grads) as total_graduates,
#   MIN(fiscal_year) as first_year,
#   MAX(fiscal_year) as last_year
# FROM `sevis-beta.sevis_staging.geographic_retention_plot5`
# #WHERE Campus_State = 'california'
# WHERE CAMPUS_LMA != 'missing data'
#   AND CAMPUS_LMA IS NOT NULL
# GROUP BY CAMPUS_LMA
# ORDER BY total_graduates DESC;