In [None]:
%%bigquery
-- Rebuilds the plot-7 staging table. Each output row is one distinct combination of
-- a fiscal-year graduate's attributes and an OPT employer location (state + LMA)
-- recorded for the same SEVIS_ID.
DROP TABLE IF EXISTS `sevis-beta.sevis_staging.state_retention_plot7`;
CREATE OR REPLACE TABLE `sevis-beta.sevis_staging.state_retention_plot7` AS
WITH
-- Graduates: records whose program end date falls inside the fiscal year
-- (Oct 1 of the prior calendar year through Sep 30 of `Year`) and that have a campus state.
graduates AS (
  SELECT DISTINCT
    src.`Year`                              AS fiscal_year,
    UPPER(TRIM(src.Campus_State))           AS Campus_State,
    TRIM(src.CAMPUS_LMA)                    AS CAMPUS_LMA,
    src.Student_Edu_Level_Desc,
    src.IS_STEM,
    src.NSF_SUBJ_FIELD_BROAD,
    src.Country_of_Birth,
    src.School_Name,
    src.SEVIS_ID,
    SAFE_CAST(src.Program_End_Date AS DATE) AS ped
  FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master` AS src
  WHERE src.Campus_State IS NOT NULL
    AND SAFE_CAST(src.Program_End_Date AS DATE) >= DATE(src.`Year` - 1, 10, 1)
    AND SAFE_CAST(src.Program_End_Date AS DATE) <= DATE(src.`Year`, 9, 30)
),
-- OPT employment rows (post-completion or STEM) with a parseable authorization start
-- date that is later than the program start date and a usable employer state.
opt_employment AS (
  SELECT
    src.SEVIS_ID,
    SAFE_CAST(src.Authorization_Start_Date AS DATE) AS auth_start_date,
    UPPER(TRIM(src.Employer_State))                 AS Employer_State,
    TRIM(src.EMPLOYER_LMA)                          AS Employer_LMA
  FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master` AS src
  WHERE LOWER(TRIM(src.Employment_Description)) = 'opt'
    AND LOWER(TRIM(src.Employment_OPT_Type)) IN ('post-completion','stem')
    AND SAFE_CAST(src.Authorization_Start_Date AS DATE) IS NOT NULL
    AND SAFE_CAST(src.Authorization_Start_Date AS DATE) > SAFE_CAST(src.Program_Start_Date AS DATE)
    AND src.Employer_State IS NOT NULL
    AND TRIM(src.Employer_State) <> ''
    -- Discard placeholder values that stand in for a missing state.
    AND NOT REGEXP_CONTAINS(LOWER(TRIM(src.Employer_State)),
                            r'^(?:n/?a|none|unknown|not applicable|null|)$')
),
-- Graduates that have at least one qualifying OPT employment row.
-- The timing window is currently disabled:
--   AND emp.auth_start_date BETWEEN grad.ped AND DATE_ADD(grad.ped, INTERVAL 36 MONTH)
opt_graduates AS (
  SELECT grad.*
  FROM graduates AS grad
  WHERE grad.SEVIS_ID IN (
    SELECT emp.SEVIS_ID
    FROM opt_employment AS emp
  )
)
-- Graduate x employer-location rows; DISTINCT collapses repeated OPT records
-- (and differing ped values, which are not carried into the output).
SELECT DISTINCT
  grad.fiscal_year,
  grad.Campus_State,
  grad.CAMPUS_LMA,
  grad.Student_Edu_Level_Desc,
  grad.IS_STEM,
  grad.NSF_SUBJ_FIELD_BROAD,
  grad.Country_of_Birth,
  grad.School_Name,
  grad.SEVIS_ID,
  emp.Employer_State,
  emp.Employer_LMA
FROM opt_graduates AS grad
INNER JOIN opt_employment AS emp
  ON emp.SEVIS_ID = grad.SEVIS_ID;
  -- (window disabled) AND emp.auth_start_date BETWEEN grad.ped AND DATE_ADD(grad.ped, INTERVAL 36 MONTH)

Query is running:   0%|          |

In [None]:
%%bigquery
-- Plot 7: for one study geography (a campus STATE or a campus LMA), report the share of
-- (fiscal year, student) pairs in the staging table that have an OPT record in each
-- employer state. The staging table only holds graduates with at least one OPT
-- employment row, so the denominator is "graduates who went on to OPT".
--
-- Output columns:
--   study_geography                   'STATE: <NAME>' or 'LMA: <NAME>' (upper-cased parameter)
--   employer_state                    destination employer state
--   total_grads_A                     pooled distinct (fiscal_year, SEVIS_ID) pairs in the study geo
--   grads_to_state_B                  pooled distinct (fiscal_year, SEVIS_ID) pairs seen in that employer state
--   share_B_over_A_to_employer_state  B / A (NULL if A is 0)
--
-- REQUIRE THAT ONE (AND ONLY ONE) OF EITHER STUDY LMA OR STUDY STATE BE SET --
DECLARE target_study_state STRING DEFAULT 'nebraska';
DECLARE target_study_lma STRING;

-- Optional filters: NULL means "do not filter on this dimension".
DECLARE fys ARRAY<INT64>;
DECLARE birth_countries    ARRAY<STRING>;
DECLARE edu_levels  ARRAY<STRING>;
DECLARE stem_param     BOOL;
DECLARE nsf_fields   ARRAY<STRING>;
DECLARE schools   ARRAY<STRING>;

-- Study state is compared case-insensitively. Candidate values kept from the original cell:
--   'alabama','alaska','arizona','arkansas','california','colorado','connecticut','delaware',
--   'district of columbia','florida','georgia','guam','hawaii','idaho','illinois','indiana',
--   'iowa','kansas','kentucky','louisiana','maine','maryland','massachusetts','michigan',
--   'minnesota','mississippi','missouri','montana','nebraska','nevada','new hampshire',
--   'new jersey','new mexico','new york','north carolina','north dakota',
--   'northern mariana islands','ohio','oklahoma','oregon','pennsylvania','puerto rico',
--   'rhode island','south carolina','south dakota','tennessee','texas','utah','vermont',
--   'virgin islands of the us','virginia','washington','west virginia','wisconsin','wyoming'
SET target_study_state = 'california';
SET target_study_lma = NULL;  -- e.g. 'Chico, CA Metropolitan Statistical Area'

SET fys    = [2019];  -- e.g. [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
SET birth_countries    = NULL;
SET edu_levels  = NULL;
SET stem_param     = NULL;
SET nsf_fields = NULL;
SET schools = NULL;

-- Validate: exactly one of state or LMA must be provided (XOR).
ASSERT ( (target_study_state IS NOT NULL) <> (target_study_lma IS NOT NULL) )
  AS 'Exactly one of target_study_state or target_study_lma must be set (and the other must be NULL).';

WITH base AS (
  -- Staging rows restricted to the study geography and the optional filters.
  SELECT
    fiscal_year,
    UPPER(Campus_State) AS Campus_State,
    CAMPUS_LMA,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    School_Name,
    SEVIS_ID,
    UPPER(Employer_State) AS Employer_State
  FROM `sevis-beta.sevis_staging.state_retention_plot7`
  WHERE
        ( (target_study_state IS NOT NULL AND LOWER(Campus_State) = LOWER(target_study_state))
       OR (target_study_lma   IS NOT NULL AND LOWER(CAMPUS_LMA)   = LOWER(target_study_lma)) )
    AND (fys             IS NULL OR fiscal_year             IN UNNEST(fys))
    AND (edu_levels      IS NULL OR Student_Edu_Level_Desc  IN UNNEST(edu_levels))
    AND (stem_param      IS NULL OR IS_STEM = stem_param)
    AND (nsf_fields      IS NULL OR NSF_SUBJ_FIELD_BROAD    IN UNNEST(nsf_fields))
    AND (birth_countries IS NULL OR Country_of_Birth        IN UNNEST(birth_countries))
    AND (schools         IS NULL OR School_Name             IN UNNEST(schools))
),

-- Pooled denominator across years: number of distinct (year, person) pairs in the study geo.
denom AS (
  SELECT COUNT(*) AS denom_total
  FROM ( SELECT DISTINCT fiscal_year, SEVIS_ID FROM base )
),

-- Distinct (year, person, employer_state) destinations. Campus_State is carried along
-- only so the optional same-state filter below can use it.
flows_pairs AS (
  SELECT DISTINCT fiscal_year, SEVIS_ID, Employer_State, Campus_State
  FROM base
  WHERE Employer_State IS NOT NULL
),

-- Optionally drop same-state destinations, then collapse back to one row per
-- (year, person, employer_state). The DISTINCT matters in LMA mode: an LMA can span
-- several states, so one person may appear under more than one Campus_State and would
-- otherwise be counted several times in the numerator but once in the denominator.
flows_pairs_filtered AS (
  SELECT DISTINCT fiscal_year, SEVIS_ID, Employer_State
  FROM flows_pairs
  -- WHERE Employer_State <> Campus_State   -- enable to exclude same-state flows
),

-- Pooled numerators across years: (year, person) pairs with OPT in each employer state.
flows_by_state AS (
  SELECT
    Employer_State AS employer_state,
    COUNT(*)       AS numer_total
  FROM flows_pairs_filtered
  GROUP BY Employer_State
)

SELECT
  CASE
    WHEN target_study_state IS NOT NULL THEN CONCAT('STATE: ', UPPER(target_study_state))
    ELSE CONCAT('LMA: ', UPPER(target_study_lma))
  END AS study_geography,
  f.employer_state,
  d.denom_total                             AS total_grads_A,    -- pooled across selected years (graduates who used OPT)
  f.numer_total                             AS grads_to_state_B, -- pooled across selected years
  -- SAFE_DIVIDE already yields NULL for a zero denominator, so no NULLIF guard is needed.
  SAFE_DIVIDE(f.numer_total, d.denom_total) AS share_B_over_A_to_employer_state
FROM flows_by_state AS f
CROSS JOIN denom AS d
ORDER BY share_B_over_A_to_employer_state DESC NULLS LAST, employer_state;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,study_geography,employer_state,total_grads_A,grads_to_state_B,share_B_over_A_to_employer_state
0,STATE: CALIFORNIA,CALIFORNIA,17763,14676,0.826212
1,STATE: CALIFORNIA,WASHINGTON,17763,1049,0.059055
2,STATE: CALIFORNIA,NEW YORK,17763,1034,0.058211
3,STATE: CALIFORNIA,TEXAS,17763,542,0.030513
4,STATE: CALIFORNIA,NEW JERSEY,17763,377,0.021224
5,STATE: CALIFORNIA,MASSACHUSETTS,17763,362,0.020379
6,STATE: CALIFORNIA,DELAWARE,17763,250,0.014074
7,STATE: CALIFORNIA,ILLINOIS,17763,221,0.012442
8,STATE: CALIFORNIA,VIRGINIA,17763,213,0.011991
9,STATE: CALIFORNIA,GEORGIA,17763,188,0.010584


# testing below here

In [None]:
# %%bigquery
# -- ── PARAMETERS ──────────────────────────────────────────────────────
# DECLARE fys             ARRAY<INT64>  DEFAULT [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022];
# DECLARE birth_countries ARRAY<STRING> DEFAULT NULL;
# DECLARE edu_levels      ARRAY<STRING> DEFAULT NULL;
# DECLARE stem_param         BOOL          DEFAULT NULL;
# DECLARE nsf_fields      ARRAY<STRING> DEFAULT NULL;
# DECLARE schools         ARRAY<STRING> DEFAULT NULL;

# -- Optional: restrict to a subset of study states (NULL = all)
# DECLARE study_states    ARRAY<STRING> DEFAULT NULL;  -- e.g., ['california','texas'] (lowercase); NULL => all states

# -- Toggles
# DECLARE include_same_state BOOL DEFAULT TRUE;  -- FALSE to drop same-state flows
# DECLARE opt_window_months  INT64 DEFAULT NULL;   -- NULL to disable timing window

# -- ── COHORTS ─────────────────────────────────────────────────────────
# WITH grad_cohort AS (
#   SELECT DISTINCT
#     `Year` AS fiscal_year,
#     UPPER(TRIM(Campus_State)) AS Campus_State,
#     TRIM(CAMPUS_LMA)          AS CAMPUS_LMA,
#     Student_Edu_Level_Desc,
#     IS_STEM,
#     NSF_SUBJ_FIELD_BROAD,
#     Country_of_Birth,
#     School_Name,
#     SEVIS_ID,
#     SAFE_CAST(Program_End_Date AS DATE) AS ped
#   FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master`
#   WHERE SAFE_CAST(Program_End_Date AS DATE)
#         BETWEEN DATE(`Year` - 1, 10, 1) AND DATE(`Year`, 9, 30)
#     AND Campus_State IS NOT NULL
# ),
# opt_rows_geo AS (
#   SELECT
#     SEVIS_ID,
#     SAFE_CAST(Authorization_Start_Date AS DATE) AS auth_start_date,
#     UPPER(TRIM(Employer_State)) AS Employer_State,
#     TRIM(EMPLOYER_LMA)          AS Employer_LMA
#   FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master`
#   WHERE SAFE_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
#     AND LOWER(TRIM(Employment_Description)) = 'opt'
#     AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion','stem')
#     AND Employer_State IS NOT NULL
#     AND TRIM(Employer_State) <> ''
#     AND NOT REGEXP_CONTAINS(LOWER(TRIM(Employer_State)),
#                             r'^(?:n/?a|none|unknown|not applicable|null)$')
# ),
# cohort_opt AS (
#   -- keep only grads who go on to use OPT (optionally within N months of PED)
#   SELECT DISTINCT g.*
#   FROM grad_cohort g
#   WHERE EXISTS (
#     SELECT 1
#     FROM opt_rows_geo o
#     WHERE o.SEVIS_ID = g.SEVIS_ID
#       AND (
#             opt_window_months IS NULL
#             OR (o.auth_start_date BETWEEN g.ped AND DATE_ADD(g.ped, INTERVAL opt_window_months MONTH))
#           )
#   )
# ),
# flows_person AS (
#   -- OPT user × destination rows (per year-person-destination)
#   SELECT DISTINCT
#     g.fiscal_year,
#     UPPER(g.Campus_State) AS Campus_State,
#     g.CAMPUS_LMA,
#     g.Student_Edu_Level_Desc,
#     g.IS_STEM,
#     g.NSF_SUBJ_FIELD_BROAD,
#     g.Country_of_Birth,
#     g.School_Name,
#     g.SEVIS_ID,
#     UPPER(o.Employer_State) AS Employer_State,
#     o.Employer_LMA
#   FROM cohort_opt g
#   JOIN opt_rows_geo o
#     ON o.SEVIS_ID = g.SEVIS_ID
#    AND (
#          opt_window_months IS NULL
#          OR (o.auth_start_date BETWEEN g.ped AND DATE_ADD(g.ped, INTERVAL opt_window_months MONTH))
#        )
# ),

# -- ── FILTERS (apply to ALL states at once) ───────────────────────────
# base AS (
#   SELECT
#     fiscal_year,
#     Campus_State,
#     CAMPUS_LMA,
#     Student_Edu_Level_Desc,
#     IS_STEM,
#     NSF_SUBJ_FIELD_BROAD,
#     Country_of_Birth,
#     School_Name,
#     SEVIS_ID,
#     Employer_State
#   FROM flows_person
#   WHERE (study_states IS NULL OR LOWER(Campus_State) IN UNNEST(study_states))
#     AND (fys             IS NULL OR fiscal_year             IN UNNEST(fys))
#     AND (edu_levels      IS NULL OR Student_Edu_Level_Desc  IN UNNEST(edu_levels))
#     AND (stem_param      IS NULL OR IS_STEM = stem_param)
#     AND (nsf_fields      IS NULL OR NSF_SUBJ_FIELD_BROAD    IN UNNEST(nsf_fields))
#     AND (birth_countries IS NULL OR Country_of_Birth        IN UNNEST(birth_countries))
#     AND (schools         IS NULL OR School_Name             IN UNNEST(schools))
# ),

# -- Denominator: per Campus_State (distinct year-person among OPT users)
# denom_by_state AS (
#   SELECT
#     Campus_State,
#     COUNT(*) AS denom_total
#   FROM (
#     SELECT DISTINCT Campus_State, fiscal_year, SEVIS_ID
#     FROM base
#   )
#   GROUP BY Campus_State
# ),

# -- Numerator: per (Campus_State × Employer_State)
# flows_pairs AS (
#   SELECT DISTINCT fiscal_year, SEVIS_ID, Employer_State, Campus_State
#   FROM base
#   WHERE Employer_State IS NOT NULL
#     AND ( include_same_state OR Employer_State <> Campus_State )
# ),
# flows_by_state AS (
#   SELECT
#     Campus_State,
#     Employer_State AS employer_state,
#     COUNT(*)       AS numer_total
#   FROM flows_pairs
#   GROUP BY Campus_State, employer_state
# )

# -- ── FINAL OUTPUT: one row per Campus_State × Employer_State ─────────
# SELECT
#   CONCAT('STATE: ', Campus_State)           AS study_geography,
#   employer_state,
#   d.denom_total                             AS total_opt_users_in_study_geo,
#   f.numer_total                             AS opt_users_worked_in_state,
#   SAFE_DIVIDE(f.numer_total, NULLIF(d.denom_total, 0))
#     AS share_of_opt_users_working_in_state
# FROM flows_by_state f
# JOIN denom_by_state d USING (Campus_State)
# ORDER BY study_geography, share_of_opt_users_working_in_state DESC NULLS LAST, employer_state;
