In [None]:
%%bigquery
-- Builds the persisted state-level OPT retention table: one row per
-- fiscal_year x Campus_State x Student_Edu_Level_Desc x IS_STEM x
-- NSF_SUBJ_FIELD_BROAD x Country_of_Birth (no school/LMA breakdown).
--
-- Output columns:
--   num_opt_users_with_valid_state  distinct graduates in the slice who have at
--                                   least one qualifying OPT row with a valid
--                                   employer state
--   num_worked_in_campus_state      of those, distinct graduates with at least
--                                   one qualifying OPT row whose employer state
--                                   equals the campus state
--   state_retention_among_opt_users_with_valid_state
--                                   ratio of the two counts (0 when nobody in
--                                   the slice worked in the campus state)
--
-- CREATE OR REPLACE swaps the table in one statement, so no separate DROP TABLE
-- is issued first; dropping up front would leave the table missing if the
-- rebuild failed.
CREATE OR REPLACE TABLE `sevis-beta.sevis_staging.state_retention_nationally_plot8` AS
WITH
-- All graduates, bucketed by fiscal year: Program_End_Date must fall between
-- Oct 1 of (Year - 1) and Sep 30 of Year, inclusive. Campus state is normalized
-- with UPPER(TRIM(...)) so it is comparable with employer_state below.
grad_cohort_all AS (
  SELECT DISTINCT
    `Year` AS fiscal_year,
    UPPER(TRIM(Campus_State)) AS Campus_State,
    Student_Edu_Level_Desc,
    IS_STEM,
    NSF_SUBJ_FIELD_BROAD,
    Country_of_Birth,
    SEVIS_ID
  FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master`
  WHERE SAFE_CAST(Program_End_Date AS DATE)
        BETWEEN DATE(`Year` - 1, 10, 1) AND DATE(`Year`, 9, 30)
    AND Campus_State IS NOT NULL
),

-- Distinct (student, employer state) pairs taken from qualifying OPT rows:
-- post-completion or STEM OPT, with a parseable authorization start date that
-- is later than the program start date, and an employer state that is neither
-- blank nor a placeholder (n/a, na, none, unknown, not applicable, null).
opt_rows_geo AS (
  SELECT DISTINCT
    SEVIS_ID,
    UPPER(TRIM(Employer_State)) AS employer_state
  FROM `sevis-beta.sevis_raw.sevis_f1_cleaned_master`
  WHERE SAFE_CAST(Authorization_Start_Date AS DATE) IS NOT NULL
    AND LOWER(TRIM(Employment_Description)) = 'opt'
    AND LOWER(TRIM(Employment_OPT_Type)) IN ('post-completion', 'stem')
    AND SAFE_CAST(Authorization_Start_Date AS DATE) > SAFE_CAST(Program_Start_Date AS DATE)
    AND Employer_State IS NOT NULL
    AND TRIM(Employer_State) <> ''
    AND NOT REGEXP_CONTAINS(LOWER(TRIM(Employer_State)),
                            r'^(?:n/?a|none|unknown|not applicable|null)$')
),

-- Cohort: graduates with at least one qualifying OPT row (enforced by the
-- inner join), one row per slice + student. worked_in_campus_state is TRUE
-- when any of the student's employer states equals the campus state.
grad_cohort_opt_state AS (
  SELECT
    g.fiscal_year,
    g.Campus_State,
    g.Student_Edu_Level_Desc,
    g.IS_STEM,
    g.NSF_SUBJ_FIELD_BROAD,
    g.Country_of_Birth,
    g.SEVIS_ID,
    LOGICAL_OR(o.employer_state = g.Campus_State) AS worked_in_campus_state
  FROM grad_cohort_all AS g
  INNER JOIN opt_rows_geo AS o
    ON o.SEVIS_ID = g.SEVIS_ID
  GROUP BY
    g.fiscal_year,
    g.Campus_State,
    g.Student_Edu_Level_Desc,
    g.IS_STEM,
    g.NSF_SUBJ_FIELD_BROAD,
    g.Country_of_Birth,
    g.SEVIS_ID
)

-- Numerator and denominator are computed in the same aggregation rather than
-- joined back together on the slice columns. A join on those columns never
-- matches rows whose dimension values are NULL, which would zero out the
-- numerator for every slice with a NULL dimension; GROUP BY keeps NULLs
-- together, so both counts always describe the same slice.
SELECT
  fiscal_year,
  Campus_State,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth,
  COUNT(DISTINCT SEVIS_ID) AS num_opt_users_with_valid_state,
  COUNT(DISTINCT IF(worked_in_campus_state, SEVIS_ID, NULL)) AS num_worked_in_campus_state,
  -- SAFE_DIVIDE returns NULL instead of raising on a zero denominator.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(worked_in_campus_state, SEVIS_ID, NULL)),
    COUNT(DISTINCT SEVIS_ID)
  ) AS state_retention_among_opt_users_with_valid_state
FROM grad_cohort_opt_state
GROUP BY
  fiscal_year,
  Campus_State,
  Student_Edu_Level_Desc,
  IS_STEM,
  NSF_SUBJ_FIELD_BROAD,
  Country_of_Birth;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Ad-hoc retention check against the staged plot8 table.
-- Every parameter is an optional filter: a NULL value disables that filter.
-- A DECLARE without DEFAULT starts out as NULL.
DECLARE fys ARRAY<INT64> DEFAULT [2015];  -- full range: [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
DECLARE birth_countries ARRAY<STRING>;    -- e.g. ['china']
DECLARE edu_levels ARRAY<STRING>;         -- e.g. ['masters','doctorate']
DECLARE stem_param BOOL;                  -- TRUE / FALSE / NULL
DECLARE nsf_fields ARRAY<STRING>;         -- e.g. ['Engineering']

-- Not available in the site filters; for testing only. Values are compared
-- against LOWER(TRIM(Campus_State)), so they must be given in lower case.
DECLARE study_state ARRAY<STRING> DEFAULT ['district of columbia'];

WITH per_state_year AS (
  -- Apply the optional filters, then total the slice-level counts for each
  -- (state, fiscal year) pair.
  SELECT
    Campus_State AS state,
    fiscal_year,
    SUM(CAST(num_worked_in_campus_state AS INT64))     AS numer_total,
    SUM(CAST(num_opt_users_with_valid_state AS INT64)) AS denom_total
  FROM `sevis-beta.sevis_staging.state_retention_nationally_plot8`
  WHERE (fys IS NULL             OR fiscal_year IN UNNEST(fys))
    AND (study_state IS NULL     OR LOWER(TRIM(Campus_State)) IN UNNEST(study_state))
    AND (birth_countries IS NULL OR Country_of_Birth IN UNNEST(birth_countries))
    AND (edu_levels IS NULL      OR Student_Edu_Level_Desc IN UNNEST(edu_levels))
    AND (stem_param IS NULL      OR IS_STEM = stem_param)
    AND (nsf_fields IS NULL      OR NSF_SUBJ_FIELD_BROAD IN UNNEST(nsf_fields))
  GROUP BY state, fiscal_year
)
SELECT
  state,
  fiscal_year,
  numer_total,
  denom_total,
  -- One share per (state, fiscal_year) row; SAFE_DIVIDE already yields NULL
  -- when the denominator is zero or NULL.
  SAFE_DIVIDE(numer_total, denom_total) AS retention_share_across_years,
  -- Number of fiscal years present for this state within the filters.
  COUNT(DISTINCT fiscal_year) OVER (PARTITION BY state) AS n_years_covered
FROM per_state_year
ORDER BY state, fiscal_year;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,state,fiscal_year,numer_total,denom_total,retention_share_across_years,n_years_covered
0,DISTRICT OF COLUMBIA,2010,311,688,0.452035,13
1,DISTRICT OF COLUMBIA,2011,322,675,0.477037,13
2,DISTRICT OF COLUMBIA,2012,395,862,0.458237,13
3,DISTRICT OF COLUMBIA,2013,412,1006,0.409543,13
4,DISTRICT OF COLUMBIA,2014,478,1279,0.373729,13
5,DISTRICT OF COLUMBIA,2015,509,1348,0.377596,13
6,DISTRICT OF COLUMBIA,2016,687,1502,0.45739,13
7,DISTRICT OF COLUMBIA,2017,732,1587,0.461248,13
8,DISTRICT OF COLUMBIA,2018,647,1615,0.400619,13
9,DISTRICT OF COLUMBIA,2019,651,1638,0.397436,13
