In [None]:
# %%bigquery
# -- Builds the staging table queried by the cells below: only the 3 degree levels (bachelors, masters, doctorate), years 2010-2022
# CREATE OR REPLACE TABLE `sevis-beta.sevis_staging.subject_fields_plot3` AS
# SELECT *
# FROM `sevis-beta.sevis_raw.ipeds_raw_data_subset_plot3`
# WHERE LOWER(TRIM(SEVIS_mapping)) IN ('bachelors','masters','doctorate')
#   AND `YEAR` IN (2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022);

: 

In [None]:
%%sql
-- Computes, per `YEAR` and NSF_SUBJ_FIELD_BROAD, the ratio
--   SUM(NONRES_TOTAL) / SUM(GRAND_TOTAL)
-- over the rows of the staging table that pass the optional filters below.
--
-- Filter parameters: leave a parameter NULL (or set it to an empty array) to
-- disable that filter.
--   ipeds_years   : matched against `YEAR`, e.g. [2010, 2011, 2012]
--   schools       : matched exactly against INSTITUTION_NAME
--   degree_levels : matched against SEVIS_mapping ignoring case and surrounding
--                   whitespace, e.g. ['doctorate']. NULL keeps every row of the
--                   staging table, which (see cell above) is meant to hold only
--                   'bachelors', 'masters' and 'doctorate' rows.
DECLARE ipeds_years   ARRAY<INT64>  DEFAULT NULL;
DECLARE schools       ARRAY<STRING> DEFAULT ['University of California-Berkeley'];
DECLARE degree_levels ARRAY<STRING> DEFAULT NULL;

WITH cleaned AS (
    -- Apply the optional filters and parse the text count columns into integers.
    SELECT
        `YEAR` AS ipeds_academic_year,
        NSF_SUBJ_FIELD_BROAD,
        -- Keep digits only, so '1,234' -> 1234 and blank / 'X' -> NULL.
        -- NOTE(review): this also strips '-' and '.', so a value such as '12.5'
        -- would parse as 125 -- confirm the source columns only hold whole counts.
        SAFE_CAST(NULLIF(REGEXP_REPLACE(TRIM(NONRES_TOTAL), r'[^0-9]', ''), '') AS INT64) AS nonres_int,
        SAFE_CAST(NULLIF(REGEXP_REPLACE(TRIM(GRAND_TOTAL), r'[^0-9]', ''), '') AS INT64) AS grand_int
    FROM `sevis-beta.sevis_staging.subject_fields_plot3`
    WHERE
        (ipeds_years IS NULL OR ARRAY_LENGTH(ipeds_years) = 0 OR `YEAR` IN UNNEST(ipeds_years))
        AND (schools IS NULL OR ARRAY_LENGTH(schools) = 0 OR INSTITUTION_NAME IN UNNEST(schools))
        AND (
            degree_levels IS NULL
            OR ARRAY_LENGTH(degree_levels) = 0
            -- The staging table was filtered on LOWER(TRIM(SEVIS_mapping)) but
            -- stores the raw value, so normalise both sides before comparing.
            OR LOWER(TRIM(SEVIS_mapping)) IN (
                SELECT LOWER(TRIM(degree_level))
                FROM UNNEST(degree_levels) AS degree_level
            )
        )
)
SELECT
    ipeds_academic_year,
    NSF_SUBJ_FIELD_BROAD,
    -- SAFE_DIVIDE already returns NULL for a zero (or NULL) denominator.
    -- NOTE(review): a row whose NONRES_TOTAL parses to NULL still adds its
    -- GRAND_TOTAL to the denominator -- confirm that is the intended treatment
    -- of suppressed values.
    SAFE_DIVIDE(SUM(nonres_int), SUM(grand_int)) AS nonresident_award_fraction,
    SUM(nonres_int) AS nonres_int_value,
    SUM(grand_int) AS total_int_value
FROM cleaned
GROUP BY ipeds_academic_year, NSF_SUBJ_FIELD_BROAD
ORDER BY NSF_SUBJ_FIELD_BROAD, ipeds_academic_year;

Testing and exploratory queries below here.

In [None]:
%%bigquery
-- ================================================================================
-- ADDITIONAL UTILITY QUERY: GET AVAILABLE SCHOOLS FROM IPEDS
-- ================================================================================
-- Lists each distinct INSTITUTION_NAME in the staging table, sorted by name, so
-- an exact value can be copied into the `schools` filter.
-- Execute separately to explore available schools.

SELECT DISTINCT INSTITUTION_NAME
FROM `sevis-beta.sevis_staging.subject_fields_plot3`
ORDER BY INSTITUTION_NAME;
