# Graduate Cohort OPT Analysis

This notebook counts graduates by fiscal year and measures how many of them transitioned to Optional Practical Training (OPT).

**Prerequisites:** Run `create_staging_tables.ipynb` first to create the staging data.

**Data source:** `../data/dta/grad_cohort_opt_plot1.parquet`

In [None]:
import duckdb
import pandas as pd

# Connect to DuckDB
# No database path is passed to connect(); the queries below read the parquet
# file directly via read_parquet(), so nothing needs to be attached here.
# `con` is shared by every query cell and is closed in the final cell.
con = duckdb.connect()
print("Connected to DuckDB")

## Filter Parameters

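Set your filters here. Use `None` to include all values; otherwise pass a list of values (or `True`/`False` for `is_stem`). State and school names are matched case-insensitively; all other text filters must match the stored values exactly.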

In [None]:
# Filter parameters: every filter left as None is skipped when the WHERE
# clause is built, i.e. all values of that column are included.

# Fiscal years 2010 through 2022 inclusive (range() excludes its stop value).
fys = list(range(2010, 2023))

# Campus location filters.
campus_states = None  # list of state names, e.g. ['california', 'massachusetts']
campus_lmas = None  # list of CAMPUS_LMA values

# Programme filters.
edu_levels = None  # list of levels, e.g. ['masters', 'bachelors', 'doctorate']
is_stem = None  # True, False, or None
nsf_fields = None  # list of NSF_SUBJ_FIELD_BROAD values

# Student / institution filters.
birth_countries = None  # list of Country_of_Birth values
schools = None  # list of school names, e.g. ['loras college']

print("Filters configured")

In [None]:
# Build WHERE clause based on filters


def _sql_in_list(values, lower=False):
    """Render ``values`` as a comma-separated list of SQL string literals.

    Each value is wrapped in single quotes, and any single quote inside the
    value is doubled (the standard SQL escape), so names such as
    "cote d'ivoire" or "st. john's university" no longer break the query.

    Args:
        values: Iterable of filter values (normally strings).
        lower: When True, lower-case each value first; used together with
            LOWER(column) for case-insensitive matching.

    Returns:
        A string such as ``'a','b','c'`` ready to place inside ``IN (...)``.
    """
    literals = []
    for value in values:
        text = str(value.lower() if lower else value)
        literals.append("'" + text.replace("'", "''") + "'")
    return ",".join(literals)


# Each configured (non-None) filter contributes one condition; conditions are
# AND-ed together. NOTE: an empty list renders as `IN ()`; use None, not [],
# to disable a filter.
where_conditions = []

if fys is not None:
    # Fiscal years are numeric, so they are emitted without quotes.
    fy_list = ",".join(str(y) for y in fys)
    where_conditions.append(f"fiscal_year IN ({fy_list})")

if campus_states is not None:
    # Case-insensitive: both the column and the values are lower-cased.
    states_list = _sql_in_list(campus_states, lower=True)
    where_conditions.append(f"LOWER(Campus_State) IN ({states_list})")

if campus_lmas is not None:
    lmas_list = _sql_in_list(campus_lmas)
    where_conditions.append(f"CAMPUS_LMA IN ({lmas_list})")

if edu_levels is not None:
    edu_list = _sql_in_list(edu_levels)
    where_conditions.append(f"Student_Edu_Level_Desc IN ({edu_list})")

if is_stem is not None:
    # Python's True/False is interpolated as the SQL boolean literal.
    where_conditions.append(f"IS_STEM = {is_stem}")

if nsf_fields is not None:
    nsf_list = _sql_in_list(nsf_fields)
    where_conditions.append(f"NSF_SUBJ_FIELD_BROAD IN ({nsf_list})")

if birth_countries is not None:
    countries_list = _sql_in_list(birth_countries)
    where_conditions.append(f"Country_of_Birth IN ({countries_list})")

if schools is not None:
    # Case-insensitive: both the column and the values are lower-cased.
    schools_list = _sql_in_list(schools, lower=True)
    where_conditions.append(f"LOWER(School_Name) IN ({schools_list})")

# "1=1" keeps `WHERE {where_clause}` valid SQL when no filter is active.
where_clause = " AND ".join(where_conditions) if where_conditions else "1=1"

print(f"WHERE clause: {where_clause}")

## Query: Graduates and OPT Usage by Year

In [None]:
# Per fiscal year, count distinct SEVIS_ID values overall, among rows with
# used_opt = 1, and among rows with used_opt = 0.
# The CASE expressions yield NULL when their condition is not met, and
# COUNT(DISTINCT ...) skips NULLs, so a row whose used_opt is neither 1 nor 0
# contributes only to total_graduates.
# `where_clause` comes from the filter-building cell, which must run first.
query = f"""
    SELECT
      fiscal_year AS year,
      COUNT(DISTINCT SEVIS_ID) AS total_graduates,
      COUNT(DISTINCT CASE WHEN used_opt = 1 THEN SEVIS_ID END) AS grads_to_opt_total,
      COUNT(DISTINCT CASE WHEN used_opt = 0 THEN SEVIS_ID END) AS non_opt_total
    FROM read_parquet('../data/dta/grad_cohort_opt_plot1.parquet')
    WHERE {where_clause}
    GROUP BY year
    ORDER BY year
"""

# Execute on the shared connection and materialize the result via .df()
# for rich display in the notebook.
result = con.execute(query).df()
display(result)

## Additional Analysis

You can add more queries here to explore the data further:
- Breakdown by STEM vs non-STEM
- Breakdown by education level
- Breakdown by state or LMA
- Any other dimension used in the filters above (e.g., country of birth, NSF field, or school)

In [None]:
# Example: Breakdown by STEM status
# For each IS_STEM value, count distinct SEVIS_ID values overall and among
# rows with used_opt = 1, using the same filters as the yearly query.
# ORDER BY makes the row order deterministic across runs; without it the
# order of the groups is unspecified.
# `where_clause` comes from the filter-building cell, which must run first.
query = f"""
    SELECT 
      IS_STEM, 
      COUNT(DISTINCT SEVIS_ID) AS graduates, 
      COUNT(DISTINCT CASE WHEN used_opt = 1 THEN SEVIS_ID END) AS opt_users
    FROM read_parquet('../data/dta/grad_cohort_opt_plot1.parquet') 
    WHERE {where_clause} 
    GROUP BY IS_STEM
    ORDER BY IS_STEM
"""

# Execute on the shared connection and materialize the result via .df()
# for rich display in the notebook.
result = con.execute(query).df()
display(result)

In [None]:
# Close connection
# Run this last: the query cells above call con.execute() and need `con`
# open; after closing, re-run the connection cell before re-running them.
con.close()
print("DuckDB connection closed")