In [8]:
from __future__ import annotations

import numpy as np
import pandas as pd
from pathlib import Path

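This cell enables postponed evaluation of type annotations (`from __future__ import annotations`) and imports the libraries used throughout the notebook: NumPy and pandas for data analysis, and `pathlib.Path` for file handling.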

In [16]:
# Folder holding the extracted monthly CSV files. The path is relative, so it
# is resolved against the working directory of the running kernel.
BASE_DIR = Path("Econ470", "a0", "work", "ma", "Extracted Data")


def monthlist_for_year(y: int) -> list[str]:
    """Return the zero-padded month labels to load for year ``y``.

    Parameters
    ----------
    y : int
        Calendar year.

    Returns
    -------
    list[str]
        ``"07"`` through ``"12"`` when ``y`` is 2006, otherwise ``"01"``
        through ``"12"``.
    """
    first_month = 7 if y == 2006 else 1
    return [str(month).zfill(2) for month in range(first_month, 13)]


This code defines the base directory for the extracted data and a helper function for working with month values. Note that the file path in `BASE_DIR` is still incorrect; I need to correct it before the data can be loaded. The function `monthlist_for_year` takes a year as input and returns a list of month strings in two-digit format ("01"–"12"). For the year 2006 only, it returns July through December ("07"–"12"); for all other years it returns January through December, which works for us because we need data from 2018.

In [20]:
# Column names assigned to the monthly contract file, in file order.
CONTRACT_COLUMNS = [
    "contractid",
    "planid",
    "org_type",
    "plan_type",
    "partd",
    "snp",
    "eghp",
    "org_name",
    "org_marketing_name",
    "plan_name",
    "parent_org",
    "contract_date",
]

# Every contract column is read as a pandas string except the numeric plan ID.
CONTRACT_DTYPES = {column: "string" for column in CONTRACT_COLUMNS}
CONTRACT_DTYPES["planid"] = "float64"

# Schema of the monthly enrollment file as (column name, dtype) pairs, in file
# order. Identifiers and counts are read as float64; text fields as strings.
_ENROLL_SCHEMA = (
    ("contractid", "string"),
    ("planid", "float64"),
    ("ssa", "float64"),
    ("fips", "float64"),
    ("state", "string"),
    ("county", "string"),
    ("enrollment", "float64"),
)

ENROLL_COLUMNS = [column for column, _ in _ENROLL_SCHEMA]
ENROLL_DTYPES = dict(_ENROLL_SCHEMA)

This code defines metadata for the two types of datasets, one for contracts and one for enrollment, by listing their expected column names and specifying the data type for each column. Fixing the column order and data types up front keeps parsing consistent, helps prevent parsing errors, and makes downstream analysis more reliable.

In [11]:
def read_contract(path: Path) -> pd.DataFrame:
    """Read one monthly contract-info CSV into a DataFrame.

    The first line of the file is skipped (presumably the file's own header
    row — confirm against the raw data) and the columns are named and typed
    from ``CONTRACT_COLUMNS`` / ``CONTRACT_DTYPES`` instead.

    Parameters
    ----------
    path : Path
        Location of the contract CSV file.

    Returns
    -------
    pd.DataFrame
        One row per line of the file, with the ``CONTRACT_COLUMNS`` columns.
    """
    read_options = dict(
        skiprows=1,
        header=None,
        names=CONTRACT_COLUMNS,
        dtype=CONTRACT_DTYPES,
        low_memory=False,
    )
    return pd.read_csv(path, **read_options)


def read_enroll(path: Path) -> pd.DataFrame:
    """Read one monthly enrollment CSV into a DataFrame.

    The first line of the file is skipped (presumably the file's own header
    row — confirm against the raw data) and the columns are named and typed
    from ``ENROLL_COLUMNS`` / ``ENROLL_DTYPES``. A literal ``*`` in any field
    is parsed as a missing value.

    Parameters
    ----------
    path : Path
        Location of the enrollment CSV file.

    Returns
    -------
    pd.DataFrame
        One row per line of the file, with the ``ENROLL_COLUMNS`` columns.
    """
    read_options = dict(
        skiprows=1,
        header=None,
        names=ENROLL_COLUMNS,
        dtype=ENROLL_DTYPES,
        na_values=["*"],
        low_memory=False,
    )
    return pd.read_csv(path, **read_options)



def fill_downup(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Fill missing values in ``cols`` forward, then backward.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    cols : list[str]
        Columns whose gaps should be filled, in the frame's current row order.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which each gap in ``cols`` takes the nearest
        earlier value, or the nearest later value when no earlier one exists.
    """
    filled = df[cols].ffill().bfill()
    result = df.copy()
    result[cols] = filled
    return result



In [12]:
def load_month(m: str, y: int) -> pd.DataFrame:
    """Load and combine the contract and enrollment files for one month.

    Parameters
    ----------
    m : str
        Two-digit month label used in the file names, e.g. ``"01"``.
    y : int
        Calendar year used in the file names.

    Returns
    -------
    pd.DataFrame
        Contract rows (one per contract/plan pair, first occurrence kept)
        left-joined to that plan's enrollment rows, with integer ``month``
        and ``year`` columns added.
    """
    plan_keys = ["contractid", "planid"]

    contracts = read_contract(BASE_DIR / f"CPSC_Contract_Info_{y}_{m}.csv")
    contracts = contracts.drop_duplicates(subset=plan_keys, keep="first")

    enrollment = read_enroll(BASE_DIR / f"CPSC_Enrollment_Info_{y}_{m}.csv")

    # Left join: plans without any enrollment rows are kept with missing
    # enrollment fields.
    merged = pd.merge(contracts, enrollment, on=plan_keys, how="left")
    merged["month"] = int(m)
    merged["year"] = y
    return merged


def build_plan_year(y: int) -> pd.DataFrame:
    """Stack all monthly files for year ``y`` and fill gaps in descriptive fields.

    Each month returned by ``monthlist_for_year(y)`` is loaded with
    ``load_month`` and the results are concatenated. Rows are sorted by
    contract, plan, state, county and month, after which missing values are
    filled forward then backward within groups:

    * ``fips`` within each (state, county);
    * plan-level fields within each (contractid, planid);
    * organisation-level fields within each contractid.

    The fills use group-wise ``ffill``/``bfill`` on the selected columns
    rather than ``groupby(...).apply(...)``. ``apply`` operating on the
    grouping columns is deprecated since pandas 2.2; once those columns are
    excluded from the frame passed to the callback, ``group_keys=False``
    would drop them from the result. The group-wise fills produce the same
    values without a Python-level copy of every group.

    Parameters
    ----------
    y : int
        Calendar year to load.

    Returns
    -------
    pd.DataFrame
        Monthly plan/county rows for the year in sorted order, with a fresh
        0..n-1 index.
    """
    monthly_frames = [load_month(m, y) for m in monthlist_for_year(y)]

    plan_year = (
        pd.concat(monthly_frames, ignore_index=True)
        .sort_values(
            ["contractid", "planid", "state", "county", "month"],
            kind="mergesort",
        )
        # A clean unique index keeps the column assignments below aligned.
        .reset_index(drop=True)
    )

    # (grouping keys, columns to fill inside each group), applied in order.
    fill_plan = [
        (["state", "county"], ["fips"]),
        (
            ["contractid", "planid"],
            ["plan_type", "partd", "snp", "eghp", "plan_name"],
        ),
        (
            ["contractid"],
            ["org_type", "org_name", "org_marketing_name", "parent_org"],
        ),
    ]

    for keys, cols in fill_plan:
        # dropna=False keeps rows whose key is missing in their own group,
        # matching the original grouping. The groupby is rebuilt before the
        # backward fill so it sees the forward-filled values.
        plan_year[cols] = plan_year.groupby(keys, dropna=False)[cols].ffill()
        plan_year[cols] = plan_year.groupby(keys, dropna=False)[cols].bfill()

    return plan_year


In [13]:
def collapse_to_yearly_panel(plan_year: pd.DataFrame) -> pd.DataFrame:
    """Collapse monthly rows to one row per contract, plan, county (fips) and year.

    Rows are ordered by the group keys and month, then summarised:

    * ``n_nonmiss`` – number of months with non-missing enrollment;
    * ``avg_enrollment``, ``sd_enrollment`` (sample standard deviation,
      missing with fewer than two values), ``min_enrollment``,
      ``max_enrollment`` – statistics over the non-missing months;
    * ``first_enrollment`` / ``last_enrollment`` – earliest and latest
      non-missing monthly enrollment;
    * descriptive columns (state, county, plan and organisation fields,
      ``contract_date``) – taken from the group's last row, even when that
      value is missing.

    ``year`` comes only from the group key. The previous implementation also
    returned it from the per-group summary, which made ``reset_index`` fail
    with "cannot insert year, already exists".

    Parameters
    ----------
    plan_year : pd.DataFrame
        Monthly rows containing the key columns, ``month``, ``enrollment``
        and the descriptive columns listed above. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per (contractid, planid, fips, year), including groups whose
        keys are missing.
    """
    keys = ["contractid", "planid", "fips", "year"]
    descriptive_cols = [
        "state", "county", "org_type", "plan_type", "partd", "snp", "eghp",
        "org_name", "org_marketing_name", "plan_name", "parent_org",
        "contract_date",
    ]

    ordered = plan_year.sort_values(keys + ["month"], kind="mergesort")

    # The built-in aggregations skip missing values, so all-missing groups
    # yield a count of 0 and NaN for every statistic.
    stats = (
        ordered.groupby(keys, dropna=False)["enrollment"]
        .agg(
            n_nonmiss="count",
            avg_enrollment="mean",
            sd_enrollment="std",
            min_enrollment="min",
            max_enrollment="max",
            first_enrollment="first",
            last_enrollment="last",
        )
        .reset_index()
    )

    # Last row of each group in sorted order. drop_duplicates is used because
    # GroupBy.last would skip missing values, unlike a positional "last row".
    last_rows = ordered.drop_duplicates(subset=keys, keep="last")[
        keys + descriptive_cols
    ]

    # pandas matches missing keys to each other in a merge, so groups with a
    # missing fips or planid keep their descriptive columns.
    return stats.merge(last_rows, on=keys, how="left")



In [None]:
# Driver cell: build the 2018 monthly plan data, then collapse it to the
# yearly panel. `plan_year` and `final_plans` are left at module level for
# the cells that follow.
if __name__ == "__main__":
    # Year of data to process.
    y = 2018
    # Monthly plan/county rows for the year, with descriptive fields filled.
    plan_year = build_plan_year(y)
    # One row per contract, plan, county (fips) and year.
    final_plans = collapse_to_yearly_panel(plan_year)
     

This code takes the raw plan data for 2018, processes it, and creates a summarized yearly panel of plans for the final combined data set.

In [None]:
import pandas as pd

# Tabulate how many rows of the yearly panel fall under each plan type
# (missing plan types are not counted), largest group first.
plan_counts = (
    final_plans["plan_type"]
    .value_counts()
    .rename_axis("Plan Type")
    .reset_index(name="Number of Plans")
    .sort_values(by="Number of Plans", ascending=False)
)

# Show the table as plain text.
print(plan_counts)


This is the base code for creating the table of plan counts by plan type.

In [None]:
import pandas as pd

# Filter out special needs plans (SNP), employer group plans (EGHP), and
# 800-series plans.
def _is_flagged(flag: pd.Series) -> pd.Series:
    """Return True where a yes/no indicator is set.

    Accepts "Y" or "Yes" in any letter case, so the filter works whichever
    encoding the source files use (NOTE(review): confirm the actual values in
    the raw data). Missing values count as not flagged, so those rows are kept.
    """
    return flag.astype("string").str.strip().str.upper().isin(["Y", "YES"])


is_snp = _is_flagged(final_plans["snp"])
is_eghp = _is_flagged(final_plans["eghp"])

# planid is numeric (read as float64), so test the 800-899 range directly.
# Matching the text form against a leading "8" would also drop plan IDs 8 and
# 80-89, whose text forms are "8.0" and "80.0". Missing plan IDs are kept.
is_800_series = final_plans["planid"].between(800, 899)

filtered_plans = final_plans[~(is_snp | is_eghp | is_800_series)]

# Recompute the count of plans by plan_type
filtered_plan_counts = filtered_plans['plan_type'].value_counts().reset_index()
filtered_plan_counts.columns = ['Plan Type', 'Number of Plans']

# Optional: sort by number of plans descending
filtered_plan_counts = filtered_plan_counts.sort_values(by='Number of Plans', ascending=False)

# Display the updated table
print(filtered_plan_counts)

This is the base code for removing all special needs plans (SNP), employer group plans (EGHP), and 800-series plans, and then recounting plans by plan type.

In [None]:
import pandas as pd

# Merge filtered plans with service area to keep only approved counties.
# NOTE(review): `service_area` is not defined in the cells visible here; it
# must be loaded before this cell runs. It is assumed to have 'planid' and
# 'county' columns. Plan IDs repeat across contracts, so if 'contractid' is
# available in `service_area` it should probably be part of the join key —
# confirm against the service-area data.
service_pairs = service_area[['planid', 'county']].drop_duplicates()

# De-duplicating the pairs first stops the inner join from repeating a plan
# row once per matching service-area row, which would skew the averages.
approved_plans = filtered_plans.merge(
    service_pairs,
    on=['planid', 'county'],
    how='inner'  # keeps only matches
)

# Compute average enrollment by plan_type. The yearly panel has no
# 'enrollment' column; its per-plan yearly mean is stored in 'avg_enrollment'.
avg_enrollment = (
    approved_plans
    .groupby('plan_type')['avg_enrollment']
    .mean()
    .reset_index()
)

# Rename columns for clarity
avg_enrollment.columns = ['Plan Type', 'Average Enrollment']

# Optional: sort descending by average enrollment
avg_enrollment = avg_enrollment.sort_values(by='Average Enrollment', ascending=False)

# Display the table
print(avg_enrollment)
