# Homework 1

## Link to github
https://github.com/KhushVak/Hwk-1

## Building the dataset

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

YEAR = 2018

ENROLL_DIR = Path("~/econ470/a0/work/ma-data/ma/enrollment/Extracted Data").expanduser()
SA_DIR     = Path("~/econ470/a0/work/ma-data/ma/service-area/Extracted Data").expanduser()

OUT_DIR = Path("~/econ470/a0/output").expanduser()
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("ENROLL_DIR:", ENROLL_DIR)
print("SA_DIR:", SA_DIR)
print("OUT_DIR:", OUT_DIR)

# test one month exists
print((ENROLL_DIR / f"CPSC_Contract_Info_{YEAR}_01.csv").exists())
print((ENROLL_DIR / f"CPSC_Enrollment_Info_{YEAR}_01.csv").exists())
print((SA_DIR     / f"MA_Cnty_SA_{YEAR}_01.csv").exists())


ENROLL_DIR: /home/kvakta/econ470/a0/work/ma-data/ma/enrollment/Extracted Data
SA_DIR: /home/kvakta/econ470/a0/work/ma-data/ma/service-area/Extracted Data
OUT_DIR: /home/kvakta/econ470/a0/output
True
True
True


In [2]:
CONTRACT_COLUMNS = [
    "contractid", "planid", "org_type", "plan_type", "partd", "snp", "eghp",
    "org_name", "org_marketing_name", "plan_name", "parent_org", "contract_date",
]
ENROLL_COLUMNS = ["contractid", "planid", "ssa", "fips", "state", "county", "enrollment"]

def read_contract(path: Path) -> pd.DataFrame:
    df = pd.read_csv(
        path, skiprows=1, header=None,
        encoding="latin1", encoding_errors="replace",
        low_memory=False
    )
    df = df.iloc[:, :len(CONTRACT_COLUMNS)]
    df.columns = CONTRACT_COLUMNS
    return df

def read_enroll(path: Path) -> pd.DataFrame:
    df = pd.read_csv(
        path, skiprows=1, header=None, na_values=["*"],
        encoding="latin1", encoding_errors="replace",
        low_memory=False
    )
    df = df.iloc[:, :len(ENROLL_COLUMNS)]
    df.columns = ENROLL_COLUMNS
    return df

def read_service_area(path: Path) -> pd.DataFrame:
    df = pd.read_csv(
        path, dtype="string",
        encoding="latin1", encoding_errors="replace",
        low_memory=False
    )
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    if "contract_id" in df.columns:
        df = df.rename(columns={"contract_id": "contractid"})
    if "ssa" in df.columns:
        df["ssa"] = pd.to_numeric(df["ssa"], errors="coerce")
    if "fips" in df.columns:
        df["fips"] = pd.to_numeric(df["fips"], errors="coerce")
    keep = [c for c in ["contractid", "ssa", "fips", "state", "county"] if c in df.columns]
    return df[keep].copy()

print("Readers defined.")
c_test = ENROLL_DIR / f"CPSC_Contract_Info_{YEAR}_01.csv"
e_test = ENROLL_DIR / f"CPSC_Enrollment_Info_{YEAR}_01.csv"
s_test = SA_DIR     / f"MA_Cnty_SA_{YEAR}_01.csv"

print("Contract sample:")
display(read_contract(c_test).head())

print("Enroll sample:")
display(read_enroll(e_test).head())

print("Service area sample:")
display(read_service_area(s_test).head())

Readers defined.
Contract sample:


Unnamed: 0,contractid,planid,org_type,plan_type,partd,snp,eghp,org_name,org_marketing_name,plan_name,parent_org,contract_date
0,90091,,HCPP - 1833 Cost,HCPP - 1833 Cost,No,No,No,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,United Mine Workers of America Health & Retire...,,UMWA Health and Retirement Funds,02/01/1974 0:00:00
1,E0654,801.0,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00
2,E3014,801.0,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,PSERS HOP PROGRAM,Pennsylvania Public School Employees Retiremen...,PSERS Health Options Program (Employer PDP),Commonwealth of PA Pub Schools Retirement System,01/01/2007 0:00:00
3,E4744,801.0,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,MODOT/MSHP MEDICAL AND LIFE INSURANCE PLAN,MISSOURI DEPARTMENT OF TRANSPORTATION,Missouri Department of Transportatio/ Highway ...,Missouri Highways and Transportation Commission,01/01/2007 0:00:00
4,H0022,1.0,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00


Enroll sample:


Unnamed: 0,contractid,planid,ssa,fips,state,county,enrollment
0,E0654,801,2198,,,,
1,E0654,801,55830,,,,
2,E0654,801,2275,,,,
3,E0654,801,1000,1001.0,AL,Autauga,
4,E0654,801,1010,1003.0,AL,Baldwin,13.0


Service area sample:


Unnamed: 0,contractid,ssa,fips,state,county
0,90091,,,,
1,H0022,36110.0,39023.0,OH,Clark
2,H0022,36170.0,39035.0,OH,Cuyahoga
3,H0022,36260.0,39051.0,OH,Fulton
4,H0022,36280.0,39055.0,OH,Geauga


In [3]:
def monthlist(y: int):
    return [f"{m:02d}" for m in range(1, 13)]

def load_month(m: str, y: int) -> pd.DataFrame:
    c_path = ENROLL_DIR / f"CPSC_Contract_Info_{y}_{m}.csv"
    e_path = ENROLL_DIR / f"CPSC_Enrollment_Info_{y}_{m}.csv"

    contract = read_contract(c_path).drop_duplicates(subset=["contractid", "planid"])
    enroll   = read_enroll(e_path)

    df = contract.merge(enroll, on=["contractid", "planid"], how="left")
    df["month"] = int(m)
    df["year"] = y

    # merge key for service-area: prefer FIPS, else SSA
    df["county_key"] = df["fips"].where(df["fips"].notna(), df["ssa"])
    return df

# quick test
m01 = load_month("01", YEAR)
print("Jan shape:", m01.shape)
print(m01[["contractid","planid","state","county","fips","ssa","month"]].head())


Jan shape: (2320300, 20)
  contractid  planid state   county    fips      ssa  month
0      90091     NaN   NaN      NaN     NaN      NaN      1
1      E0654   801.0   NaN      NaN     NaN   2198.0      1
2      E0654   801.0   NaN      NaN     NaN  55830.0      1
3      E0654   801.0   NaN      NaN     NaN   2275.0      1
4      E0654   801.0    AL  Autauga  1001.0   1000.0      1


In [4]:
def build_plan_month(y: int) -> pd.DataFrame:
    frames = []
    for m in monthlist(y):
        print("Loading month", m)
        frames.append(load_month(m, y))
    out = pd.concat(frames, ignore_index=True)
    return out

plan_month = build_plan_month(YEAR)
print("plan_month shape:", plan_month.shape)
def build_service_area_keyed(y: int) -> pd.DataFrame:
    frames = []
    for m in monthlist(y):
        path = SA_DIR / f"MA_Cnty_SA_{y}_{m}.csv"
        df = read_service_area(path)
        df["county_key"] = df["fips"].where(df["fips"].notna(), df["ssa"])
        df = df[df["county_key"].notna()].copy()
        frames.append(df[["contractid", "county_key"]])
    sa = pd.concat(frames, ignore_index=True).drop_duplicates()
    return sa

sa_keyed = build_service_area_keyed(YEAR)
print("sa_keyed shape:", sa_keyed.shape)
print(sa_keyed.head())
plan_month_sa = plan_month.merge(sa_keyed, on=["contractid", "county_key"], how="inner")

print("Before SA merge:", plan_month.shape)
print("After SA merge:", plan_month_sa.shape)

print("Missing state:", plan_month_sa["state"].isna().sum())
print("Missing county:", plan_month_sa["county"].isna().sum())

display(plan_month_sa[["contractid","planid","state","county","fips","ssa","month","enrollment"]].head(10))


Loading month 01
Loading month 02
Loading month 03
Loading month 04
Loading month 05
Loading month 06
Loading month 07
Loading month 08
Loading month 09
Loading month 10
Loading month 11
Loading month 12
plan_month shape: (27710394, 20)
sa_keyed shape: (331547, 2)
  contractid  county_key
0      H0022       39023
1      H0022       39035
2      H0022       39051
3      H0022       39055
4      H0022       39057
Before SA merge: (27710394, 20)
After SA merge: (16094171, 20)
Missing state: 9009
Missing county: 9009


Unnamed: 0,contractid,planid,state,county,fips,ssa,month,enrollment
0,H0022,1.0,OH,Clark,39023.0,36110.0,1,558.0
1,H0022,1.0,OH,Cuyahoga,39035.0,36170.0,1,3596.0
2,H0022,1.0,OH,Fulton,39051.0,36260.0,1,107.0
3,H0022,1.0,OH,Geauga,39055.0,36280.0,1,80.0
4,H0022,1.0,OH,Greene,39057.0,36290.0,1,539.0
5,H0022,1.0,OH,Lake,39085.0,36440.0,1,278.0
6,H0022,1.0,OH,Lorain,39093.0,36480.0,1,559.0
7,H0022,1.0,OH,Lucas,39095.0,36490.0,1,2782.0
8,H0022,1.0,OH,Medina,39103.0,36530.0,1,192.0
9,H0022,1.0,OH,Montgomery,39113.0,36580.0,1,3128.0


In [5]:
plan_month_sa = plan_month_sa.dropna(subset=["state","county"])
print("After dropping missing labels:", plan_month_sa.shape)


After dropping missing labels: (16085162, 20)


In [6]:
final_plans = (
    plan_month_sa
    .groupby(["contractid","planid","county_key","year"], dropna=False, as_index=False)
    .agg(
        avg_enrollment=("enrollment", "mean"),
        min_enrollment=("enrollment", "min"),
        max_enrollment=("enrollment", "max"),
        state=("state", "last"),
        county=("county", "last"),
        fips=("fips", "last"),
        ssa=("ssa", "last"),
        org_type=("org_type", "last"),
        plan_type=("plan_type", "last"),
        partd=("partd", "last"),
        snp=("snp", "last"),
        eghp=("eghp", "last"),
        org_name=("org_name", "last"),
        org_marketing_name=("org_marketing_name", "last"),
        plan_name=("plan_name", "last"),
        parent_org=("parent_org", "last"),
        contract_date=("contract_date", "last"),
    )
)

print("final_plans shape:", final_plans.shape)
display(final_plans.head())


final_plans shape: (1366487, 21)


Unnamed: 0,contractid,planid,county_key,year,avg_enrollment,min_enrollment,max_enrollment,state,county,fips,...,org_type,plan_type,partd,snp,eghp,org_name,org_marketing_name,plan_name,parent_org,contract_date
0,H0022,1.0,39023.0,2018,598.416667,558.0,638.0,OH,Clark,39023.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
1,H0022,1.0,39035.0,2018,3653.0,3549.0,3829.0,OH,Cuyahoga,39035.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
2,H0022,1.0,39051.0,2018,115.833333,107.0,126.0,OH,Fulton,39051.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
3,H0022,1.0,39055.0,2018,77.333333,68.0,84.0,OH,Geauga,39055.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
4,H0022,1.0,39057.0,2018,571.083333,539.0,618.0,OH,Greene,39057.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00


In [7]:
out_path = OUT_DIR / f"ma_plan_county_year_{YEAR}.csv"
final_plans.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: /home/kvakta/econ470/a0/output/ma_plan_county_year_2018.csv


## 1. Creating table

In [8]:
import pandas as pd

df = pd.read_csv("~/econ470/a0/output/ma_plan_county_year_2018.csv")
print(df.shape)
df.head()


(1366487, 21)


Unnamed: 0,contractid,planid,county_key,year,avg_enrollment,min_enrollment,max_enrollment,state,county,fips,...,org_type,plan_type,partd,snp,eghp,org_name,org_marketing_name,plan_name,parent_org,contract_date
0,H0022,1.0,39023.0,2018,598.416667,558.0,638.0,OH,Clark,39023.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
1,H0022,1.0,39035.0,2018,3653.0,3549.0,3829.0,OH,Cuyahoga,39035.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
2,H0022,1.0,39051.0,2018,115.833333,107.0,126.0,OH,Fulton,39051.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
3,H0022,1.0,39055.0,2018,77.333333,68.0,84.0,OH,Geauga,39055.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00
4,H0022,1.0,39057.0,2018,571.083333,539.0,618.0,OH,Greene,39057.0,...,Demo,Medicare-Medicaid Plan HMO/HMOPOS,Yes,No,No,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Health Plan - MyCare Ohio,Buckeye Health Plan - MyCare Ohio (Medicare-Me...,Centene Corporation,05/01/2014 0:00:00


In [9]:
plans = (
    df[["contractid", "planid", "plan_type"]]
    .drop_duplicates()
)
print("Unique plans:", plans.shape[0])


Unique plans: 4217


In [10]:
plan_type_table = (
    plans
    .groupby("plan_type")
    .size()
    .reset_index(name="Number of Plans")
    .sort_values("Number of Plans", ascending=False)
)

plan_type_table


Unnamed: 0,plan_type,Number of Plans
1,HMO/HMOPOS,2678
2,Local PPO,966
5,National PACE,254
7,Regional PPO,109
0,1876 Cost,101
4,Medicare-Medicaid Plan HMO/HMOPOS,54
6,PFFS,50
3,MSA,5


In [12]:
import pandas as pd

df = pd.read_csv("~/econ470/a0/output/ma_plan_county_year_2018.csv")

plans = (
    df[["contractid", "planid", "plan_type", "year"]]
    .drop_duplicates()
)
plan_type_year = (
    plans
    .groupby(["plan_type", "year"])
    .size()
    .reset_index(name="Number of Plans")
)
table1 = (
    plan_type_year
    .pivot(index="plan_type", columns="year", values="Number of Plans")
    .fillna(0)
    .astype(int)
)

table1.index.name = "Plan Type"
table1


year,2018
Plan Type,Unnamed: 1_level_1
1876 Cost,101
HMO/HMOPOS,2678
Local PPO,966
MSA,5
Medicare-Medicaid Plan HMO/HMOPOS,54
National PACE,254
PFFS,50
Regional PPO,109


## 2. Updated Table

In [13]:
import pandas as pd

df = pd.read_csv("~/econ470/a0/output/ma_plan_county_year_2018.csv")
plans = (
    df[["contractid", "planid", "plan_type", "year", "snp", "eghp"]]
    .drop_duplicates()
)
plans_filtered = plans[
    (plans["snp"] != "Y") &
    (plans["eghp"] != "Y") &
    (~plans["planid"].between(800, 899))
]


In [14]:
print("Plans before exclusions:", plans.shape[0])
print("Plans after exclusions:", plans_filtered.shape[0])


Plans before exclusions: 4217
Plans after exclusions: 3267


In [15]:
plan_type_year_filtered = (
    plans_filtered
    .groupby(["plan_type", "year"])
    .size()
    .reset_index(name="Number of Plans")
)
table1_filtered = (
    plan_type_year_filtered
    .pivot(index="plan_type", columns="year", values="Number of Plans")
    .fillna(0)
    .astype(int)
)

table1_filtered.index.name = "Plan Type"
table1_filtered


year,2018
Plan Type,Unnamed: 1_level_1
1876 Cost,93
HMO/HMOPOS,2133
Local PPO,622
MSA,3
Medicare-Medicaid Plan HMO/HMOPOS,54
National PACE,254
PFFS,46
Regional PPO,62


## 3. Table of the average enrollments

In [16]:
import pandas as pd

df = pd.read_csv("~/econ470/a0/output/ma_plan_county_year_2018.csv")
print(df.shape)


(1366487, 21)


In [17]:
df_filtered = df[
    (df["snp"] != "Y") &
    (df["eghp"] != "Y") &
    (~df["planid"].between(800, 899))
].copy()

print("Before exclusions:", df.shape[0])
print("After exclusions:", df_filtered.shape[0])


Before exclusions: 1366487
After exclusions: 103248


In [19]:
avg_enrollment_table = (
    df_filtered
    .groupby("plan_type", as_index=False)
    .agg(
        Average_Enrollment=("avg_enrollment", "mean")
    )
)
avg_enrollment_table["Average_Enrollment"] = (
    avg_enrollment_table["Average_Enrollment"].round(1)
)

avg_enrollment_table


Unnamed: 0,plan_type,Average_Enrollment
0,1876 Cost,251.6
1,HMO/HMOPOS,620.6
2,Local PPO,327.3
3,MSA,58.1
4,Medicare-Medicaid Plan HMO/HMOPOS,989.2
5,National PACE,144.3
6,PFFS,93.7
7,Regional PPO,182.4
