In [2]:
import geopandas as gpd
import pandas as pd

In [11]:
# helper functions for county aggregation

def modify_year(year, tmp_df):
    """Return each ZIP's earliest crosswalk row, relabelled to ``year``.

    For every distinct value of ``zip`` in ``tmp_df`` the row with the
    smallest ``year`` is selected; the ``year`` column of those rows is then
    replaced by the ``year`` argument. This lets a crosswalk be reused for
    years it does not cover.

    Parameters
    ----------
    year : scalar
        Value written into the ``year`` column of the returned frame.
    tmp_df : pandas.DataFrame
        Crosswalk table; must contain ``zip`` and ``year`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ZIP with the same columns as ``tmp_df``. ``tmp_df``
        itself is not modified.
    """
    # Find the oldest county match for each zip
    earliest_per_zip = tmp_df.sort_values('year').groupby('zip').head(1)
    # `head` hands back a slice of the sorted frame; writing a column into it
    # directly triggers SettingWithCopyWarning. `assign` builds a new frame.
    return earliest_per_zip.assign(year=year)

def county_aggregate(df, xwalk_path, backfill_years=range(2000, 2010)):
    """Aggregate ZIP-level rows to county/year means using a crosswalk CSV.

    The crosswalk is read from ``xwalk_path``, its ``zip`` and ``county``
    codes are left-padded with zeros to 5 characters, and it is extended with
    one extra copy per year in ``backfill_years`` (each copy holds the
    earliest crosswalk row of every ZIP, relabelled to that year, as produced
    by ``modify_year``). ``df`` is then inner-joined to the extended crosswalk
    on ``zip`` and ``year``; rows without a match are dropped. The columns
    ``zip``, ``lat``, ``lon`` and ``tot_ratio`` are removed and the remaining
    columns are averaged per (``county``, ``year``).

    Progress counts are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        ZIP-level data with at least ``zip`` and ``year`` columns. Together
        with the crosswalk it must supply ``lat``, ``lon`` and ``tot_ratio``,
        otherwise the column drop raises ``KeyError``. The frame is not
        modified.
    xwalk_path : str or path-like
        CSV file with ``zip``, ``county`` and ``year`` columns.
    backfill_years : iterable, optional
        Years for which the earliest crosswalk entry of each ZIP is
        replicated. Defaults to 2000 through 2009.

    Returns
    -------
    pandas.DataFrame
        One row per (``county``, ``year``) with ``county`` as a zero-padded
        string column and the mean of every remaining column.
    """
    zip_county_xwalk = pd.read_csv(xwalk_path,
                                   dtype={"zip": str, "county": str})
    zip_county_xwalk["zip"] = zip_county_xwalk["zip"].astype(str).str.zfill(5)
    zip_county_xwalk["county"] = zip_county_xwalk["county"].astype(str).str.zfill(5)

    # Normalise ZIP codes on a copy: writing into `df` would silently change
    # the caller's frame and make re-running the calling cell non-idempotent.
    zip_data = df.assign(zip=df["zip"].astype(str).str.zfill(5))

    xwalk_lst = [modify_year(year, zip_county_xwalk) for year in backfill_years]
    # expanded xwalk df works with year+county matches
    final_xwalk_df = pd.concat([zip_county_xwalk] + xwalk_lst,
                               ignore_index=True)

    # merging with full dataset
    print('original number entries [ZIP code dataset]: ' + str(len(zip_data)))
    print('number of unique zips [zip dataset]: ' + str(len(zip_data["zip"].unique())))
    # default inner join: ZIP/year pairs absent from the crosswalk are dropped
    df_mg = zip_data.merge(final_xwalk_df,
                           on=["zip", "year"])
    print('number of unique zips [county dataset]: ' + str(len(df_mg["zip"].unique())))
    df_filtered = df_mg.drop(columns=['zip',
                                      'lat',
                                      'lon',
                                      'tot_ratio'])  # don't need these cols
    # Group by 'county' and 'year' and take the mean of the remaining columns
    df_aggregated = df_filtered.groupby(['county', 'year'], as_index=False).mean()
    df_aggregated["county"] = df_aggregated["county"].astype(str).str.zfill(5)
    print('final number of entries [county dataset]: '+ str(len(df_aggregated)))
    print('number of unique counties: ' + str(len(df_aggregated["county"].unique())))

    return df_aggregated

In [15]:
# Load the ZIP-level table and point at the ZIP-to-county crosswalk CSV.

zip_df = pd.read_parquet("../data_collections/sim_medicare/data.parquet")
xwalk_path = "../data_collections/sim_medicare_county/zip2county_master_xwalk_2010_2023_tot_ratio_one2one.csv"

# Aggregate to county level, then move the county code out of the columns
# and into the index.
county_df = county_aggregate(zip_df, xwalk_path).set_index("county")

# Persist the county-level dataset.
county_df.to_parquet("../data_collections/sim_medicare_county/data.parquet")

original number entries [ZIP code dataset]: 485126
number of unique zips [zip dataset]: 28691
number of unique zips [county dataset]: 28658
final number of entries [county dataset]: 41580
number of unique counties: 2470
