In [8]:
import pandas as pd
import geopandas as gpd
import numpy as np

### Preprocess Individual PCP (primary care providers) data

In [9]:
# Read the two 2021 provider extracts (Medicaid first, then commercial)
# from the local ./data directory into separate DataFrames.
pcp_medicaid_df, pcp_commercial_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/pcp_medicaid_2021.csv',
        './data/pcp_commercial_2021.csv',
    )
)

In [10]:
# Keep one row per provider within each source file, using the
# 'National Provider Identifier' column as the de-duplication key.
pcp_medicaid_df = pcp_medicaid_df.drop_duplicates(subset=['National Provider Identifier'])
pcp_commercial_df = pcp_commercial_df.drop_duplicates(subset=['National Provider Identifier'])

# Filter providers within the following 32 counties in NY State.
# .copy() detaches each filtered result from the frame it was sliced from, so
# the indicator columns added below are written to an independent DataFrame
# instead of triggering pandas' SettingWithCopyWarning.
counties_to_keep = ['Albany', 'Cayuga', 'Chautauqua', 'Cortland', 'Erie', 'Genesee', 'Greene', 'Hamilton', 'Lewis', 'Montgomery', 'New York', 'Kings', 'Bronx', 'Queens', 'Richmond', 'Oneida', 'Onondaga', 'Ontario', 'Orange', 'Oswego', 'Putnam', 'Rensselaer', 'Rockland', 'Schuyler', 'St Lawrence', 'Steuben', 'Sullivan', 'Tioga', 'Tompkins', 'Ulster', 'Warren', 'Westchester']
pcp_medicaid_df = pcp_medicaid_df[pcp_medicaid_df['County'].isin(counties_to_keep)].copy()
pcp_commercial_df = pcp_commercial_df[pcp_commercial_df['County'].isin(counties_to_keep)].copy()

# Tag every row with its source: add both 'Medicaid Indicator' and
# 'Commercial Indicator' columns (1 for the file the row came from, 0 otherwise).
pcp_medicaid_df['Medicaid Indicator'] = 1
pcp_medicaid_df['Commercial Indicator'] = 0
pcp_commercial_df['Medicaid Indicator'] = 0
pcp_commercial_df['Commercial Indicator'] = 1
print('pcp_medicaid: ', pcp_medicaid_df.shape)
print('pcp_commercial: ', pcp_commercial_df.shape)

# Combine the two DataFrames by appending rows
pcp_df = pd.concat([pcp_medicaid_df, pcp_commercial_df], ignore_index=True)
print('pcp: ', pcp_df.shape)

# Collapse rows that share the same values in all of the following columns,
# summing both indicator columns so each surviving row records how many
# Medicaid and commercial source rows it represents.
provider_keys = ['First Name', 'Last Name', 'Street Address', 'County', 'Latitude', 'Longitude']
pcp_df = (
    pcp_df
    # groupby() discards rows with a missing key by default; dropping them
    # explicitly first gives the same result without hiding the row loss.
    .dropna(subset=provider_keys)
    .groupby(provider_keys)
    .agg({'Medicaid Indicator': 'sum', 'Commercial Indicator': 'sum'})
    .reset_index()
)
print('pcp_culled: ', pcp_df.shape)

pcp_medicaid:  (16502, 16)
pcp_commercial:  (37456, 16)
pcp:  (53958, 16)
pcp_culled:  (45694, 8)


In [12]:
# Write the culled individual-provider table to a CSV in the current working
# directory; index=False leaves the DataFrame index out of the file.
pcp_df.to_csv('pcp_individual_2021.csv', index=False)

### Prepare Facility-level PCP data from the Individual PCP data

In [13]:
# Roll individual providers up to one row per distinct
# ('Latitude', 'Longitude', 'County') combination, summing the Medicaid and
# Commercial indicators separately into 'pcp_medicaid' and 'pcp_commercial'.
facility_keys = ['Latitude', 'Longitude', 'County']
indicator_totals = {
    'pcp_medicaid': ('Medicaid Indicator', 'sum'),
    'pcp_commercial': ('Commercial Indicator', 'sum'),
}
providers_by_location = pcp_df.groupby(facility_keys)
pcp_facilities_df = providers_by_location.agg(**indicator_totals).reset_index()

# Show the table dimensions and a random 20-row preview
# (the sample is unseeded, so the rows shown differ between runs).
print(pcp_facilities_df.shape)
print(pcp_facilities_df.sample(20))

(7495, 5)
       Latitude  Longitude       County  pcp_medicaid  pcp_commercial
5337  41.153336 -73.826162  Westchester             0               5
5559  41.377205 -74.692792       Orange             1               0
5425  41.213844 -73.986108     Rockland             1               2
4107  40.813776 -73.944498     New York             0               1
1611  40.682767 -73.966560        Kings             8               5
2977  40.747293 -73.986209     New York             1               1
4249  40.830002 -73.910667        Bronx             3               4
5808  41.802662 -74.747406     Sullivan             0               1
1221  40.645678 -73.958248        Kings             4               7
521   40.608373 -74.004194        Kings             2               2
1948  40.703194 -73.814920       Queens             0               1
1448  40.669877 -73.908512        Kings             0               1
1848  40.697536 -73.993285        Kings             5               1
2010  40.7

In [14]:
# Write the per-location facility table to a CSV in the current working
# directory; index=False leaves the DataFrame index out of the file.
pcp_facilities_df.to_csv('pcp_facilities_2021.csv', index=False)