In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

### Preprocess Individual PCP (primary care providers) data

In [44]:
# Load the 2021 PCP provider listings (Medicaid first, then commercial) from ./data/.
pcp_medicaid_df, pcp_commercial_df = map(
    pd.read_csv,
    ('./data/pcp_medicaid_2021.csv', './data/pcp_commercial_2021.csv'),
)

In [52]:
# Cull duplicate providers within each dataset, based on the unique combination of
# values in 'First Name', 'Last Name' and 'Street Address'.
pcp_medicaid_df = pcp_medicaid_df.drop_duplicates(subset=['First Name', 'Last Name', 'Street Address'])
pcp_commercial_df = pcp_commercial_df.drop_duplicates(subset=['First Name', 'Last Name', 'Street Address'])

# Keep only providers located in the following 32 counties in NY State.
# NOTE(review): isin() is an exact string match, so these spellings must match the
# 'County' column exactly (e.g. 'St Lawrence' without a period) -- TODO confirm against the data.
counties_to_keep = ['Albany', 'Cayuga', 'Chautauqua', 'Cortland', 'Erie', 'Genesee', 'Greene', 'Hamilton', 'Lewis', 'Montgomery', 'New York', 'Kings', 'Bronx', 'Queens', 'Richmond', 'Oneida', 'Onondaga', 'Ontario', 'Orange', 'Oswego', 'Putnam', 'Rensselaer', 'Rockland', 'Schuyler', 'St Lawrence', 'Steuben', 'Sullivan', 'Tioga', 'Tompkins', 'Ulster', 'Warren', 'Westchester']
# .copy() makes each filtered frame an independent object, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
pcp_medicaid_df = pcp_medicaid_df[pcp_medicaid_df['County'].isin(counties_to_keep)].copy()
pcp_commercial_df = pcp_commercial_df[pcp_commercial_df['County'].isin(counties_to_keep)].copy()

# Add 'Medicaid Indicator' and 'Commercial Indicator' flag columns (1 = row comes from
# that dataset, 0 = it does not) so the source survives the concatenation below.
pcp_medicaid_df['Medicaid Indicator'] = 1
pcp_medicaid_df['Commercial Indicator'] = 0
pcp_commercial_df['Medicaid Indicator'] = 0
pcp_commercial_df['Commercial Indicator'] = 1
print('pcp_medicaid: ', pcp_medicaid_df.shape)
print('pcp_commercial: ', pcp_commercial_df.shape)

# Combine the two DataFrames by appending rows
pcp_df = pd.concat([pcp_medicaid_df, pcp_commercial_df], ignore_index=True)
print('pcp: ', pcp_df.shape)

# Collapse providers that appear in both datasets into a single row, summing the two
# indicator columns. The grouping keys include the de-duplication keys used above, so
# each group holds at most one row per source dataset and each summed indicator is 0 or 1.
# Only the grouping keys and the two indicators are kept; all other columns are discarded.
# groupby() leaves out rows that have NaN in any grouping key (its default behaviour), so
# providers missing e.g. 'Latitude' or 'Longitude' are dropped here.
pcp_df = pcp_df.groupby(['First Name', 'Last Name', 'Street Address', 'County', 'Latitude', 'Longitude']).agg({'Medicaid Indicator': 'sum', 'Commercial Indicator': 'sum'}).reset_index()
# Safeguard: remove any row that still contains a missing value.
pcp_df = pcp_df.dropna()
print('pcp_culled: ', pcp_df.shape)
print(pcp_df.sample(10))

pcp_medicaid:  (42991, 15)
pcp_commercial:  (142005, 15)
pcp:  (184996, 15)
pcp_culled:  (146955, 8)
       First Name  Last Name                Street Address       County  \
131847      STEVE       PAIK                 55 PALMER AVE  Westchester   
121949      SAMIN     SHARMA           1 GUSTAVE L LEVY PL     New York   
70093       KAREN       TEOH  2111 FREDERICK DOUGLASS BLVD     New York   
51590    HUI HING        TIN                  6317 4TH AVE        Kings   
66252      JOSEPH    MAILMAN        525 E 68TH ST MICU-5 S     New York   
46686         GIL    FARKASH            1185 SWEET HOME RD         Erie   
136089      TANYA   SHNEYDER               3414 CHURCH AVE        Kings   
21263     CASSIDY       DAHN                   150 55TH ST        Kings   
13236      ARTHUR      TOLIS                 61 EMERALD PL     Sullivan   
100187       NANA  MAKALATIA              4781/83 BROADWAY     New York   

         Latitude  Longitude  Medicaid Indicator  Commercial Indicator  


In [53]:
# Write the individual-provider table to the working directory, without the row index.
pcp_df.to_csv(path_or_buf='pcp_individual_2021.csv', index=False)

### Prepare facility-level PCP data from the individual PCP data

In [55]:
# Build one row per unique ('Latitude', 'Longitude', 'County') combination and total the
# 'Medicaid Indicator' and 'Commercial Indicator' columns separately for each of them.
pcp_facilities_df = (
    pcp_df
    .groupby(['Latitude', 'Longitude', 'County'])[['Medicaid Indicator', 'Commercial Indicator']]
    .sum()
    .rename(columns={
        'Medicaid Indicator': 'pcp_medicaid',
        'Commercial Indicator': 'pcp_commercial',
    })
    .reset_index()
)

# Show the table size and a random sample of 20 rows.
print(pcp_facilities_df.shape)
print(pcp_facilities_df.sample(20))

(13181, 5)
        Latitude  Longitude    County  pcp_medicaid  pcp_commercial
12122  42.962230 -78.711617      Erie             4              11
12902  43.251203 -75.177848    Oneida             1               2
3364   40.702676 -73.990859     Kings             1               0
1645   40.634008 -73.989349     Kings             0               1
1844   40.638339 -74.007862     Kings             3               5
736    40.601766 -74.089958  Richmond             0               1
12967  43.322751 -73.640727    Warren             0               2
3061   40.692800 -73.987974     Kings             0               1
10936  42.472994 -76.451296  Tompkins             1               1
4392   40.728964 -73.951214     Kings             1               1
12856  43.210724 -75.457852    Oneida             2              22
1324   40.622539 -74.016082     Kings             0               1
11989  42.939403 -78.872547      Erie             0               1
10488  41.926159 -73.994088    Ulster

In [56]:
# Write the facility-level table to the working directory, without the row index.
pcp_facilities_df.to_csv(path_or_buf='pcp_facilities_2021.csv', index=False)