# Sunshine Act Cleanup
## Authors: 
    1. Lam Ho
    2. Jonah Breslow
    3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to do some preliminary cleanup/filtering of the [Centers for Medicare & Medicaid Services](https://openpaymentsdata.cms.gov) Open Payments data. Primarily, it filters the data down to California to limit the amount of data we need to load into the Neo4j graph.

### Importing modules

In [12]:
from dask import dataframe as dd

In [13]:
# Columns to keep when reading the payments CSV (passed to `usecols` in the load cell).
# The list covers recipient (hospital / physician) fields, paying-manufacturer fields,
# payment fields, and the five numbered drug/device product slots.
cols = ['Covered_Recipient_Type','Teaching_Hospital_CCN','Teaching_Hospital_ID','Teaching_Hospital_Name',
       'Physician_Profile_ID','Physician_First_Name','Physician_Middle_Name','Physician_Last_Name','Physician_Name_Suffix',
       'Recipient_State','Physician_Primary_Type','Physician_Specialty','Physician_License_State_code1','Physician_License_State_code2',
       'Physician_License_State_code3','Physician_License_State_code4','Physician_License_State_code5',
       'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name','Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
       'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
       'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State',
       'Total_Amount_of_Payment_USDollars','Date_of_Payment','Number_of_Payments_Included_in_Total_Amount',
       'Form_of_Payment_or_Transfer_of_Value','Nature_of_Payment_or_Transfer_of_Value','City_of_Travel',
       'State_of_Travel', 'Country_of_Travel','Physician_Ownership_Indicator','Third_Party_Payment_Recipient_Indicator',
       'Name_of_Third_Party_Entity_Receiving_Payment_or_Transfer_of_Value','Charity_Indicator','Third_Party_Equals_Covered_Recipient_Indicator',
       'Record_ID','Related_Product_Indicator','Covered_or_Noncovered_Indicator_1',
       'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1',
       'Product_Category_or_Therapeutic_Area_1','Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1',
       'Associated_Drug_or_Biological_NDC_1','Covered_or_Noncovered_Indicator_2',
       'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2',
       'Product_Category_or_Therapeutic_Area_2',
       'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2',
       'Associated_Drug_or_Biological_NDC_2',
       'Covered_or_Noncovered_Indicator_3',
       'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3',
       'Product_Category_or_Therapeutic_Area_3',
       'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3',
       'Associated_Drug_or_Biological_NDC_3',
       'Covered_or_Noncovered_Indicator_4',
       'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4',
       'Product_Category_or_Therapeutic_Area_4',
       'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4',
       'Associated_Drug_or_Biological_NDC_4',
       'Covered_or_Noncovered_Indicator_5',
       'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5',
       'Product_Category_or_Therapeutic_Area_5',
       'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5',
       'Associated_Drug_or_Biological_NDC_5','Recipient_Zip_Code']

In [14]:
# Read the payments CSV with dask, restricted to `cols`, parsing every column as a string
sunshine = dd.read_csv(
    '../Data/OP_DTL_GNRL_PGYR2019_P06302021.csv',
    usecols=cols,
    dtype=str,
)

In [15]:
# Dealing with missing data for recipient state
def licensedCA(row):
    """Return True when a payment row is tied to California.

    A row counts as Californian when its ``Recipient_State`` or any of the five
    ``Physician_License_State_code1``..``5`` fields equals ``'CA'``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row[column_name]`` for the six columns listed below.

    Returns
    -------
    bool
        True if any of the six state fields is exactly ``'CA'``, else False.
    """
    state_columns = (
        'Recipient_State',
        'Physician_License_State_code1',
        'Physician_License_State_code2',
        'Physician_License_State_code3',
        'Physician_License_State_code4',
        'Physician_License_State_code5',
    )
    # Only real strings can match; missing values (NaN / None / NA) are skipped
    # explicitly so they never take part in the comparison.
    return any(
        isinstance(row[column], str) and row[column] == 'CA'
        for column in state_columns
    )

# licensedCA returns a boolean per row, so declare the result to dask as a bool
# column via a (name, dtype) tuple; the previous meta=str described a string output.
sunshine['Licensed_In_CA'] = sunshine.apply(licensedCA, axis=1, meta=('Licensed_In_CA', 'bool'))
        

### CA Filter
1. Filter only for California 
2. Persisting `cadf` in memory so we don't need to repeat this filtering step every time we want to create a new output file

In [16]:
%%time
# Keep only the rows flagged by Licensed_In_CA and persist the filtered frame
cadf = sunshine[sunshine['Licensed_In_CA'] == True].persist()
# The unfiltered frame is not referenced after this point
del sunshine

CPU times: user 3min 49s, sys: 6.56 s, total: 3min 55s
Wall time: 3min 26s


### Creating Hospital-Payment edge

In [17]:
%%time
# Edge list linking each payment record to the teaching hospital on that record
hosp = cadf[
    cadf['Covered_Recipient_Type'] == 'Covered Recipient Teaching Hospital'
][['Record_ID', 'Teaching_Hospital_ID']]
hosp.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/hospitals_payment.csv',
    index=False,
    single_file=True,
)

### Creating Hospital-State edge

In [38]:
%%time
# One (state, hospital) row per teaching hospital ID
hosp = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Teaching Hospital']
    [['Recipient_State', 'Teaching_Hospital_ID']]
    .drop_duplicates(subset=['Teaching_Hospital_ID'])
)

CPU times: user 7.21 ms, sys: 252 µs, total: 7.46 ms
Wall time: 7.56 ms


In [47]:
# Count distinct states per hospital straight from the filtered payment rows.
# `hosp` has already been reduced to one row per Teaching_Hospital_ID, so grouping
# `hosp` could never report more than one state and the check would always print 0.
hospital_state_dupes = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Teaching Hospital']
    .groupby('Teaching_Hospital_ID')['Recipient_State']
    .nunique()
    .compute()
)
hosp_state_dupes = hospital_state_dupes[hospital_state_dupes > 1]
print("There are "+str(hosp_state_dupes.shape[0])+ " hospitals with more than 1 associated state")

There are 0 hospitals with more than 1 associated state


In [48]:
# Write the hospital-state edge list as a single CSV file, without the index
hosp.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/hosp_states.csv',
    index=False,
    single_file=True,
)

['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/hosp_states.csv']

### Creating Hospital Node

In [49]:
%%time
# Hospitals' other information
# Hospital node attributes: one row per teaching hospital ID
hosp = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Teaching Hospital']
    [['Teaching_Hospital_CCN', 'Teaching_Hospital_ID', 'Teaching_Hospital_Name']]
    .drop_duplicates(subset=['Teaching_Hospital_ID'])
)

CPU times: user 7.53 ms, sys: 286 µs, total: 7.82 ms
Wall time: 7.9 ms


In [54]:
# Count distinct attribute rows per hospital from the filtered payment rows.
# `hosp` already holds one row per Teaching_Hospital_ID, so grouping `hosp` itself
# could never find a conflict; only its column list is reused here.
hospital_node_dupes = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Teaching Hospital']
    [list(hosp.columns)]
    .drop_duplicates()
    .groupby('Teaching_Hospital_ID')
    .size()
    .compute()
)
hosp_node_dupes = hospital_node_dupes[hospital_node_dupes > 1]
# The message previously repeated the state-check wording; this cell checks node data.
print("There are "+str(hosp_node_dupes.shape[0])+ " hospitals with duplicated node data")

There are 0 hospitals with more than 1 associated state


In [55]:
# Write the hospital node file as a single CSV, then drop the working frame
hosp.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/hosp_info.csv',
    index=False,
    single_file=True,
)
del hosp

### Creating Physician-Payment edge

In [56]:
%%time
# Physician ID and Payment ID
# Edge list linking each payment record to the physician profile on that record
physicians = cadf[
    cadf['Covered_Recipient_Type'] == 'Covered Recipient Physician'
][['Record_ID', 'Physician_Profile_ID']]
physicians.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/physicians_payments.csv',
    index=False,
    single_file=True,
)

CPU times: user 2.52 s, sys: 85.7 ms, total: 2.61 s
Wall time: 2.56 s


['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/physicians_payments.csv']

### Creating Physician-State edge

In [57]:
%%time
# Physicians and states
# One (state, physician) row per physician profile ID
physicians = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Physician']
    [['Recipient_State', 'Physician_Profile_ID']]
    .drop_duplicates(subset=['Physician_Profile_ID'])
)

CPU times: user 7.64 ms, sys: 310 µs, total: 7.95 ms
Wall time: 8.1 ms


In [62]:
# Count distinct recipient states per physician from the filtered payment rows.
# `physicians` was already reduced to one row per Physician_Profile_ID, so grouping
# it could never report more than one state and the check would always print 0.
physician_state_dupes = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Physician']
    .groupby('Physician_Profile_ID')['Recipient_State']
    .nunique()
)
physician_state_dupes = physician_state_dupes[physician_state_dupes > 1].compute()
print("There are "+str(physician_state_dupes.shape[0])+ " physcians with more than 1 associated state")

There are 0 physcians with more than 1 associated state


In [64]:
# Write the physician-state edge list as a single CSV file, without the index
physicians.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/physicians_state.csv',
    index=False,
    single_file=True,
)

['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/physicians_state.csv']

### Creating Physician Node

In [66]:
%%time
# Physicians other information
# Physician node attributes: one row per physician profile ID
physicians = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Physician']
    [[
        'Physician_Profile_ID',
        'Physician_First_Name',
        'Physician_Middle_Name',
        'Physician_Last_Name',
        'Physician_Name_Suffix',
        'Recipient_State',
        'Physician_Primary_Type',
        'Physician_Specialty',
        'Physician_License_State_code1',
        'Physician_License_State_code2',
        'Physician_License_State_code3',
        'Physician_License_State_code4',
        'Physician_License_State_code5',
        'Recipient_Zip_Code',
    ]]
    .drop_duplicates(subset=['Physician_Profile_ID'])
)

CPU times: user 9.71 ms, sys: 486 µs, total: 10.2 ms
Wall time: 10.4 ms


In [70]:
# Count distinct attribute rows per physician from the filtered payment rows.
# `physicians` already holds one row per Physician_Profile_ID, so grouping it
# directly could never find a conflict; only its column list is reused here.
physician_node_dupes = (
    cadf[cadf['Covered_Recipient_Type'] == 'Covered Recipient Physician']
    [list(physicians.columns)]
    .drop_duplicates()
    .groupby('Physician_Profile_ID')
    .size()
)
physician_node_dupes = physician_node_dupes[physician_node_dupes > 1].compute()
print("There are "+str(physician_node_dupes.shape[0])+ " physcians with duplicated node data")

There are 0 physcians with duplicated node data


In [65]:
# Write the physician node file as a single CSV, then drop the working frame
physicians.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/physicians_info.csv',
    index=False,
    single_file=True,
)
del physicians

CPU times: user 2.08 s, sys: 58 ms, total: 2.13 s
Wall time: 1.99 s


### Creating Payment Node

In [73]:
%%time
# Payment information
# Payment node attributes, keyed by Record_ID
payments = cadf[[
    'Record_ID',
    'Total_Amount_of_Payment_USDollars',
    'Date_of_Payment',
    'Number_of_Payments_Included_in_Total_Amount',
    'Form_of_Payment_or_Transfer_of_Value',
    'Nature_of_Payment_or_Transfer_of_Value',
]]
payments.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/payments.csv',
    index=False,
    single_file=True,
)
del payments

CPU times: user 3.1 s, sys: 80.5 ms, total: 3.18 s
Wall time: 3.19 s


### Creating PharmCo-Payment Edge

In [74]:
%%time
# Pharmco and Record ID
# Edge list linking each payment record to the manufacturer/GPO that made it
pharmCo = cadf[[
    'Record_ID',
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
]]
pharmCo.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/pharmCo_payments.csv',
    index=False,
    single_file=True,
)

CPU times: user 1.67 s, sys: 41.1 ms, total: 1.71 s
Wall time: 1.7 s


['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/pharmCo_payments.csv']

### Creating PharmCo-State Edge

In [90]:
%%time
# Pharmco and state
# Distinct (manufacturer/GPO ID, state) pairs
pharmCo = cadf[[
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State',
]].drop_duplicates()

CPU times: user 5.67 ms, sys: 214 µs, total: 5.88 ms
Wall time: 5.91 ms


In [91]:
# Per manufacturer/GPO ID, count the non-null states among the distinct pairs
pharmco_state_dupes = (
    pharmCo
    .groupby('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID')
    .count()
)
# Keep only the IDs that appear with more than one state
pharmco_state_dupes = pharmco_state_dupes[
    pharmco_state_dupes['Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State'] > 1
].compute()
print("There are "+str(pharmco_state_dupes.shape[0])+ " PharmCo's with multiple states")

There are 0 PharmCo's with multiple states


In [93]:
# Write the manufacturer/GPO-state edge list as a single CSV file, without the index
pharmCo.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/pharmCo_states.csv',
    index=False,
    single_file=True,
)

['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/pharmCo_states.csv']

### Creating PharmCo Node

In [94]:
%%time
# Pharmco other information
# Manufacturer/GPO node attributes.
# Exact duplicate rows are dropped so the node file holds each distinct attribute
# combination once instead of one row per payment, in line with the de-duplication
# applied to the hospital and physician node frames.
pharmCo = cadf[[
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
    'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name',
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
    'Related_Product_Indicator',
]].drop_duplicates()

CPU times: user 2.24 ms, sys: 103 µs, total: 2.34 ms
Wall time: 2.5 ms


In [95]:
# Count distinct attribute rows per manufacturer/GPO ID. Exact duplicate rows are
# removed first so repeated payments from one company are not mistaken for
# conflicting node data.
pharmco_node_dupes = (
    pharmCo
    .drop_duplicates()
    .groupby('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID')
    .count()
)
pharmco_node_dupes = pharmco_node_dupes[
    pharmco_node_dupes['Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name'] > 1
].compute()
# Report the node-level result; the earlier version printed pharmco_state_dupes here.
print("There are "+str(pharmco_node_dupes.shape[0])+ " PharmCo's with duplicate node data")

There are 0 PharmCo's with duplicate node data


In [96]:
# Write the manufacturer/GPO node file as a single CSV file, without the index
pharmCo.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/pharmCo_info.csv',
    index=False,
    single_file=True,
)

['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/pharmCo_info.csv']

In [97]:
%%time
# Drug / device product details per payment record.
# Columns are Record_ID followed by the same five fields for each product slot 1-5,
# in slot order.
pharmCo = cadf[['Record_ID'] + [
    f'{field}_{slot}'
    for slot in range(1, 6)
    for field in (
        'Covered_or_Noncovered_Indicator',
        'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply',
        'Product_Category_or_Therapeutic_Area',
        'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply',
        'Associated_Drug_or_Biological_NDC',
    )
]]
pharmCo.to_csv(
    '../Data/Outputs_Cleanup/Sunshine/record_drugs.csv',
    index=False,
    single_file=True,
)

CPU times: user 4.55 s, sys: 128 ms, total: 4.68 s
Wall time: 4.71 s


['/Users/jonahbreslow/Documents/DSE/2020-jfbreslow/DSE203/DSE-203-Knowledge-Graph/Data Cleanup/../Data/Outputs_Cleanup/Sunshine/record_drugs.csv']