# Medicare Part D Cleanup
## Authors: 
1. Lam Ho
2. Jonah Breslow
3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to do some preliminary cleanup/filtering of the [Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug/data/2019) data and to create the node and edge data for Physicians & Prescriptions.

### Importing modules

In [1]:
from dask import dataframe as dd
import re
import pickle

### Importing the Medicare Part D Prescriptions Data

In [2]:
# Columns to load from the raw Part D file; everything else is skipped at read time.
cols = [
    # prescriber fields
    'Prscrbr_NPI',
    'Prscrbr_Last_Org_Name',
    'Prscrbr_First_Name',
    'Prscrbr_City',
    'Prscrbr_State_Abrvtn',
    'Prscrbr_Type',
    # drug names
    'Brnd_Name',
    'Gnrc_Name',
    # Tot_* measures
    'Tot_Clms',
    'Tot_30day_Fills',
    'Tot_Day_Suply',
    'Tot_Drug_Cst',
    'Tot_Benes',
    # GE65_Tot_* measures
    'GE65_Tot_Clms',
    'GE65_Tot_30day_Fills',
    'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply',
    'GE65_Tot_Benes',
]

In [3]:
# Build the dask dataframe over the 2019 Part D extract, restricted to `cols`.
part_d = dd.read_csv(
    'Data/Medicare_Part_D_Prescribers_by_Provider_and_Drug_2019.csv',
    usecols=cols,
)

### Make the year a part of the data

In [4]:
# Tag every row with the reporting year (stored as the string '2019').
part_d = part_d.assign(Year='2019')

## Pre-Processing Text Columns

In [5]:
def remove_punc(string):
    '''
    Return `string` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace deleted.
    '''
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, '', string)

def preProcess(df):
    '''
    Strip punctuation from, and upper-case, every string cell of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape. Cells that are not `str` instances
        are passed through unchanged.
    '''
    def clean_cell(value):
        # Only real strings are cleaned; everything else is left as-is.
        return remove_punc(value).upper() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of the
    # element-wise DataFrame.map. Look the method up on the class so that a
    # column that happens to be called "map" is never mistaken for it, and
    # fall back to applymap on older pandas versions.
    if hasattr(type(df), 'map'):
        return df.map(clean_cell)
    return df.applymap(clean_cell)

In [6]:
# Clean the text of every partition, then persist the result so the later
# cells reuse it instead of recomputing it.
part_d = part_d.map_partitions(preProcess).persist()

### Get prescriber_info (Physician Node data)

In [7]:
%%time
# Physician node table: the prescriber columns, reduced to a single row per Prscrbr_NPI.
prescr = (
    part_d[['Prscrbr_NPI', 'Prscrbr_Last_Org_Name', 'Prscrbr_First_Name','Prscrbr_City','Prscrbr_Type']]
    .drop_duplicates(subset=['Prscrbr_NPI'])
)

Wall time: 93.5 ms


#### Checking for multiple distinct physicians for the same Prscrbr_NPI

In [8]:
# Count the distinct attribute combinations recorded for each NPI. This has to
# start from part_d: prescr was already de-duplicated on Prscrbr_NPI, so it
# holds exactly one row per NPI and a check against it could never report anything.
prescriber_variants = part_d[['Prscrbr_NPI', 'Prscrbr_Last_Org_Name', 'Prscrbr_First_Name',
                              'Prscrbr_City', 'Prscrbr_Type']].drop_duplicates()
# size() counts rows, so combinations that contain missing values are counted too
# (count() would silently skip them).
variant_counts = prescriber_variants.groupby('Prscrbr_NPI').size().compute()
# Keep the result a DataFrame indexed by NPI, one row per NPI with conflicting data.
prescrbr_node_dupes = variant_counts[variant_counts > 1].to_frame('n_variants')
print("There are "+str(prescrbr_node_dupes.shape[0])+ " Physicians with duplicated node data")

There are 0 Physicians with duplicated node data


In [9]:
# Write the physician node table as one CSV file, then drop the reference.
prescr.to_csv(
    'Data/Outputs_Cleanup/Part_d/prescriber_information.csv',
    single_file=True,
    index=False,
)
del prescr

### Get Drug information (Drug Node data)

In [10]:
# One row per distinct (brand, generic) pair.
# The index inherited from read_csv restarts at 0 in every partition, so
# exposing it directly with reset_index() can hand the same MedD_drug_id to
# different drugs. Collapse to a single partition and renumber first so the
# ids are unique (0..n-1), then expose that fresh index as the id column.
drugs = (
    part_d[['Brnd_Name','Gnrc_Name']]
    .drop_duplicates()
    .repartition(npartitions=1)
    .reset_index(drop=True)
    .reset_index()
)
drugs.columns = ['MedD_drug_id','Brnd_Name','Gnrc_Name']
# Materialise once so the CSV below and any later join reuse one id assignment.
drugs = drugs.persist()

# The drug node file carries only the id and the brand name.
drugs_node_out = drugs[['MedD_drug_id', 'Brnd_Name']].rename(columns={'Brnd_Name': 'brand_name'})
drugs_node_out.to_csv('Data/Outputs_Cleanup/Part_d/drug_information.csv',single_file=True,index=False)

['C:/Users/10294029/Documents/GitHub/DSE-203-Knowledge-Graph/Data/Outputs_Cleanup/Part_d/drug_information.csv']

### Get Prescription information (Prescription Node data)

In [11]:
%%time
# Prescription table: the measure columns keyed by prescriber NPI and
# MedD_drug_id. The drug names are swapped for the id via a left join on
# (Brnd_Name, Gnrc_Name) and then dropped.
prescriptions = (
    part_d[['Prscrbr_NPI','Brnd_Name','Gnrc_Name','Tot_Clms',
            'Tot_30day_Fills','Tot_Day_Suply','Tot_Drug_Cst',
            'Tot_Benes','GE65_Tot_Clms','GE65_Tot_30day_Fills',
            'GE65_Tot_Drug_Cst','GE65_Tot_Day_Suply','GE65_Tot_Benes','Year']]
    .merge(drugs, how='left', on=['Brnd_Name','Gnrc_Name'])
    .drop(columns=['Brnd_Name','Gnrc_Name'])
)

# Compute before opening the output so a failed computation leaves no empty file.
prescriptions_df = prescriptions.compute()
# The with-block guarantees the file handle is flushed and closed.
with open("Data/Outputs_Cleanup/Part_d/prescription_information.p", "wb") as out_file:
    pickle.dump(prescriptions_df, out_file)

Wall time: 7.59 s
