# Medicare Part D Cleanup
## Authors: 
    1. Lam Ho
    2. Jonah Breslow
    3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to do some preliminary cleanup/filtering of the [Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug/data/2019) data. Creating the nodes and edges for Physicians & Prescriptions

### Importing modules

In [1]:
from dask import dataframe as dd
import re

In [2]:
cols = ['Prscrbr_NPI','Prscrbr_Last_Org_Name','Prscrbr_First_Name','Prscrbr_City',
        'Prscrbr_State_Abrvtn','Prscrbr_Type','Brnd_Name','Gnrc_Name',
        'Tot_Clms', 'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst',
        'Tot_Benes', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills',
        'GE65_Tot_Drug_Cst', 'GE65_Tot_Day_Suply','GE65_Tot_Benes']

In [3]:
part_d = dd.read_csv('../Data/Medicare_Part_D_Prescribers_by_Provider_and_Drug_2019.csv',usecols=cols)

In [4]:
# part_d.head()

In [5]:
part_d.columns

Index(['Prscrbr_NPI', 'Prscrbr_Last_Org_Name', 'Prscrbr_First_Name',
       'Prscrbr_City', 'Prscrbr_State_Abrvtn', 'Prscrbr_Type', 'Brnd_Name',
       'Gnrc_Name', 'Tot_Clms', 'Tot_30day_Fills', 'Tot_Day_Suply',
       'Tot_Drug_Cst', 'Tot_Benes', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills',
       'GE65_Tot_Drug_Cst', 'GE65_Tot_Day_Suply', 'GE65_Tot_Benes'],
      dtype='object')

### Make the year a part of the data

In [6]:
part_d['Year']='2019'

## Pre-Processing Text Columns

In [7]:
def remove_punc(string):
    '''
    takes in a string and removed all punctuation.
    '''
    return re.sub(r'[^\w\s]','',string)

def preProcess(df):
    '''
    takes a dataframe, applies the remove_punc function
    and then upper cases all text columns
    '''
    df = df.applymap(lambda s:remove_punc(s).upper() if isinstance(s, str) else s)
    return df

In [8]:
part_d = part_d.map_partitions(preProcess)
part_d = part_d.persist()

### Get prescriber_info (Physician Node)

In [9]:
%%time
# Get prescriber_info
prescr = part_d[['Prscrbr_NPI', 'Prscrbr_Last_Org_Name', 'Prscrbr_First_Name','Prscrbr_City','Prscrbr_Type']]
prescr = prescr.drop_duplicates(subset=['Prscrbr_NPI'])

CPU times: user 11.6 ms, sys: 993 µs, total: 12.6 ms
Wall time: 16.2 ms


#### Checking for multiple distinct physicians for the same Prescrbr_NPI

In [10]:
prescrbr_node_dupes = prescr.groupby('Prscrbr_NPI').count()
prescrbr_node_dupes = prescrbr_node_dupes[prescrbr_node_dupes.Prscrbr_Last_Org_Name>1].compute()
print("There are "+str(prescrbr_node_dupes.shape[0])+ " Physicians with duplicated node data")

There are 0 Physicians with duplicated node data


In [11]:
prescr.to_csv('../Data/Outputs_Cleanup/Part_d/prescriber_information.csv',single_file=True,index=False)
del prescr

### Get Drug information (Drug Node)

In [12]:
drugs = part_d[['Brnd_Name','Gnrc_Name']].drop_duplicates()
drugs = drugs.reset_index()
drugs.columns = ['MedD_drug_id','Brnd_Name','Gnrc_Name']
drugs_node_out = drugs.copy()[['MedD_drug_id', 'Brnd_Name']]
drugs_node_out.columns = ['MedD_drug_id', 'brand_name']
#drugs['MedD_drug_id'] = "MedD"+drugs.MedD_drug_id.astype(str)
drugs_node_out.to_csv('../Data/Outputs_Cleanup/Part_d/drug_information.csv',single_file=True,index=False)

['/Users/jeff_1/Documents/DSE/DSE203/DSE-203-Knowledge-Graph/Data/Outputs_Cleanup/Part_d/drug_information.csv']

part_dGet Prescription information (Prescription Node)

In [13]:
%%time
prescriptions = (part_d[['Prscrbr_NPI','Brnd_Name','Gnrc_Name','Tot_Clms',
               'Tot_30day_Fills','Tot_Day_Suply','Tot_Drug_Cst',
               'Tot_Benes','GE65_Tot_Clms','GE65_Tot_30day_Fills',
               'GE65_Tot_Drug_Cst','GE65_Tot_Day_Suply','GE65_Tot_Benes','Year']]
                 .merge(drugs,'left',['Brnd_Name','Gnrc_Name'])
                 .drop(columns = ['Brnd_Name','Gnrc_Name'])
                )

prescriptions.to_csv('../Data/Outputs_Cleanup/Part_d/prescription_information.csv',single_file=True,index=False)

CPU times: user 20.9 s, sys: 1.21 s, total: 22.1 s
Wall time: 21.8 s


['/Users/jeff_1/Documents/DSE/DSE203/DSE-203-Knowledge-Graph/Data/Outputs_Cleanup/Part_d/prescription_information.csv']

In [14]:
#prescriptions.compute()
part_d.compute()

Unnamed: 0,Prscrbr_NPI,Prscrbr_Last_Org_Name,Prscrbr_First_Name,Prscrbr_City,Prscrbr_State_Abrvtn,Prscrbr_Type,Brnd_Name,Gnrc_Name,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Tot_Benes,GE65_Tot_Clms,GE65_Tot_30day_Fills,GE65_Tot_Drug_Cst,GE65_Tot_Day_Suply,GE65_Tot_Benes,Year
0,1003000837,JOHN,RICHARD,SANTA MARIA,CA,NURSE PRACTITIONER,ESCITALOPRAM OXALATE,ESCITALOPRAM OXALATE,54,58.0,544,407.28,,,,,,,2019
1,1003001363,STEVENS,CHARLES,EL CENTRO,CA,PAIN MANAGEMENT,ACETAMINOPHENCODEINE,ACETAMINOPHEN WITH CODEINE,31,31.0,879,629.18,13.0,,,,,,2019
2,1003001363,STEVENS,CHARLES,EL CENTRO,CA,PAIN MANAGEMENT,AMITIZA,LUBIPROSTONE,45,45.0,1350,17152.97,21.0,20.0,20.0,7683.43,600.0,,2019
3,1003001363,STEVENS,CHARLES,EL CENTRO,CA,PAIN MANAGEMENT,AMITRIPTYLINE HCL,AMITRIPTYLINE HCL,95,97.0,2883,1976.63,35.0,22.0,24.0,308.41,720.0,12.0,2019
4,1003001363,STEVENS,CHARLES,EL CENTRO,CA,PAIN MANAGEMENT,BACLOFEN,BACLOFEN,211,213.0,6390,4637.66,73.0,79.0,81.0,1642.86,2430.0,34.0,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165351,1992999551,MOLAI,INDIRA,ESCONDIDO,CA,INTERNAL MEDICINE,XARELTO,RIVAROXABAN,31,51.0,1450,22451.35,,,,,,,2019
165352,1992999551,MOLAI,INDIRA,ESCONDIDO,CA,INTERNAL MEDICINE,ZALEPLON,ZALEPLON,12,12.0,330,135.50,,12.0,12.0,135.50,330.0,,2019
165353,1992999551,MOLAI,INDIRA,ESCONDIDO,CA,INTERNAL MEDICINE,ZOLPIDEM TARTRATE,ZOLPIDEM TARTRATE,27,27.0,800,311.59,,27.0,27.0,311.59,800.0,,2019
165354,1992999569,YUEN,JENNY,SAN FRANCISCO,CA,OPTOMETRY,AZELASTINE HCL,AZELASTINE HCL,17,18.5,472,461.76,,17.0,18.5,461.76,472.0,,2019
