## Medicare Part D Data by Provider

Aggregate by Prscrbr_NPI

Keep prescriber NPI and info about name, location

aggregate other data into dictionaries


In [8]:
import requests
import pandas as pd
from collections import defaultdict
import time


### Test on 20K rows

In [7]:
def get_paginated_data(base_url, size=5000, max_records=5000):
    offset = 0
    all_data = []
    
    while offset < max_records:
        res = requests.get(f"{base_url}?size={size}&offset={offset}")
        data = res.json()
        
        if not data:
            break
        
        all_data.extend(data)
        offset += size

    return pd.DataFrame(all_data)

def aggregate_by_npi(data):
    # Dictionary to aggregate data per NPI
    aggregated_data = defaultdict(lambda: defaultdict(dict))
    
    for _, row in data.iterrows():
        npi = row['Prscrbr_NPI']
        
        # Add basic prescriber info (this remains unique per NPI)
        aggregated_data[npi]['Prscrbr_Last_Org_Name'] = row['Prscrbr_Last_Org_Name']
        aggregated_data[npi]['Prscrbr_First_Name'] = row['Prscrbr_First_Name']
        aggregated_data[npi]['Prscrbr_City'] = row['Prscrbr_City']
        aggregated_data[npi]['Prscrbr_State_Abrvtn'] = row['Prscrbr_State_Abrvtn']
        aggregated_data[npi]['Prscrbr_State_FIPS'] = row['Prscrbr_State_FIPS']
        aggregated_data[npi]['Prscrbr_Type'] = row['Prscrbr_Type']
        aggregated_data[npi]['Prscrbr_Type_Src'] = row['Prscrbr_Type_Src']
        
        # Aggregate prescription-related info as dictionaries with indexed keys
        index = len(aggregated_data[npi]['Brnd_Name']) + 1  # Start at 1, increment for each new entry
        
        aggregated_data[npi]['Brnd_Name'][index] = row['Brnd_Name']
        aggregated_data[npi]['Gnrc_Name'][index] = row['Gnrc_Name']
        aggregated_data[npi]['Tot_Clms'][index] = row['Tot_Clms']
        aggregated_data[npi]['Tot_30day_Fills'][index] = row['Tot_30day_Fills']
        aggregated_data[npi]['Tot_Day_Suply'][index] = row['Tot_Day_Suply']
        aggregated_data[npi]['Tot_Drug_Cst'][index] = row['Tot_Drug_Cst']
        aggregated_data[npi]['Tot_Benes'][index] = row['Tot_Benes']
        aggregated_data[npi]['GE65_Sprsn_Flag'][index] = row['GE65_Sprsn_Flag']
        aggregated_data[npi]['GE65_Tot_Clms'][index] = row['GE65_Tot_Clms']
        aggregated_data[npi]['GE65_Tot_30day_Fills'][index] = row['GE65_Tot_30day_Fills']
        aggregated_data[npi]['GE65_Tot_Drug_Cst'][index] = row['GE65_Tot_Drug_Cst']
        aggregated_data[npi]['GE65_Tot_Day_Suply'][index] = row['GE65_Tot_Day_Suply']
        aggregated_data[npi]['GE65_Bene_Sprsn_Flag'][index] = row['GE65_Bene_Sprsn_Flag']
        aggregated_data[npi]['GE65_Tot_Benes'][index] = row['GE65_Tot_Benes']
    
    # Convert the aggregated data to a DataFrame
    aggregated_df = pd.DataFrame.from_dict(aggregated_data, orient='index')
    return aggregated_df

In [9]:
# Base URL for the dataset (Medicare Part D Providers)
base_url = "https://data.cms.gov/data-api/v1/dataset/9552739e-3d05-4c1b-8eff-ecabf391e2e5/data"

# Measure the time taken
start_time = time.time()

# Step 1: Fetch the first 20,000 rows
data = get_paginated_data(base_url, size=5000, max_records=20000)

# Step 2: Aggregate data by NPI
aggregated_data = aggregate_by_npi(data)

# Step 3: Measure the end time
end_time = time.time()

# Step 4: Display the aggregated columns (prescription-related info)
aggregated_columns = [
    'Brnd_Name', 'Gnrc_Name', 'Tot_Clms', 'Tot_30day_Fills',
    'Tot_Day_Suply', 'Tot_Drug_Cst', 'Tot_Benes', 'GE65_Sprsn_Flag',
    'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply', 'GE65_Bene_Sprsn_Flag', 'GE65_Tot_Benes'
]

# Show the head of aggregated columns
print(aggregated_data[aggregated_columns].head())

# Step 5: Print the total time taken
print(f"Time taken to complete: {end_time - start_time} seconds")

           Prscrbr_Last_Org_Name Prscrbr_First_Name        Prscrbr_City  \
1003000126             Enkeshafi            Ardalan            Bethesda   
1003000142                Khalil             Rashid              Toledo   
1003000167               Escobar              Julio              Dayton   
1003000423               Velotta           Jennifer           Cleveland   
1003000480             Rothchild              Kevin              Aurora   
...                          ...                ...                 ...   
1003065178                Lowien             Nathan              Athens   
1003065475                Turner              Josef          Pittsburgh   
1003065509                Thomas             Nicole  Jacksonville Beach   
1003065582                Turner            Tiffany           Arlington   
1003065640                  Rowe            Anjalee            Portland   

           Prscrbr_State_Abrvtn Prscrbr_State_FIPS             Prscrbr_Type  \
1003000126          

### Get all Data



In [25]:
def get_paginated_data(base_url, size=1000000):
    offset = 0
    all_data = []
    
    while True:
        res = requests.get(f"{base_url}?size={size}&offset={offset}")
        data = res.json()
        
        if not data:
            break
        
        all_data.extend(data)
        offset += size
        print(f"Fetched {offset} records...")
    
    return pd.DataFrame(all_data)

def aggregate_by_npi(data):
    # Dictionary to store aggregated data by NPI using lists
    aggregated_data = defaultdict(lambda: [[] for _ in range(22)])  # Initialize 21 empty lists
    
    for _, row in data.iterrows():
        npi = row['Prscrbr_NPI']
        
        aggregated_data[npi][0] = npi  # Store NPI in the first list
        
        # Dynamically append to the lists for the other attributes
        aggregated_data[npi][1] = row.get('Prscrbr_Last_Org_Name', None)
        aggregated_data[npi][2] = row.get('Prscrbr_First_Name', None)
        aggregated_data[npi][3] = row.get('Prscrbr_City', None)
        aggregated_data[npi][4] = row.get('Prscrbr_State_Abrvtn', None)
        aggregated_data[npi][5] = row.get('Prscrbr_State_FIPS', None)
        aggregated_data[npi][6] = row.get('Prscrbr_Type', None)
        aggregated_data[npi][7] = row.get('Prscrbr_Type_Src', None)
        
        # Ensure each list grows properly for lists with multiple entries
        aggregated_data[npi][8].append(row.get('Brnd_Name', None))
        aggregated_data[npi][9].append(row.get('Gnrc_Name', None))
        aggregated_data[npi][10].append(row.get('Tot_Clms', None))
        aggregated_data[npi][11].append(row.get('Tot_30day_Fills', None))
        aggregated_data[npi][12].append(row.get('Tot_Day_Suply', None))
        aggregated_data[npi][13].append(row.get('Tot_Drug_Cst', None))
        aggregated_data[npi][14].append(row.get('Tot_Benes', None))
        aggregated_data[npi][15].append(row.get('GE65_Sprsn_Flag', None))
        aggregated_data[npi][16].append(row.get('GE65_Tot_Clms', None))
        aggregated_data[npi][17].append(row.get('GE65_Tot_30day_Fills', None))
        aggregated_data[npi][18].append(row.get('GE65_Tot_Drug_Cst', None))
        aggregated_data[npi][19].append(row.get('GE65_Tot_Day_Suply', None))
        aggregated_data[npi][20].append(row.get('GE65_Bene_Sprsn_Flag', None))
        aggregated_data[npi][21].append(row.get('GE65_Tot_Benes', None))
    
    # Convert the aggregated data into a DataFrame
    aggregated_df = pd.DataFrame.from_dict(aggregated_data, orient='index', 
                                           columns=[
                                               'Prscrbr_NPI', 'Prscrbr_Last_Org_Name', 'Prscrbr_First_Name', 
                                               'Prscrbr_City', 'Prscrbr_State_Abrvtn', 'Prscrbr_State_FIPS', 
                                               'Prscrbr_Type', 'Prscrbr_Type_Src', 'Brnd_Name', 'Gnrc_Name', 
                                               'Tot_Clms', 'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst', 
                                               'Tot_Benes', 'GE65_Sprsn_Flag', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 
                                               'GE65_Tot_Drug_Cst', 'GE65_Tot_Day_Suply', 'GE65_Bene_Sprsn_Flag', 
                                               'GE65_Tot_Benes'
                                           ])
    print(aggregated_df.head)
    
    return aggregated_df

In [26]:
# Base URL for the dataset (Medicare Part D Providers)
base_url = "https://data.cms.gov/data-api/v1/dataset/9552739e-3d05-4c1b-8eff-ecabf391e2e5/data"

# Measure the time taken
start_time = time.time()

# Step 1: Fetch all available rows
data = get_paginated_data(base_url)

# Step 2: Aggregate data by NPI
aggregated_data = aggregate_by_npi(data)

# Step 3: Measure the end time
end_time = time.time()

# Step 4: Display the aggregated columns (prescription-related info)
aggregated_columns = [
    'Brnd_Name', 'Gnrc_Name', 'Tot_Clms', 'Tot_30day_Fills',
    'Tot_Day_Suply', 'Tot_Drug_Cst', 'Tot_Benes', 'GE65_Sprsn_Flag',
    'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst',
    'GE65_Tot_Day_Suply', 'GE65_Bene_Sprsn_Flag', 'GE65_Tot_Benes'
]

# Show the head of aggregated columns
print(aggregated_data[aggregated_columns].head())

# Step 5: Print the total time taken
print(f"Time taken to complete: {end_time - start_time} seconds")

Fetched 1000000 records...
Fetched 2000000 records...
Fetched 3000000 records...
Fetched 4000000 records...
Fetched 5000000 records...
Fetched 6000000 records...
Fetched 7000000 records...
Fetched 8000000 records...
Fetched 9000000 records...
Fetched 10000000 records...
Fetched 11000000 records...
Fetched 12000000 records...
Fetched 13000000 records...
Fetched 14000000 records...
Fetched 15000000 records...
Fetched 16000000 records...
Fetched 17000000 records...
Fetched 18000000 records...
Fetched 19000000 records...
Fetched 20000000 records...
Fetched 21000000 records...
Fetched 22000000 records...
Fetched 23000000 records...
Fetched 24000000 records...
Fetched 25000000 records...
Fetched 26000000 records...
<bound method NDFrame.head of                    Npi Prscrbr_Last_Org_Name Prscrbr_First_Name Prscrbr_City  \
1003000126  1003000126             Enkeshafi            Ardalan     Bethesda   
1003000142  1003000142                Khalil             Rashid       Toledo   
1003000167 

In [41]:
print(len(data))

130000


In [24]:
output_file1 = "agg_cms_partD_byProvider.csv"
aggregated_data.to_csv(output_file1, index=False)

print(f"Data has been saved to {output_file1}")

Data has been saved to agg_cms_partD_byProvider1.csv
