In [3]:
import pandas as pd
import numpy as np
import time
import json
from typing import Iterable, Dict, Any, List, Optional
import requests
import networkx as nx


In [None]:
import requests

def _props_dict(item):
    props = {}
    plist = (item.get("propertyConceptList") or {}).get("propertyConcept") or []
    for p in plist:
        k = (p.get("propName") or "").strip()
        v = (p.get("propValue") or "").strip()
        if k:
            props[k.upper()] = v
    return props

def _first_match(props, *keys_or_contains):
    for key in keys_or_contains:
        if key in props:
            return props[key]
    for key in props:
        for needle in keys_or_contains:
            if needle in key:
                return props[key]
    return None

def _rxnorm_names_from_rxcui(rxcui):
    """Fetch a ``(brand, generic)`` name pair for an RxCUI from the RxNav REST API.

    Calls ``/rxcui/{rxcui}/allProperties.json`` with ``prop=names`` and scans
    the returned ``propConcept`` entries:

    * ``brand``   -- first non-empty ``propValue`` whose ``propName`` is one of
      ``BN``, ``SBD``, ``SBDF``, ``SBDG`` (case-insensitive);
    * ``generic`` -- first non-empty ``propValue`` whose ``propName`` is one of
      ``IN``, ``SCD``, ``SCDF``, ``SCDC`` (case-insensitive).

    If no generic name is found, a second request to
    ``/rxcui/{rxcui}/property.json`` with ``propName="RxNorm Name"`` is made
    and, when it succeeds, the first returned ``propValue`` is used.

    NOTE(review): the scan assumes ``propName`` carries term-type style codes
    for the ``names`` category -- confirm against a live RxNav response.

    Args:
        rxcui: RxNorm concept identifier, interpolated into the URL.

    Returns:
        Tuple ``(brand, generic)``; either element may be ``None``.

    Raises:
        requests.HTTPError: if the first (allProperties) request returns an
            error status.  The second request is only checked with ``.ok``.
    """
    base = "https://rxnav.nlm.nih.gov/REST"
    url = f"{base}/rxcui/{rxcui}/allProperties.json"
    resp = requests.get(url, params={"prop": "names"}, timeout=10)
    resp.raise_for_status()
    data = resp.json() or {}
    props = (data.get("propConceptGroup") or {}).get("propConcept") or []
    names = [(p.get("propName"), p.get("propValue")) for p in props]

    def _first_value(codes):
        # First non-empty value whose (upper-cased) code is in `codes`.
        return next(
            (val for code, val in names if (code or "").upper() in codes and val),
            None,
        )

    brand = _first_value(("BN", "SBD", "SBDF", "SBDG"))
    generic = _first_value(("IN", "SCD", "SCDF", "SCDC"))

    if generic is None:
        url2 = f"{base}/rxcui/{rxcui}/property.json"
        r2 = requests.get(url2, params={"propName": "RxNorm Name"}, timeout=10)
        if r2.ok:
            dd = r2.json() or {}
            # `propConcept` may be absent, None, or an empty list; indexing
            # [0] unconditionally raised IndexError/TypeError in those cases.
            concepts = (dd.get("propConceptGroup") or {}).get("propConcept") or []
            if concepts:
                generic = (concepts[0] or {}).get("propValue")

    return brand, generic

def get_names_for_ndc(ndc_code, ndcstatus="ALL"):
    """
    Look up naming information for an NDC code via RxNav ``ndcproperties.json``.

    Returns a dict with:
      - ndc11         (the item's ``ndcItem`` field)
      - brand_name
      - product_type
      - generic_name
      - labeler
      - rxcui
    Uses RxCUI fallbacks to fill brand/generic when missing in NDC properties.
    Every value is ``None`` when the service returns no NDC property items.

    Args:
        ndc_code: NDC identifier sent as the ``id`` query parameter.
        ndcstatus: Value sent as the ``ndcstatus`` query parameter.

    Raises:
        requests.HTTPError: if the ndcproperties request returns an error status
            (or if the RxCUI fallback's first request does).
    """
    ndc_url = "https://rxnav.nlm.nih.gov/REST/ndcproperties.json"
    r = requests.get(ndc_url, params={"id": ndc_code, "ndcstatus": ndcstatus}, timeout=10)
    r.raise_for_status()
    data = r.json() or {}
    items = (data.get("ndcPropertyList") or {}).get("ndcProperty") or []
    if not items:
        return {
            "ndc11": None,
            "brand_name": None,
            "product_type": None,
            "generic_name": None,
            "labeler": None,
            "rxcui": None,
        }

    # pick richest item (prefer one with explicit proprietary/nonproprietary if present)
    best = None
    for it in items:
        candidate_props = _props_dict(it)
        if "PROPRIETARYNAME" in candidate_props or "NONPROPRIETARYNAME" in candidate_props:
            best = it
            break
    if best is None:
        best = items[0]

    props = _props_dict(best)
    rxcui = best.get("rxcui")
    ndc11 = best.get("ndcItem")  # RxNav's NDC11 field

    # direct reads
    # _first_match falls back to substring matching, and "PROPRIETARY" is a
    # substring of "NONPROPRIETARY...".  Without this filter, an item that only
    # carries the non-proprietary name would report the generic name as the
    # brand (and suppress the RxCUI brand fallback below).
    brand_props = {
        k: v
        for k, v in props.items()
        if "NONPROPRIETARY" not in k.replace(" ", "").replace("-", "").replace("_", "")
    }
    brand = _first_match(brand_props, "PROPRIETARYNAME", "PROPRIETARY NAME", "PROPRIETARY")
    generic = _first_match(props, "NONPROPRIETARYNAME", "NONPROPRIETARY NAME", "NONPROPRIETARY")
    product_type = _first_match(props, "PRODUCTTYPENAME", "PRODUCT TYPE")
    labeler = _first_match(props, "LABELER", "LABELERNAME", "LABELER NAME")

    # fallbacks via RxCUI (only queried when something is missing)
    if (not brand or not generic) and rxcui:
        rx_brand, rx_generic = _rxnorm_names_from_rxcui(rxcui)
        brand = brand or rx_brand
        generic = generic or rx_generic

    return {
        "ndc11": ndc11,
        "brand_name": brand,
        "product_type": product_type,   # e.g., HUMAN PRESCRIPTION DRUG
        "generic_name": generic,
        "labeler": labeler,
        "rxcui": rxcui,
    }

# ---- Example usage
# Everything that reads `summary` lives under the same guard as the assignment;
# previously only the assignment was guarded, so the prints raised NameError
# whenever this code was imported rather than run as __main__.
if __name__ == "__main__":
    summary = get_names_for_ndc("00003-0894", ndcstatus="ALL")

    print("NDC11:", summary["ndc11"])
    print("Brand name:", summary["brand_name"])
    print("Product type:", summary["product_type"])
    print("Generic name:", summary["generic_name"])
    print("Labeler:", summary["labeler"])
    print("RxCUI:", summary["rxcui"])


In [8]:
# Reading and exploring drug utilization data
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.  The saved output of this cell shows a warning
# echoing the read_csv line -- presumably the mixed-type DtypeWarning, which
# this option addresses (TODO confirm; an explicit dtype= would be stricter).
df = pd.read_csv('SDUD2017.csv', low_memory=False)
# Keep the 'XX' rows aside before they are dropped from df below.
df_xx = df[df['State'] == 'XX']
print(len(df))

#PC_idx=np.arange(len(df["NDC"].unique()))
#print(len(PC_idx))
# Too many unique NDC codes to handle all of them in a matrix like FAF dataset. 

# Remove invalid state code 'XX'
df = df[df['State'] != 'XX']
print(len(df))

# Grouping by 'State' and summing 'Units Reimbursed' and 'Number of Prescriptions'
state_summary = df.groupby('State')[['Units Reimbursed', 'Number of Prescriptions']].sum()
print(state_summary)

# Grand totals over the per-state sums (the 'XX' rows are already excluded).
total_units = state_summary['Units Reimbursed'].sum()
total_prescriptions = state_summary['Number of Prescriptions'].sum()
print("\nTotal Units Reimbursed across all states:", total_units)
print("Total Number of Prescriptions across all states:", total_prescriptions)

# Sorting by 'Units Reimbursed' in descending order and displaying the top 10 states
top_states_units = state_summary.sort_values(by='Units Reimbursed', ascending=False).head(10)
print("\nTop 10 States by Units Reimbursed:")
print(top_states_units)

# Sorting by 'Number of Prescriptions' in descending order and displaying the top 10 states
top_states_prescriptions = state_summary.sort_values(by='Number of Prescriptions', ascending=False).head(10)
print("\nTop 10 States by Number of Prescriptions:")
print(top_states_prescriptions)

# print(df_xx)
print(len(df_xx))

# Same two sums, computed over the 'XX' rows only.
wp = df_xx.groupby('State')[['Units Reimbursed', 'Number of Prescriptions']].sum()
print(wp)

  df = pd.read_csv('SDUD2017.csv')


4789471
4516042
       Units Reimbursed  Number of Prescriptions
State                                           
AK         9.388931e+07                1432713.0
AL         4.705494e+08                7510339.0
AR         3.042890e+08                5234571.0
AZ         9.279193e+08               17094220.0
CA         6.463579e+09               99026365.0
CO         5.270665e+08                8326556.0
CT         5.392237e+08                9329782.0
DC         1.177594e+08                2281402.0
DE         1.267123e+08                2420992.0
FL         1.980574e+09               30149124.0
GA         1.035827e+09               17282659.0
HI         3.101769e+09                2967089.0
IA         3.243602e+08                5892948.0
ID         1.350231e+08                2284366.0
IL         1.627276e+09               29255136.0
IN         1.022827e+09               16760723.0
KS         2.902815e+08                4520095.0
KY         1.279481e+09               23531086.0
LA  

In [18]:

#Trying a function to encapsulate the above logic for reusability

def preliminar(file_path):
    """
    Reads a CSV file, processes the drug utilization data, and prints summaries with the year.

    The year shown in every printed line is taken from the first standalone
    four-digit run in the file *name* (directory components are ignored), so
    both ``'SDUD2017.csv'`` and ``'data/SDUD2017.csv'`` report ``2017``.  If the
    name contains no such run, the original ``file_path[4:8]`` slice is used.

    The file must contain the columns ``State``, ``Units Reimbursed`` and
    ``Number of Prescriptions``.  Rows with state code ``'XX'`` are summarised
    separately and excluded from the per-state figures.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        None. All results are printed.
    """
    import os
    import re

    # Extracting the year from the file name.  A fixed [4:8] slice of the whole
    # path only works for a bare 'SDUDyyyy.csv' and returns garbage as soon as
    # the path has a directory prefix.
    year_match = re.search(r"(?<!\d)\d{4}(?!\d)", os.path.basename(file_path))
    year = year_match.group(0) if year_match else file_path[4:8]

    # Reading the CSV file.  low_memory=False infers dtypes from the whole file
    # rather than per chunk, which is what pandas suggests for mixed-type
    # DtypeWarnings on large files.
    df = pd.read_csv(file_path, low_memory=False)
    print(f"Year {year}: Total rows in the dataset: {len(df)}")

    # Filtering rows with invalid state code 'XX'
    df_xx = df[df['State'] == 'XX']
    print(f"Year {year}: Rows with invalid state code 'XX': {len(df_xx)}")

    # Removing rows with invalid state code 'XX'
    df = df[df['State'] != 'XX']
    print(f"Year {year}: Rows after removing invalid state code 'XX': {len(df)}")

    # Grouping by 'State' and summing 'Units Reimbursed' and 'Number of Prescriptions'
    state_summary = df.groupby('State')[['Units Reimbursed', 'Number of Prescriptions']].sum()
    print(f"\nYear {year}: State Summary:")
    print(state_summary)

    # Calculating total units reimbursed and total prescriptions
    total_units = state_summary['Units Reimbursed'].sum()
    total_prescriptions = state_summary['Number of Prescriptions'].sum()
    print(f"\nYear {year}: Total Units Reimbursed across all states: {total_units}")
    print(f"Year {year}: Total Number of Prescriptions across all states: {total_prescriptions}")

    # Sorting by 'Units Reimbursed' in descending order and displaying the top 10 states
    top_states_units = state_summary.sort_values(by='Units Reimbursed', ascending=False).head(10)
    print(f"\nYear {year}: Top 10 States by Units Reimbursed:")
    print(top_states_units)

    # Sorting by 'Number of Prescriptions' in descending order and displaying the top 10 states
    top_states_prescriptions = state_summary.sort_values(by='Number of Prescriptions', ascending=False).head(10)
    print(f"\nYear {year}: Top 10 States by Number of Prescriptions:")
    print(top_states_prescriptions)

    # Summarizing rows with invalid state code 'XX'
    wp = df_xx.groupby('State')[['Units Reimbursed', 'Number of Prescriptions']].sum()
    print(f"\nYear {year}: Summary for rows with invalid state code 'XX':")
    print(wp)

    # Comparison between total_units and wp['Units Reimbursed']
    if not wp.empty:
        # df_xx only holds State == 'XX', so the grouped frame has a single row.
        xx_units = wp['Units Reimbursed'].iloc[0]
        xx_prescriptions = wp['Number of Prescriptions'].iloc[0]

        # Relative difference of the per-state total against the 'XX' figure.
        units_difference_percentage = ((total_units - xx_units) / xx_units) * 100
        prescriptions_difference_percentage = ((total_prescriptions - xx_prescriptions) / xx_prescriptions) * 100

        print(f"\nYear {year}: Comparison with 'XX':")
        print(f"Percentage difference in Units Reimbursed: {units_difference_percentage:.2f}%")
        print(f"Percentage difference in Number of Prescriptions: {prescriptions_difference_percentage:.2f}%")
    else:
        print(f"\nYear {year}: No data available for state code 'XX'.")

    # Correlation Analysis (across the per-state sums)
    correlation = state_summary['Units Reimbursed'].corr(state_summary['Number of Prescriptions'])
    print(f"\nYear {year}: Correlation between 'Units Reimbursed' and 'Number of Prescriptions': {correlation:.2f}")

# Example usage: run the same summary for each yearly file, 2017 through 2024.
for sdud_year in range(2017, 2025):
    preliminar(f'SDUD{sdud_year}.csv')


  df = pd.read_csv(file_path)


Year 2017: Total rows in the dataset: 4789471
Year 2017: Rows with invalid state code 'XX': 273429
Year 2017: Rows after removing invalid state code 'XX': 4516042

Year 2017: State Summary:
       Units Reimbursed  Number of Prescriptions
State                                           
AK         9.388931e+07                1432713.0
AL         4.705494e+08                7510339.0
AR         3.042890e+08                5234571.0
AZ         9.279193e+08               17094220.0
CA         6.463579e+09               99026365.0
CO         5.270665e+08                8326556.0
CT         5.392237e+08                9329782.0
DC         1.177594e+08                2281402.0
DE         1.267123e+08                2420992.0
FL         1.980574e+09               30149124.0
GA         1.035827e+09               17282659.0
HI         3.101769e+09                2967089.0
IA         3.243602e+08                5892948.0
ID         1.350231e+08                2284366.0
IL         1.627276e+09   

In [None]:
#Continue

