In [2]:
import pandas as pd
import numpy as np

In [4]:
# Read the CSV file with NDC column as string to preserve leading zeros
year = 2024
csv_file = fr"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD{year}.csv"

#Personal pc path: r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD{year}.csv"
#Office path: "C:\Users\lholguin\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD{year}.csv"

#personal pc: r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2024.csv"
print("Reading CSV file with proper dtype to preserve leading zeros...")

# Force NDC column to be read as string/object to preserve leading zeros
df = pd.read_csv(csv_file, dtype={'NDC': 'object'})

# Filter out rows with NA values in 'Units Reimbursed' or 'Number of Prescriptions' first
print(rf"Total rows in {year} before filtering: {len(df)}")
df_filtered = df.dropna(subset=['Units Reimbursed', 'Number of Prescriptions'])
print(f"Total rows after removing NA values in Units/Prescriptions: {len(df_filtered)}")
print(f"Rows removed by NA filter: {len(df) - len(df_filtered)}")

# Then filter out rows where State column equals 'XX'
before_state_filter = len(df_filtered)
df_filtered = df_filtered[df_filtered['State'] != 'XX']
print(f"Total rows after filtering out State='XX': {len(df_filtered)}")
print(f"Rows removed by State filter: {before_state_filter - len(df_filtered)}")
print("Unique NDCs", df_filtered['NDC'].nunique())


Reading CSV file with proper dtype to preserve leading zeros...
Total rows in 2024 before filtering: 5205065
Total rows after removing NA values in Units/Prescriptions: 2599748
Rows removed by NA filter: 2605317
Total rows after filtering out State='XX': 2362630
Rows removed by State filter: 237118
Unique NDCs 33397


In [None]:
#Generating the txt file with the NDC values
# Save to text file

# Extract NDC column from filtered data - it's already in string format now
ndc_values = df_filtered['NDC']
output_file = rf"C:\Users\lholguin\OneDrive - purdue.edu\VS code\Data\ATC\NDCf_{year}.txt"
print(f"Extracting {len(ndc_values)} NDC values with leading zeros preserved (excluding State='XX' and NA values)...")

with open(output_file, 'w') as f:
    for ndc in ndc_values:
        f.write(str(ndc) + '\n')

print(f"Successfully saved {len(ndc_values)} NDC values to NDCf_{year}.txt (State != 'XX', no NA values)")
print(f"Unique NDC values: {ndc_values.nunique()}")
print("First 10 NDC values (with leading zeros):")
print(ndc_values.head(10).tolist())

In [10]:
# read ATC4 mapping, ensure NDC is read as string and zero-padded to 11 digits
file_path=rf'C:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\NDCf_{year}_ATC4_classes.csv'
#Personal pc path r'C:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\NDCf_{year}_ATC4_classes.csv'
#office path: r"C:\Users\lholguin\OneDrive - purdue.edu\VS code\Data\ATC\NDCf_{year}_ATC4_classes.csv"
df_atc4 = pd.read_csv(file_path, dtype={'NDC': 'object'})
df_atc4['NDC'] = df_atc4['NDC'].str.zfill(11)
#How many unique atc4 classes are there in the mapping file?
print(f"Unique ATC4 Classes in ATC4 mapping file: {df_atc4['ATC4 Class'].nunique()}")

print(df_atc4.columns)
print( rf"Unique NDCs of ATC csv {year}:", df_atc4['NDC'].nunique())
print("Total rows", len(df_atc4))
top_classes = df_atc4['ATC4 Class'].value_counts()
print(top_classes.head(10))
print(f"Filtered dataset has {len(df_filtered)} rows")

# Merge filtered SDUD data with ATC4 mapping on NDC

merged_df=pd.merge(df_filtered, df_atc4, on='NDC', how='left')
print(f"Merged dataframe has {len(merged_df)} rows")
print(f"Unique NDCs in merged dataframe: {merged_df['NDC'].nunique()}")
atc_counts = df_atc4['NDC'].value_counts()
print(atc_counts.value_counts().head(10))


Unique ATC4 Classes in ATC4 mapping file: 612
Index(['NDC', 'ATC4 Class'], dtype='object')
Unique NDCs of ATC csv 2024: 32204
Total rows 61470
ATC4 Class
H02AB    894
N06BA    762
S01BA    757
C05AA    746
N06AX    746
B05XA    723
R01AD    718
N03AX    702
S01AA    698
C10AA    655
Name: count, dtype: int64
Filtered dataset has 2362630 rows
Merged dataframe has 4360194 rows
Unique NDCs in merged dataframe: 33397
count
1     21590
2      4098
3      3168
4       911
5       718
7       523
11      270
8       267
6       241
9       179
Name: count, dtype: int64


In [8]:
# Looking at the unique ndc codes that does not have an atc4 class
missing_atc = merged_df[merged_df['ATC4 Class'].isna()]
print(f"NDCs without ATC4 Class: {missing_atc['NDC'].nunique()}")
print(missing_atc['NDC'].head(10).tolist())
#Let's print those NDC codes to a text file
missing_ndc_file = rf"C:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\NDC_missing_{year}.txt"
with open(missing_ndc_file, 'w') as f:
    for ndc in missing_atc['NDC'].unique():
        f.write(str(ndc) + '\n')
print(f"Saved missing NDCs to {missing_ndc_file}")

#Looking at the unique atc4 classes in the merged dataframe
print(f"Unique ATC4 Classes in merged dataframe: {merged_df['ATC4 Class'].nunique()}")


NDCs without ATC4 Class: 1193
['00069034405', '00069246510', '00264785000', '00469650189', '00527143501', '00536408501', '00574030216', '00574030316', '00591292754', '13811052501']
Saved missing NDCs to C:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\NDC_missing_2024.txt
Unique ATC4 Classes in merged dataframe: 612


In [8]:
#Note that using the mapping atc5 for missing ndc is a little bit more detailed. Let's analyze it
miss_path=rf'C:\Users\lholguin\OneDrive - purdue.edu\VS code\Data\ATC\NDC_missing_{year}_ATC5_classes.csv'
# C:\Users\lholguin\OneDrive - purdue.edu\VS code\Data\ATC\NDC_missing_2024_ATC5_classes.csv
df_miss=pd.read_csv(miss_path)
print(df_miss.head())
Rxcui_no = df_miss[df_miss['ATC5 Class'] == 'No RxCUI Found'].shape[0]
ATClass_no = df_miss[df_miss['ATC5 Class'] == 'No ATC Mapping Found'].shape[0]
print(f"Unique RxCUI in missing NDC mapping: {Rxcui_no}")
print(f"Unique ATC5 Class in missing NDC mapping: {ATClass_no}")

           NDC            ATC5 Class
0  70756061270  No ATC Mapping Found
1    904020261        No RxCUI Found
2  52544063028        No RxCUI Found
3  67919001101        No RxCUI Found
4  13668001830        No RxCUI Found
Unique RxCUI in missing NDC mapping: 1674
Unique ATC5 Class in missing NDC mapping: 266
