In [6]:
import pandas as pd
import numpy as np

In [28]:
# Read the CSV file with NDC column as string to preserve leading zeros
csv_file = r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2024.csv"
print("Reading CSV file with proper dtype to preserve leading zeros...")

# Force NDC column to be read as string/object to preserve leading zeros
df = pd.read_csv(csv_file, dtype={'NDC': 'object'})

# Filter out rows where State column equals 'XX'
print(f"Total rows before filtering: {len(df)}")
df_filtered = df[df['State'] != 'XX']
print(f"Total rows after filtering out State='XX': {len(df_filtered)}")
print(f"Rows removed by State filter: {len(df) - len(df_filtered)}")

# Filter out rows with NA values in 'Units Reimbursed' or 'Number of Prescriptions'
before_na_filter = len(df_filtered)
df_filtered = df_filtered.dropna(subset=['Units Reimbursed', 'Number of Prescriptions'])
print(f"Total rows after removing NA values in Units/Prescriptions: {len(df_filtered)}")
#print(f"Rows removed by NA filter: {before_na_filter - len(df_filtered)}")

# Extract NDC column from filtered data - it's already in string format now
ndc_values = df_filtered['NDC']

# Save to text file
output_file = r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\NDCf_2024.txt"
print(f"Extracting {len(ndc_values)} NDC values with leading zeros preserved (excluding State='XX' and NA values)...")

with open(output_file, 'w') as f:
    for ndc in ndc_values:
        f.write(str(ndc) + '\n')

print(f"Successfully saved {len(ndc_values)} NDC values to NDC_2024_filtered.txt (State != 'XX', no NA values)")
print(f"Unique NDC values: {ndc_values.nunique()}")
print("First 10 NDC values (with leading zeros):")
print(ndc_values.head(10).tolist())

Reading CSV file with proper dtype to preserve leading zeros...
Total rows before filtering: 5205065
Total rows after filtering out State='XX': 4888046
Rows removed by State filter: 317019
Total rows after removing NA values in Units/Prescriptions: 2362630
Extracting 2362630 NDC values with leading zeros preserved (excluding State='XX' and NA values)...
Successfully saved 2362630 NDC values to NDC_2024_filtered.txt (State != 'XX', no NA values)
Unique NDC values: 33397
First 10 NDC values (with leading zeros):
['00002143380', '00002143480', '00002143611', '00002144511', '00002145780', '00002146080', '00002147180', '00002148480', '00002149580', '00002150680']


In [31]:
# read ATC4 mapping, ensure NDC is read as string and zero-padded to 11 digits
file_path=r'C:\Users\asus\OneDrive - purdue.edu\VS code\Data\ATC\2024_ATC4_classes.csv'
df_atc4 = pd.read_csv(file_path, dtype={'NDC': 'object'})
df_atc4['NDC'] = df_atc4['NDC'].str.zfill(11)

print(df_atc4.columns)
print("Unique NDCs", df_atc4['NDC'].nunique())
print("Total rows", len(df_atc4))
top_classes = df_atc4['ATC4 Class'].value_counts()
print(top_classes.head(10))
print(f"Filtered dataset has {len(df_filtered)} rows")

# Merge filtered SDUD data with ATC4 mapping on NDC

merged_df=pd.merge(df_filtered, df_atc4, on='NDC', how='left')
print(f"Merged dataframe has {len(merged_df)} rows")
print(f"Unique NDCs in merged dataframe: {merged_df['NDC'].nunique()}")

atc_counts = df_atc4['NDC'].value_counts()
print(atc_counts.value_counts().head(10))


Index(['NDC', 'ATC4 Class'], dtype='object')
Unique NDCs 32204
Total rows 61470
ATC4 Class
H02AB    894
N06BA    762
S01BA    757
C05AA    746
N06AX    746
B05XA    723
R01AD    718
N03AX    702
S01AA    698
C10AA    655
Name: count, dtype: int64
Filtered dataset has 2362630 rows
Merged dataframe has 4360194 rows
Unique NDCs in merged dataframe: 33397
count
1     21590
2      4098
3      3168
4       911
5       718
7       523
11      270
8       267
6       241
9       179
Name: count, dtype: int64
