In [None]:
import pandas as pd
import numpy as np

In [None]:
# Export the NDC column of SDUD2024.csv to a text file, one value per line,
# after dropping rows with State == 'XX' and rows with missing values in
# 'Units Reimbursed' or 'Number of Prescriptions'.
csv_file = r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2024.csv"
print("Reading CSV file with proper dtype to preserve leading zeros...")

# Read NDC as object (string) so pandas does not infer a numeric dtype and
# strip the leading zeros.
df = pd.read_csv(csv_file, dtype={'NDC': 'object'})

# Filter out rows where State column equals 'XX'
print(f"Total rows before filtering: {len(df)}")
df_filtered = df[df['State'] != 'XX']
print(f"Total rows after filtering out State='XX': {len(df_filtered)}")
print(f"Rows removed by State filter: {len(df) - len(df_filtered)}")

# Filter out rows with NA values in 'Units Reimbursed' or 'Number of Prescriptions'
# NOTE(review): before_na_filter is not used below; the matching
# "rows removed by NA filter" report was disabled. The variable is kept in
# case a later cell reads it.
before_na_filter = len(df_filtered)
df_filtered = df_filtered.dropna(subset=['Units Reimbursed', 'Number of Prescriptions'])
print(f"Total rows after removing NA values in Units/Prescriptions: {len(df_filtered)}")

# Extract NDC column from filtered data - it is already a string column.
# NOTE(review): rows with a missing NDC are not dropped by the filters above;
# such a value would be written out as the literal text 'nan'.
ndc_values = df_filtered['NDC']

# Save to text file
output_file = r"c:\Users\asus\OneDrive - purdue.edu\VS code\Data\SDUD\NDC_2024_filtered.txt"
print(f"Extracting {len(ndc_values)} NDC values with leading zeros preserved (excluding State='XX' and NA values)...")

# An explicit encoding keeps the output independent of the platform's default
# codec. writelines hands the whole stream to the buffered writer instead of
# issuing one write call per row.
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{ndc}\n" for ndc in ndc_values)

print(f"Successfully saved {len(ndc_values)} NDC values to NDC_2024_filtered.txt (State != 'XX', no NA values)")
print(f"Unique NDC values: {ndc_values.nunique()}")
print("First 10 NDC values (with leading zeros):")
print(ndc_values.head(10).tolist())