In [None]:
import pandas as pd
import numpy as np
import requests
import shelve
import os
from datetime import datetime
import matplotlib.pyplot as plt
import re

In [None]:
# Windows account name interpolated into the default data paths below.
# On the personal PC (pc1) the account name is "asus" instead.
user = "Lilian"

In [None]:
class NDCATCAnalyzer:

    def __init__(self, year, base_path=None):
        self.year = year
        self.base_path = base_path or rf"c:\Users\{user}\OneDrive - purdue.edu\VS code\Data"
        self.df_cleaned = None
        self.df_merged = None
        self.atc_mapping = None
        
    def clean_sdud_data(self):

        csv_file = os.path.join(self.base_path, f"SDUD\\SDUD{self.year}.csv")
        print(f"Reading CSV: {csv_file}")
        
        df = pd.read_csv(csv_file, dtype={'NDC': 'object'})
        print(f"Initial rows: {len(df):,}")
        
        # Filter data
        df_filtered = df.dropna(subset=['Units Reimbursed', 'Number of Prescriptions'])
        df_filtered = df_filtered[df_filtered['State'] != 'XX']
        
        print(f"After cleaning: {len(df_filtered):,} rows, {df_filtered['NDC'].nunique():,} unique NDCs")
        
        self.df_cleaned = df_filtered
        return self.df_cleaned
    
    def adding_key(self):
        if self.df_cleaned is None:
            raise ValueError("Run clean_sdud_data() first")
        
        self.df_cleaned['record_id'] = (
            self.df_cleaned['State'].astype(str) + "_" +
            self.df_cleaned['Year'].astype(str) + "_" +
            self.df_cleaned['Quarter'].astype(str) + "_" +
            self.df_cleaned['Utilization Type'].astype(str) + "_" +
            self.df_cleaned['NDC'].astype(str)
        )
        
        print(f"Created {len(self.df_cleaned):,} record IDs")
        return self.df_cleaned
    
    def generate_ndc_txt(self, output_filename=None):
        if 'record_id' not in self.df_cleaned.columns:
            raise ValueError("Run adding_key() first")
            
        output_filename = output_filename or f"NDCNEW_{self.year}.txt"
        output_path = os.path.join(self.base_path, f"ATC\\text_files\\{output_filename}")
        
        unique_pairs = self.df_cleaned[['NDC', 'record_id']].drop_duplicates()
        
        with open(output_path, 'w') as f:
            f.write("NDC\trecord_id\n")
            for _, row in unique_pairs.iterrows():
                f.write(f"{row['NDC']}\t{row['record_id']}\n")
        
        print(f"Exported {unique_pairs['record_id'].nunique():,} unique records to {output_path}")
        return output_path
    
    def analyze_atc4_mapping(self):

        if 'record_id' not in self.df_cleaned.columns:
            raise ValueError("Run adding_key() first")
            
        atc4_path = os.path.join(self.base_path, f"ATC\\ATC4_classes\\NDCNEW_{self.year}_ATC4_classes.csv")
        
        # Load ATC4 mapping
        df_atc4 = pd.read_csv(atc4_path, dtype={'NDC': 'object', 'record_id': 'string'})
        df_atc4['NDC'] = df_atc4['NDC'].str.zfill(11)
        
        print(f"ATC4 file: {len(df_atc4):,} rows, {df_atc4['NDC'].nunique():,} unique NDCs")
        
        # Ensure consistent types
        self.df_cleaned['record_id'] = self.df_cleaned['record_id'].astype('string')
        self.df_cleaned['NDC'] = self.df_cleaned['NDC'].astype('object')
        
        # Merge on both record_id and NDC
        self.atc_mapping = pd.merge(
            self.df_cleaned,
            df_atc4[['record_id', 'NDC', 'ATC4 Class']],
            on=['record_id', 'NDC'],
            how='left'
        )
        
        total = len(self.atc_mapping)
        mapped = self.atc_mapping['ATC4 Class'].notna().sum()
        print(f"Merged: {total:,} records, {mapped:,} with ATC4 ({mapped/total*100:.1f}%)")
        
        missing = total - mapped
        if missing > 0:
            print(f"Missing: {missing:,} records, {self.atc_mapping[self.atc_mapping['ATC4 Class'].isna()]['NDC'].nunique():,} unique NDCs")
        
        return self.atc_mapping
    
    def analyze_atc_distribution(self, level='ATC3'):
        """Analyze distribution of ATC classes per record_id for any level (ATC2, ATC3, ATC4)."""
        if self.atc_mapping is None:
            raise ValueError("Run analyze_atc4_mapping() first")
        
        records = self.atc_mapping[self.atc_mapping['ATC4 Class'].notna()].copy()
        
        if len(records) == 0:
            print("No records with valid ATC4 mappings.")
            return None
        
        # Create ATC level column if needed
        if level == 'ATC3':
            records['ATC3 Class'] = records['ATC4 Class'].str[:4]
            class_col = 'ATC3 Class'
        elif level == 'ATC2':
            records['ATC2 Class'] = records['ATC4 Class'].str[:3]
            class_col = 'ATC2 Class'
        else:
            class_col = 'ATC4 Class'
        
        # Count classes per record_id
        per_record = records.groupby('record_id')[class_col].nunique().reset_index()
        per_record.columns = ['record_id', 'num_classes']
        
        distribution = per_record['num_classes'].value_counts().sort_index()
        
        print(f"\n{level} CLASSES PER RECORD_ID:")
        for n_classes, count in distribution.items():
            pct = (count / len(per_record)) * 100
            print(f"  {n_classes} class(es): {count:,} records ({pct:.1f}%)")
        
        print(f"\nSummary:")
        print(f"  Avg {level} per record: {per_record['num_classes'].mean():.2f}")
        print(f"  Max {level} per record: {per_record['num_classes'].max()}")
        
        return per_record

    def fetch_atc_names(self, cache_path=None):
        """Fetch ATC class names (ATC4, ATC3, ATC2) from RxNav API."""
        if self.atc_mapping is None:
            raise ValueError("Must run analyze_atc4_mapping() first")
        
        if cache_path is None:
            cache_path = os.path.join(self.base_path, "ATC\\cache_files\\atc_names_cache")
        
        print(f"\n{'='*60}")
        print("FETCHING ATC CLASS NAMES")
        print(f"{'='*60}")
        print(f"Using cache: {cache_path}")
        
        # Get only records with valid ATC4 mappings
        df_with_atc = self.atc_mapping[self.atc_mapping['ATC4 Class'].notna()].copy()
        
        # Create ATC3 and ATC2 columns from ATC4
        print("\nCreating ATC3 and ATC2 columns from ATC4...")
        df_with_atc['ATC3 Class'] = df_with_atc['ATC4 Class'].str[:4]
        df_with_atc['ATC2 Class'] = df_with_atc['ATC4 Class'].str[:3]
        
        # Get unique codes for each level
        unique_atc4 = df_with_atc['ATC4 Class'].dropna().unique()
        unique_atc3 = df_with_atc['ATC3 Class'].dropna().unique()
        unique_atc2 = df_with_atc['ATC2 Class'].dropna().unique()
        
        # Filter out invalid codes
        unique_atc4 = [c for c in unique_atc4 if c not in ['No ATC Mapping Found', 'No RxCUI Found', '']]
        unique_atc3 = [c for c in unique_atc3 if c not in ['No ATC Mapping Found', 'No RxCUI Found', '', 'No ', 'No']]
        unique_atc2 = [c for c in unique_atc2 if c not in ['No ATC Mapping Found', 'No RxCUI Found', '', 'No ', 'No']]
        
        print(f"\nUnique codes to fetch:")
        print(f"  ATC4: {len(unique_atc4)}")
        print(f"  ATC3: {len(unique_atc3)}")
        print(f"  ATC2: {len(unique_atc2)}")
        
        # Build mappings
        atc4_names = {}
        atc3_names = {}
        atc2_names = {}
        
        with shelve.open(cache_path) as cache:
            start_time = datetime.now()
            
            print("\nFetching ATC4 names...")
            for code in unique_atc4:
                atc4_names[code] = self._get_atc_name(code, cache)
            
            print("Fetching ATC3 names...")
            for code in unique_atc3:
                atc3_names[code] = self._get_atc_name(code, cache)
            
            print("Fetching ATC2 names...")
            for code in unique_atc2:
                atc2_names[code] = self._get_atc_name(code, cache)
            
            print(f"\nTotal processing time: {(datetime.now() - start_time).total_seconds()/60:.1f} minutes")
        
        # Apply names to all records in atc_mapping
        print("\nApplying names to dataframe...")
        self.atc_mapping['ATC3 Class'] = self.atc_mapping['ATC4 Class'].str[:4]
        self.atc_mapping['ATC2 Class'] = self.atc_mapping['ATC4 Class'].str[:3]
        
        self.atc_mapping['ATC4_Name'] = self.atc_mapping['ATC4 Class'].map(atc4_names).fillna('')
        self.atc_mapping['ATC3_Name'] = self.atc_mapping['ATC3 Class'].map(atc3_names).fillna('')
        self.atc_mapping['ATC2_Name'] = self.atc_mapping['ATC2 Class'].map(atc2_names).fillna('')
        
        print(f"\nATC names added successfully!")
        print("\nSample output:")
        sample = self.atc_mapping[self.atc_mapping['ATC4 Class'].notna()][['NDC', 'record_id', 'ATC4 Class', 'ATC4_Name', 'ATC3 Class', 'ATC3_Name', 'ATC2 Class', 'ATC2_Name']].head(5)
        print(sample.to_string())
        
        return self.atc_mapping
    
    def prepare_final_dataframe(self):
        """Scale units to billions/millions and create final dataframe."""
        if self.atc_mapping is None:
            raise ValueError("Run fetch_atc_names() first")
        
        self.df_merged = self.atc_mapping.copy()
        
        # Scale units
        self.df_merged['Units Reimbursed'] = self.df_merged['Units Reimbursed'] / 1e9
        self.df_merged['Number of Prescriptions'] = self.df_merged['Number of Prescriptions'] / 1e6
        
        total = len(self.df_merged)
        mapped = self.df_merged['ATC4 Class'].notna().sum()
        
        print(f"\nFinal Statistics:")
        print(f"  Records: {total:,} ({mapped:,} with ATC4, {mapped/total*100:.1f}%)")
        print(f"  Units Reimbursed: {self.df_merged['Units Reimbursed'].sum():.2f} Billion")
        print(f"  Prescriptions: {self.df_merged['Number of Prescriptions'].sum():.2f} Million")
        
        return self.df_merged
    
    def _get_atc_name(self, atc_code, cache):
        """Helper: Fetch ATC name from RxNav API with caching."""
        cache_key = f"atc_name:{atc_code}"
        if cache_key in cache:
            return cache[cache_key]
        
        try:
            url = f"https://rxnav.nlm.nih.gov/REST/rxclass/class/byId.json?classId={atc_code}"
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            
            if 'rxclassMinConceptList' in data and 'rxclassMinConcept' in data['rxclassMinConceptList']:
                concepts = data['rxclassMinConceptList']['rxclassMinConcept']
                if concepts:
                    name = concepts[0].get('className', '')
                    cache[cache_key] = name
                    return name
            
            cache[cache_key] = ''
            return ''
            
        except Exception as e:
            print(f"Error retrieving {atc_code}: {e}")
            cache[cache_key] = ''
            return ''

    def export_merged_data(self, output_filename=None, show_details=True):
        """Export deduplicated final dataset to CSV."""
        if self.df_merged is None:
            raise ValueError("Run prepare_final_dataframe() first")
            
        output_filename = output_filename or f"merged_NEWdata_{self.year}.csv"
        output_path = os.path.join(self.base_path, f"ATC\\merged_data\\{output_filename}")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        # Check duplicates
        initial_count = len(self.df_merged)
        duplicate_count = self.df_merged['record_id'].duplicated().sum()
        
        print(f"\nDeduplication Check:")
        print(f"  Before: {initial_count:,} rows")
        print(f"  Duplicates: {duplicate_count:,}")
        
        # Show sample duplicates if requested
        if show_details and duplicate_count > 0:
            dup_records = self.df_merged[self.df_merged['record_id'].duplicated(keep=False)].sort_values('record_id')
            sample_ids = dup_records['record_id'].unique()[:2]
            
            print(f"\nSample duplicate record_ids:")
            for rid in sample_ids:
                sample = self.df_merged[self.df_merged['record_id'] == rid][
                    ['record_id', 'NDC', 'State', 'ATC4 Class', 'ATC2 Class']
                ]
                print(f"\n{rid}:")
                print(sample.to_string(index=False))
        
        # Deduplicate and export
        df_final = self.df_merged.drop_duplicates(subset='record_id', keep='first')
        df_final.to_csv(output_path, index=False)
        
        print(f"\n  After: {len(df_final):,} rows")
        print(f"  Removed: {initial_count - len(df_final):,}")
        print(f"\nExported to: {output_path}")
        
        # Aggregate metrics
        agg = df_final.groupby('record_id').agg({
            'Units Reimbursed': 'sum',
            'Number of Prescriptions': 'sum'
        })
        
        print(f"\nAggregated Totals:")
        print(f"  Units Reimbursed: {agg['Units Reimbursed'].sum():.3f} Billion")
        print(f"  Number of Prescriptions: {agg['Number of Prescriptions'].sum():3f} Million")
        
        return output_path
    
    @staticmethod
    def create_multi_year_distribution_analysis(years_list):
        """Analyze ATC distribution percentages across multiple years."""
        print("Creating Multi-Year ATC Distribution Analysis...")
        print("="*70)
        
        results = {
            'ATC4_1_class': {}, 'ATC4_2_classes': {}, 'ATC4_3+_classes': {},
            'ATC3_1_class': {}, 'ATC3_2_classes': {}, 'ATC3_3+_classes': {},
            'ATC2_1_class': {}, 'ATC2_2_classes': {}, 'ATC2_3+_classes': {}
        }
        
        for year in years_list:
            print(f"Processing {year}...", end=" ")
            try:
                analyzer = NDCATCAnalyzer(year=year)
                analyzer.clean_sdud_data()
                analyzer.adding_key()
                analyzer.analyze_atc4_mapping()
                
                records = analyzer.atc_mapping[analyzer.atc_mapping['ATC4 Class'].notna()].copy()
                records['ATC2 Class'] = records['ATC4 Class'].str[:3]
                records['ATC3 Class'] = records['ATC4 Class'].str[:4]
                
                # Calculate distributions for each level
                for level, col in [('ATC4', 'ATC4 Class'), ('ATC3', 'ATC3 Class'), ('ATC2', 'ATC2 Class')]:
                    per_record = records.groupby('record_id')[col].nunique()
                    dist = per_record.value_counts().sort_index()
                    total = len(per_record)
                    
                    results[f'{level}_1_class'][year] = f"{(dist.get(1, 0) / total * 100):.1f}%"
                    results[f'{level}_2_classes'][year] = f"{(dist.get(2, 0) / total * 100):.1f}%"
                    results[f'{level}_3+_classes'][year] = f"{(dist[dist.index >= 3].sum() / total * 100):.1f}%"
                
                print("✓")
            except Exception as e:
                print(f"✗ Error: {e}")
                for key in results.keys():
                    results[key][year] = "N/A"
        
        df_percentages = pd.DataFrame(results).T
        print(f"\nATC DISTRIBUTION PERCENTAGES ACROSS YEARS")
        print("="*60)
        print(df_percentages)
        return df_percentages
    
    @staticmethod
    def analyze_general_atc_overview(years_list):

        print("Creating ATC2 & ATC3 Overview: Unique NDCs per Class Across Years...")
        print("="*78)
        
        atc2_year_results = {}
        atc3_year_results = {}
        
        for year in years_list:
            print(f"Processing {year}...", end=" ")
            try:
                analyzer = NDCATCAnalyzer(year=year)
                analyzer.clean_sdud_data()
                analyzer.adding_key()
                analyzer.analyze_atc4_mapping()
                
                records = analyzer.atc_mapping[analyzer.atc_mapping['ATC4 Class'].notna()].copy()
                if records.empty:
                    print("No records with ATC mapping")
                    atc2_year_results[year] = pd.DataFrame()
                    atc3_year_results[year] = pd.DataFrame()
                    continue
                
                records['ATC2 Class'] = records['ATC4 Class'].str[:3]
                records['ATC3 Class'] = records['ATC4 Class'].str[:4]
                
                # ATC2 summary
                pairs2 = records[['record_id', 'NDC', 'ATC2 Class']].drop_duplicates()
                atc2_summary = pairs2.groupby('ATC2 Class').agg(
                    Unique_NDCs=('NDC', 'nunique'),
                    Total_Records=('record_id', 'nunique')
                ).sort_values('Unique_NDCs', ascending=False)
                atc2_summary['Percentage_of_NDCs'] = (
                    atc2_summary['Unique_NDCs'] / pairs2['NDC'].nunique() * 100
                ).round(1)
                
                # ATC3 summary
                pairs3 = records[['record_id', 'NDC', 'ATC3 Class']].drop_duplicates()
                atc3_summary = pairs3.groupby('ATC3 Class').agg(
                    Unique_NDCs=('NDC', 'nunique'),
                    Total_Records=('record_id', 'nunique')
                ).sort_values('Unique_NDCs', ascending=False)
                atc3_summary['Percentage_of_NDCs'] = (
                    atc3_summary['Unique_NDCs'] / pairs3['NDC'].nunique() * 100
                ).round(1)
                
                atc2_year_results[year] = atc2_summary
                atc3_year_results[year] = atc3_summary
                
                print(f"✓ (ATC2: {len(atc2_summary)} classes, {pairs2['NDC'].nunique():,} NDCs; "
                      f"ATC3: {len(atc3_summary)} classes, {pairs3['NDC'].nunique():,} NDCs)")
            except Exception as e:
                print(f"✗ Error: {e}")
                atc2_year_results[year] = pd.DataFrame()
                atc3_year_results[year] = pd.DataFrame()
        
        # Print summaries
        print("\nUNIQUE NDCs PER ATC2 CLASS BY YEAR")
        print("="*60)
        for year in years_list:
            if not atc2_year_results[year].empty:
                print(f"\n{year}: {len(atc2_year_results[year])} classes, "
                      f"{atc2_year_results[year]['Unique_NDCs'].sum():,} total NDCs")
                print("Top 10:")
                print(atc2_year_results[year].head(10))
        
        print("\nUNIQUE NDCs PER ATC3 CLASS BY YEAR")
        print("="*60)
        for year in years_list:
            if not atc3_year_results[year].empty:
                print(f"\n{year}: {len(atc3_year_results[year])} classes, "
                      f"{atc3_year_results[year]['Unique_NDCs'].sum():,} total NDCs")
                print("Top 10:")
                print(atc3_year_results[year].head(10))
        
        # Build comparison tables
        def build_comparison(year_tables):
            all_classes = set()
            for tbl in year_tables.values():
                if not tbl.empty:
                    all_classes.update(tbl.index.tolist())
            comp = {cls: {y: int(year_tables[y].loc[cls, 'Unique_NDCs']) 
                          if not year_tables[y].empty and cls in year_tables[y].index else 0
                          for y in years_list}
                    for cls in sorted(all_classes)}
            df = pd.DataFrame(comp).T
            return df.loc[df.sum(axis=1).sort_values(ascending=False).index]
        
        atc2_comparison = build_comparison(atc2_year_results)
        atc3_comparison = build_comparison(atc3_year_results)
        
        # Create cumulative frequency tables
        def create_cumulative_frequency_table(comparison_df, level_name):
            """Create cumulative frequency table showing classes with most unique NDCs."""
            # Calculate total NDCs across all years for each class
            total_ndcs = comparison_df.sum(axis=1).sort_values(ascending=False)
            
            # Create frequency table
            freq_table = pd.DataFrame({
                'ATC_Class': total_ndcs.index,
                'Total_Unique_NDCs': total_ndcs.values,
                'Percentage': (total_ndcs.values / total_ndcs.sum() * 100).round(2)
            })
            
            # Add cumulative frequency and percentage
            freq_table['Cumulative_NDCs'] = freq_table['Total_Unique_NDCs'].cumsum()
            freq_table['Cumulative_Percentage'] = freq_table['Percentage'].cumsum().round(2)
            
            # Reset index
            freq_table.reset_index(drop=True, inplace=True)
            freq_table.index = freq_table.index + 1  # Start ranking from 1
            
            return freq_table
        
        # Generate cumulative frequency tables
        atc2_freq_table = create_cumulative_frequency_table(atc2_comparison, 'ATC2')
        atc3_freq_table = create_cumulative_frequency_table(atc3_comparison, 'ATC3')
        
        # Display results
        print("\n" + "="*80)
        print("CUMULATIVE FREQUENCY ANALYSIS - ATC2 CLASSES BY UNIQUE NDCs")
        print("="*80)
        print("Top 20 ATC2 classes with most unique NDCs across all years:")
        print(atc2_freq_table.head(20).to_string())
        
        # Show concentration analysis for ATC2
        top_5_atc2 = atc2_freq_table.head(5)['Cumulative_Percentage'].iloc[-1]
        top_10_atc2 = atc2_freq_table.head(10)['Cumulative_Percentage'].iloc[-1]
        print(f"\nConcentration Analysis (ATC2):")
        print(f"  Top 5 classes account for {top_5_atc2:.1f}% of all unique NDCs")
        print(f"  Top 10 classes account for {top_10_atc2:.1f}% of all unique NDCs")
        
        print("\n" + "="*80)
        print("CUMULATIVE FREQUENCY ANALYSIS - ATC3 CLASSES BY UNIQUE NDCs")
        print("="*80)
        print("Top 20 ATC3 classes with most unique NDCs across all years:")
        print(atc3_freq_table.head(20).to_string())
        
        # Show concentration analysis for ATC3
        top_5_atc3 = atc3_freq_table.head(5)['Cumulative_Percentage'].iloc[-1]
        top_10_atc3 = atc3_freq_table.head(10)['Cumulative_Percentage'].iloc[-1]
        print(f"\nConcentration Analysis (ATC3):")
        print(f"  Top 5 classes account for {top_5_atc3:.1f}% of all unique NDCs")
        print(f"  Top 10 classes account for {top_10_atc3:.1f}% of all unique NDCs")
        
        return atc2_year_results, atc3_year_results, atc2_comparison, atc3_comparison, atc2_freq_table, atc3_freq_table
    
    @staticmethod
    def get_atc_ndc_details(year, top_n=10):
        """Report the ATC2 and ATC3 classes with the most unique NDCs in a year.

        Runs the pipeline up to the ATC4 merge, derives ATC2/ATC3 codes from
        the first 3/4 characters of the ATC4 code, and counts unique NDCs and
        unique record_ids per class.

        Returns:
            ``(atc2_details, atc3_details)``, each limited to the ``top_n``
            classes by unique NDC count.
        """
        print(f"Analyzing ATC-NDC details for {year}...")
        print("="*60)

        analyzer = NDCATCAnalyzer(year=year)
        analyzer.clean_sdud_data()
        analyzer.adding_key()
        analyzer.analyze_atc4_mapping()

        mapping = analyzer.atc_mapping
        records = mapping[mapping['ATC4 Class'].notna()].copy()

        # Compute both levels first; printing starts only once both succeed.
        details = {}
        for level, width in (('ATC2', 3), ('ATC3', 4)):
            class_col = f'{level} Class'
            records[class_col] = records['ATC4 Class'].str[:width]
            per_class = records.groupby(class_col).agg(
                Unique_NDCs=('NDC', 'nunique'),
                Total_Records=('record_id', 'nunique')
            )
            details[level] = per_class.sort_values('Unique_NDCs', ascending=False).head(top_n)

        for level in ('ATC2', 'ATC3'):
            print(f"\nTop {top_n} {level} Classes:")
            print(details[level])

        return details['ATC2'], details['ATC3']


In [None]:
# Run the 2024 pipeline up to the ATC4 merge. The order matters: each step
# checks for state set by the previous one.
analyzer = NDCATCAnalyzer(year=2024)
analyzer.clean_sdud_data()           # Clean SDUD data
analyzer.adding_key()                # Add record_id key
analyzer.generate_ndc_txt()          # Generate NDC text file
analyzer.analyze_atc4_mapping()      # Merge ATC4 by record_id & NDC

In [None]:
# Number of distinct classes per record_id at each ATC level
# (requires the merge from the previous cell).
atc2_dist = analyzer.analyze_atc_distribution(level='ATC2')
atc3_dist = analyzer.analyze_atc_distribution(level='ATC3')
atc4_dist = analyzer.analyze_atc_distribution(level='ATC4')

In [None]:
# Finish the 2024 pipeline: add class names, rescale volumes, export the CSV.
analyzer.fetch_atc_names()           # Add ATC4/ATC3/ATC2 class names (RxNav lookups, cached on disk)
analyzer.prepare_final_dataframe()   # Units -> billions, prescriptions -> millions
analyzer.export_merged_data()  # Write the CSV with one row per record_id
# Simple version - only percentages

In [None]:
# Unique NDCs per ATC2 and ATC3 class across years
years = [2020, 2021, 2022, 2023, 2024]
#NDCATCAnalyzer.create_multi_year_distribution_analysis(years)

# Capture all returned dataframes including the new cumulative frequency tables
# (this re-runs the load/merge pipeline once per year)
atc2_year_results, atc3_year_results, atc2_comparison, atc3_comparison, atc2_freq_table, atc3_freq_table = NDCATCAnalyzer.analyze_general_atc_overview(years)

# Detailed ATC2 & ATC3 analysis for a specific year
#atc2_2023, atc3_2023 = NDCATCAnalyzer.get_atc_ndc_details(year=2024, top_n=10)
# Export combined cumulative frequency analysis with year-by-year data
import os  # already imported in the first cell; repeated so this cell can run on its own

# Create output directory
output_dir = rf"c:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\exported_analysis"
os.makedirs(output_dir, exist_ok=True)

print("Creating combined cumulative frequency analysis files...")
print("="*60)

def create_combined_analysis(comparison_df, freq_table, level_name):
    """Combine the year-by-year NDC counts with totals and cumulative shares.

    Args:
        comparison_df: Table with ATC classes as index and one column of
            unique NDC counts per year.
        freq_table: Unused; kept so existing calls keep working.
        level_name: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame sorted by ``Total_NDCs`` (descending) with the year
        columns renamed to ``NDCs_<year>``, followed by ``Total_NDCs``,
        ``Percentage``, ``Cumulative_NDCs`` and ``Cumulative_Percentage``.
    """
    # Take the year columns from the table itself rather than from the
    # notebook-level ``years`` list, so every column that enters the total is
    # also reported and the function has no hidden global dependency.
    year_columns = list(comparison_df.columns)

    combined = comparison_df.copy()

    # Calculate total and percentage columns
    combined['Total_NDCs'] = comparison_df.sum(axis=1)
    grand_total = combined['Total_NDCs'].sum()
    combined['Percentage'] = (combined['Total_NDCs'] / grand_total * 100).round(2)

    # Sort by total NDCs (descending)
    combined = combined.sort_values('Total_NDCs', ascending=False)

    # Add cumulative columns. The cumulative share is derived from the
    # cumulative counts: summing the rounded percentages accumulates
    # rounding error (e.g. three equal classes would end at 99.99).
    combined['Cumulative_NDCs'] = combined['Total_NDCs'].cumsum()
    combined['Cumulative_Percentage'] = (combined['Cumulative_NDCs'] / grand_total * 100).round(2)

    # Rename year columns to show actual years (NDCs_2020, NDCs_2021, etc.)
    renamed = {column: f'NDCs_{column}' for column in year_columns}
    combined = combined.rename(columns=renamed)

    # Reorder columns: year columns first, then summary columns
    summary_cols = ['Total_NDCs', 'Percentage', 'Cumulative_NDCs', 'Cumulative_Percentage']
    return combined[[renamed[column] for column in year_columns] + summary_cols]

# Build the combined table for each level
atc2_combined = create_combined_analysis(atc2_comparison, atc2_freq_table, 'ATC2')
atc3_combined = create_combined_analysis(atc3_comparison, atc3_freq_table, 'ATC3')

# Write one CSV per level, keeping the ATC class index as the first column
atc2_file = os.path.join(output_dir, "ATC2_Cumulative_Frequency_Analysis.csv")
atc3_file = os.path.join(output_dir, "ATC3_Cumulative_Frequency_Analysis.csv")

for combined_table, csv_path in ((atc2_combined, atc2_file), (atc3_combined, atc3_file)):
    combined_table.to_csv(csv_path, index=True)

print(f"✓ ATC2 Combined Analysis: {atc2_file}")
print(f"✓ ATC3 Combined Analysis: {atc3_file}")

print(f"\nFiles exported to: {output_dir}")

# Describe the columns of the exported files
print("\nEach file contains:")
column_notes = ["- ATC Class (index)"]
for year in years:
    column_notes.append(f"- NDCs_{year}: Unique NDCs for {year}")
column_notes.extend([
    "- Total_NDCs (sum across all years)",
    "- Percentage (% of grand total NDCs)",
    "- Cumulative_NDCs (running total)",
    "- Cumulative_Percentage (running %)",
])
print("\n".join(column_notes))

# Show preview of the data
print("\nPreview of ATC2 file (top 10 classes):")
print(atc2_combined.head(10))

print("\nPreview of ATC3 file (top 10 classes):")
print(atc3_combined.head(10))

Creating ATC2 & ATC3 Overview: Unique NDCs per Class Across Years...
Processing 2020... Reading CSV: c:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2020.csv
Initial rows: 4,922,728
Initial rows: 4,922,728
After cleaning: 2,284,815 rows, 32,220 unique NDCs
After cleaning: 2,284,815 rows, 32,220 unique NDCs
Created 2,284,815 record IDs
Created 2,284,815 record IDs
ATC4 file: 4,011,219 rows, 27,661 unique NDCs
ATC4 file: 4,011,219 rows, 27,661 unique NDCs
Merged: 4,122,259 records, 4,011,219 with ATC4 (97.3%)
Missing: 111,040 records, 4,559 unique NDCs
Merged: 4,122,259 records, 4,011,219 with ATC4 (97.3%)
Missing: 111,040 records, 4,559 unique NDCs
✓ (ATC2: 90 classes, 27,661 NDCs; ATC3: 212 classes, 27,661 NDCs)
Processing 2021... Reading CSV: c:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2021.csv
✓ (ATC2: 90 classes, 27,661 NDCs; ATC3: 212 classes, 27,661 NDCs)
Processing 2021... Reading CSV: c:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\SDUD\SDUD2021.cs

In [None]:
# Check the NDC overlap between the ATC4 files built with and without the
# record_id key. The year is defined once so the paths and the report labels
# cannot disagree (the labels previously said 2024 while the files were 2023).
comparison_year = 2023
nokey_name = f"NDCf_{comparison_year}_ATC4_classes"
keyed_name = f"NDCNEW_{comparison_year}_ATC4_classes"
nokey_path=rf'C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\ATC4_classes\Classes_notgood\{nokey_name}.csv'
keyed_path=rf'C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\ATC4_classes\{keyed_name}.csv'

# Load them
keyed = pd.read_csv(keyed_path, dtype=str)
nokey = pd.read_csv(nokey_path, dtype=str)

# Normalize NDCs (remove hyphens, pad to 11 digits)
for frame in (keyed, nokey):
    frame["NDC"] = frame["NDC"].str.replace("-", "", regex=False).str.zfill(11)

# --- Summary stats ---
# "Mapped NDCs" counts distinct NDCs that have a non-null ATC4 class; the
# previous row count overstated it whenever an NDC appeared on several rows.
# NOTE(review): placeholder texts such as 'No ATC Mapping Found' are non-null
# and would still count as mapped - confirm whether these files contain them.
summary = {
    "File": [f"With key ({keyed_name})", f"Without key ({nokey_name})"],
    "Total rows": [len(keyed), len(nokey)],
    "Unique NDCs": [keyed["NDC"].nunique(), nokey["NDC"].nunique()],
    "Mapped NDCs (non-null ATC)": [
        keyed.loc[keyed["ATC4 Class"].notna(), "NDC"].nunique(),
        nokey.loc[nokey["ATC4 Class"].notna(), "NDC"].nunique(),
    ],
}
summary_df = pd.DataFrame(summary)

# --- Compare overlap of unique NDCs ---
ndc_keyed = set(keyed["NDC"].unique())
ndc_nokey = set(nokey["NDC"].unique())

overlap_ndcs = len(ndc_keyed & ndc_nokey)
only_in_nokey = len(ndc_nokey - ndc_keyed)
only_in_keyed = len(ndc_keyed - ndc_nokey)
# Share of the without-key NDCs that also appear in the with-key file;
# undefined (nan) when the without-key file has no NDCs.
percent_overlap = overlap_ndcs / len(ndc_nokey) * 100 if ndc_nokey else float("nan")

comparison = pd.DataFrame({
    "Metric": ["Overlap NDCs", "Only in without-key file", "Only in with-key file", "Percent overlap"],
    "Value": [overlap_ndcs, only_in_nokey, only_in_keyed, percent_overlap]
})

print("\n=== Summary of Each File ===")
print(summary_df.to_string(index=False))

print("\n=== NDC Overlap Comparison ===")
print(comparison.to_string(index=False))