# **Knowledge Base Integration**

**Load Phase 3 Results & Setup**

In [None]:
# ============================================================================
# PHASE 4: KNOWLEDGE BASE INTEGRATION
# Load Previous Results and Setup
# ============================================================================

# Phase banner: a 70-column rule, the indented title, and a closing rule.
print(
    "=" * 70,
    " " * 15 + "PHASE 4: KNOWLEDGE BASE INTEGRATION",
    "=" * 70,
    sep="\n",
)

# If starting fresh session, mount drive and load data
from google.colab import drive
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
from functools import lru_cache
import time
from datetime import datetime, timedelta

# Mount drive if not already mounted
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted")

# Load paths
project_name = "MediScope_OCR_Project"
drive_project_path = f"/content/drive/MyDrive/{project_name}"
phase3_drive_path = os.path.join(drive_project_path, "Phase3_NER")

# Load NER results
print("\nüìÇ Loading NER results from Phase 3...")
ner_results_path = os.path.join(phase3_drive_path, 'ner_extraction_results.csv')

if os.path.exists(ner_results_path):
    ner_results_df = pd.read_csv(ner_results_path)
    total_rows = len(ner_results_df)
    print(f"‚úÖ Loaded {total_rows} NER results")

    if 'drug_name' in ner_results_df.columns:
        # Show drug extraction stats
        drug_found = ner_results_df['drug_name'].notna().sum()
        # An empty CSV must not abort the cell with ZeroDivisionError.
        found_ratio = drug_found / total_rows if total_rows else 0.0
        print(f"   Drug names found: {drug_found}/{total_rows} ({found_ratio:.1%})")

        # Show unique drugs
        unique_drugs = ner_results_df['drug_name'].dropna().unique()
        print(f"   Unique drug names: {len(unique_drugs)}")

        # Sample drugs; str() guards against names that pandas parsed as
        # numbers, which str.join would otherwise reject.
        print(f"   Sample drugs: {', '.join(str(drug) for drug in unique_drugs[:10])}")
    else:
        # Without the column there is nothing to summarise; keep the
        # names defined so later cells can still reference them.
        drug_found = 0
        unique_drugs = np.array([], dtype=object)
        print("   No 'drug_name' column in NER results; skipping drug statistics.")
else:
    print("‚ö†Ô∏è  NER results not found!  Please complete Phase 3 first.")
    ner_results_df = None

               PHASE 4: KNOWLEDGE BASE INTEGRATION

📂 Loading NER results from Phase 3...
✅ Loaded 10 NER results
   Drug names found: 0/10 (0.0%)
   Unique drug names: 0
   Sample drugs: 


**Step 16: Download Drug Databases**

In [None]:
# Step banner: blank line, rule, indented title, rule.
print(
    "",
    "=" * 70,
    " " * 20 + "Step 16: Download REAL Drug Databases",
    "=" * 70,
    sep="\n",
)

import urllib.request
import zipfile
import json
import requests
from io import BytesIO
from tqdm import tqdm # Added import
import time # Added import
import os # Added import for self-containment
import pandas as pd # Added import for self-containment

class ComprehensiveDrugDatabaseDownloader:
    """
    Download real, comprehensive drug databases from multiple sources.

    Attributes:
        output_dir: directory the downloaded tables are written to as CSV;
            created on construction if it does not exist.
        databases: list of ``(source_name, DataFrame)`` tuples, appended to
            after each successful download.
    """

    def __init__(self, output_dir='drug_databases'):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.databases = []

    # ========================================================================
    # METHOD 1: RxNorm API (NIH - National Library of Medicine)
    # ========================================================================

    @staticmethod
    def _extract_rxnorm_properties(props):
        """
        Pull the fields we keep out of an ``allProperties`` JSON payload.

        Returns a dict with ``generic_name`` (from the 'RxNorm Name'
        property) and/or ``term_type`` (from 'TTY') when present; an empty
        dict when the payload has neither or is not a mapping.
        """
        if not isinstance(props, dict):
            return {}

        # The RxNav reference shows 'propConceptGroup' as a single object
        # holding the 'propConcept' list. Iterating that object directly
        # walks its keys and fails, so normalise to a list of groups and
        # accept a list-shaped payload as well.
        groups = props.get('propConceptGroup') or []
        if isinstance(groups, dict):
            groups = [groups]

        extracted = {}
        for group in groups:
            if not isinstance(group, dict):
                continue
            for concept in group.get('propConcept') or []:
                prop_name = concept.get('propName', '')
                prop_value = concept.get('propValue', '')

                if prop_name == 'RxNorm Name':
                    extracted['generic_name'] = prop_value
                elif prop_name == 'TTY':
                    extracted['term_type'] = prop_value
        return extracted

    def download_rxnorm_drugs(self, max_drugs=5000):
        """
        Download ingredient-level drug records from the RxNorm REST API.

        Fetches the list of ingredient concepts (``tty=IN``), then one
        ``allProperties`` request per concept for the first ``max_drugs``
        entries. The result is written to ``rxnorm_drugs.csv`` inside
        ``self.output_dir`` and appended to ``self.databases``.

        Args:
            max_drugs: maximum number of ingredient concepts to fetch
                details for.

        Returns:
            A DataFrame with one row per drug (``drug_name``, ``rxcui``,
            ``source``, ``generic_name`` and, when reported, ``term_type``),
            or None when the concept list cannot be retrieved or the
            download fails as a whole.
        """
        print("\nüì• Downloading from RxNorm API (NIH)...")
        print(f"   This may take 10-15 minutes for {max_drugs} drugs...")

        base_url = "https://rxnav.nlm.nih.gov/REST"

        drugs_data = []
        failed = 0

        # Get all drug names first
        print("   Step 1: Getting drug names...")

        try:
            # One session for the whole run so the connection is reused
            # across the thousands of per-drug requests.
            with requests.Session() as session:
                url = f"{base_url}/allconcepts.json?tty=IN"  # IN = Ingredient
                response = session.get(url, timeout=30)

                if response.status_code != 200:
                    print(f"‚ö†Ô∏è  Failed to get all concepts from RxNorm. Status code: {response.status_code}")
                    return None

                data = response.json()
                all_drugs = (data.get('minConceptGroup') or {}).get('minConcept') or []

                print(f"   Found {len(all_drugs)} drug ingredients")

                # Limit to max_drugs
                drugs_to_process = all_drugs[:max_drugs]

                print(f"   Step 2: Getting details for {len(drugs_to_process)} drugs...")

                for idx, drug in enumerate(tqdm(drugs_to_process, desc="Fetching RxNorm")):
                    try:
                        rxcui = drug['rxcui']
                        drug_name = drug['name']

                        # Get drug properties
                        prop_url = f"{base_url}/rxcui/{rxcui}/allProperties.json?prop=all"
                        prop_response = session.get(prop_url, timeout=5)

                        drug_info = {
                            'drug_name': drug_name,
                            'rxcui': rxcui,
                            'source': 'rxnorm',
                            'generic_name': drug_name,
                        }

                        if prop_response.status_code == 200:
                            drug_info.update(
                                self._extract_rxnorm_properties(prop_response.json())
                            )

                        # The former related.json request was removed: its
                        # response was never read, so it only doubled the
                        # request count and the chance of a timeout.
                        drugs_data.append(drug_info)

                    except (requests.RequestException, KeyError, TypeError, ValueError):
                        # Best effort: skip this drug, but keep a count so
                        # the loss is visible in the summary below.
                        failed += 1
                        continue

                    # Rate limiting
                    if idx % 100 == 0 and idx > 0:
                        print(f"   Progress: {idx}/{len(drugs_to_process)}")
                        time.sleep(1)

            print(f"‚úÖ Downloaded {len(drugs_data)} drugs from RxNorm")
            if failed:
                print(f"   Skipped {failed} drugs due to request/parse errors")

            # Save RxNorm data
            df = pd.DataFrame(drugs_data)
            rxnorm_path = os.path.join(self.output_dir, 'rxnorm_drugs.csv')
            df.to_csv(rxnorm_path, index=False)
            print(f"   Saved to: {rxnorm_path}")

            self.databases.append(('rxnorm', df))
            return df

        except Exception as e:
            # Top-level boundary for the notebook cell: report and return
            # None rather than aborting the session.
            print(f"‚ö†Ô∏è  RxNorm download failed: {e}")
            return None


# Instantiate the downloader to run the cell.
# No output_dir is passed, so the constructor default 'drug_databases' is
# used; the constructor creates that directory if it is missing.
downloader = ComprehensiveDrugDatabaseDownloader()


                    Step 16: Download REAL Drug Databases


In [None]:
# ========================================================================
# METHOD 1: RxNorm API (NIH - National Library of Medicine)
# ========================================================================

def download_rxnorm_drugs(self, max_drugs=5000):
    """
    Download ingredient-level drug records from the RxNorm REST API.

    Fetches the list of ingredient concepts (``tty=IN``), then one
    ``allProperties`` request per concept for the first ``max_drugs``
    entries. The result is written to ``rxnorm_drugs.csv`` inside
    ``self.output_dir`` and appended to ``self.databases``.

    NOTE(review): this is a module-level function that takes ``self``; the
    visible code never binds it to a class -- confirm whether it is still
    needed alongside the class's own method of the same name.

    Args:
        self: object providing ``output_dir`` and a ``databases`` list.
        max_drugs: maximum number of ingredient concepts to fetch details for.

    Returns:
        A DataFrame with one row per drug, or None when the concept list
        cannot be retrieved or the download fails as a whole.
    """
    print("\nüì• Downloading from RxNorm API (NIH)...")
    print(f"   This may take 10-15 minutes for {max_drugs} drugs...")

    base_url = "https://rxnav.nlm.nih.gov/REST"

    drugs_data = []
    failed = 0

    # Get all drug names first
    print("   Step 1: Getting drug names...")

    try:
        # The URL previously contained stray spaces ("allconcepts. json? tty=IN"),
        # which does not name the allconcepts endpoint.
        url = f"{base_url}/allconcepts.json?tty=IN"  # IN = Ingredient
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            print(f"‚ö†Ô∏è  Failed to get all concepts from RxNorm. Status code: {response.status_code}")
            return None

        data = response.json()
        all_drugs = (data.get('minConceptGroup') or {}).get('minConcept') or []

        print(f"   Found {len(all_drugs)} drug ingredients")

        # Limit to max_drugs
        drugs_to_process = all_drugs[:max_drugs]

        print(f"   Step 2: Getting details for {len(drugs_to_process)} drugs...")

        for idx, drug in enumerate(tqdm(drugs_to_process, desc="Fetching RxNorm")):
            try:
                rxcui = drug['rxcui']
                drug_name = drug['name']

                # Get drug properties
                prop_url = f"{base_url}/rxcui/{rxcui}/allProperties.json?prop=all"
                prop_response = requests.get(prop_url, timeout=5)

                drug_info = {
                    'drug_name': drug_name,
                    'rxcui': rxcui,
                    'source': 'rxnorm',
                    'generic_name': drug_name,
                }

                if prop_response.status_code == 200:
                    props = prop_response.json()

                    # The RxNav reference shows 'propConceptGroup' as one
                    # object holding the 'propConcept' list; normalise to a
                    # list of groups so both shapes are handled instead of
                    # iterating the object's keys.
                    groups = props.get('propConceptGroup') or []
                    if isinstance(groups, dict):
                        groups = [groups]

                    for group in groups:
                        for concept in group.get('propConcept') or []:
                            prop_name = concept.get('propName', '')
                            prop_value = concept.get('propValue', '')

                            if prop_name == 'RxNorm Name':
                                drug_info['generic_name'] = prop_value
                            elif prop_name == 'TTY':
                                drug_info['term_type'] = prop_value

                # The former related.json request was removed: its response
                # was never read, so it only doubled the request count.
                drugs_data.append(drug_info)

            except (requests.RequestException, AttributeError, KeyError, TypeError, ValueError):
                # Best effort: skip this drug but count it so the loss is
                # reported rather than silent.
                failed += 1
                continue

            # Rate limiting
            if idx % 100 == 0 and idx > 0:
                print(f"   Progress: {idx}/{len(drugs_to_process)}")
                time.sleep(1)

        print(f"‚úÖ Downloaded {len(drugs_data)} drugs from RxNorm")
        if failed:
            print(f"   Skipped {failed} drugs due to request/parse errors")

        # Save RxNorm data
        df = pd.DataFrame(drugs_data)
        rxnorm_path = os.path.join(self.output_dir, 'rxnorm_drugs.csv')
        df.to_csv(rxnorm_path, index=False)
        print(f"   Saved to: {rxnorm_path}")

        self.databases.append(('rxnorm', df))
        return df

    except Exception as e:
        # Top-level boundary for the notebook cell: report and return None.
        print(f"‚ö†Ô∏è  RxNorm download failed: {e}")
        return None

In [None]:
# Run the RxNorm download with the method's default max_drugs=5000.
# The method returns a DataFrame on success and None on failure, so
# rxnorm_df may be None afterwards.
rxnorm_df = downloader.download_rxnorm_drugs()


📥 Downloading from RxNorm API (NIH)...
   This may take 10-15 minutes for 5000 drugs...
   Step 1: Getting drug names...
   Found 14609 drug ingredients
   Step 2: Getting details for 5000 drugs...


Fetching RxNorm:  95%|█████████▌| 4763/5000 [22:11<01:04,  3.65it/s]

In [None]:
# ============================================================================
# Step 17 (UPDATED): Build Master Knowledge Base from Real Data
# ============================================================================

# Step banner: blank line, rule, indented title, rule.
print(
    "",
    "=" * 70,
    " " * 20 + "Step 17: Build Master Knowledge Base",
    "=" * 70,
    sep="\n",
)

class DrugKnowledgeBaseBuilder:
    """
    Build comprehensive drug knowledge base from downloaded databases
    """

    def __init__(self, database_dir='drug_databases'):
        self.database_dir = database_dir
        self. master_db = None
        self. brand_mappings = None

    def load_all_databases(self):
        """
        Load all downloaded databases
        """
        print("\nüìÇ Loading downloaded databases...")

        databases = {}

        # Check for WHO database
        who_path = os.path.join(self.database_dir, 'who_essential_medicines.csv')
        if os.path.exists(who_path):
            databases['who'] = pd.read_csv(who_path)
            print(f"   ‚úÖ WHO:  {len(databases['who'])} drugs")

        # Check for RxNorm database
        rxnorm_path = os.path. join(self.database_dir, 'rxnorm_drugs.csv')
        if os.path.exists(rxnorm_path):
            databases['rxnorm'] = pd.read_csv(rxnorm_path)
            print(f"   ‚úÖ RxNorm: {len(databases['rxnorm'])} drugs")

        # Check for FDA NDC database
        ndc_path = os.path. join(self.database_dir, 'fda_ndc_drugs.csv')
        if os.path.exists(ndc_path):
            databases['fda_ndc'] = pd.read_csv(ndc_path)
            print(f"   ‚úÖ FDA NDC: {len(databases['fda_ndc'])} drugs")

        if not databases:
            print("   ‚ö†Ô∏è  No databases found!  Using sample data...")
            return self._create_fallback_database()

        return databases

    def _create_fallback_database(self):
        """
        Create enhanced fallback database with more common drugs
        """
        print("\nüîß Creating enhanced fallback database...")

        common_drugs = {
            # Analgesics & Anti-inflammatory
            'Aspirin': {'class': 'NSAID', 'use': 'Pain relief, fever reduction, anti-inflammatory, cardiovascular protection'},
            'Paracetamol': {'class':  'Analgesic', 'use': 'Pain relief, fever reduction'},
            'Acetaminophen': {'class': 'Analgesic', 'use': 'Pain relief, fever reduction'},
            'Ibuprofen': {'class': 'NSAID', 'use':  'Pain relief, anti-inflammatory, fever reduction'},
            'Naproxen': {'class': 'NSAID', 'use': 'Pain relief, anti-inflammatory'},
            'Diclofenac': {'class':  'NSAID', 'use': 'Pain relief, anti-inflammatory'},

            # Antibiotics
            'Amoxicillin': {'class':  'Penicillin Antibiotic', 'use': 'Bacterial infections'},
            'Azithromycin': {'class': 'Macrolide Antibiotic', 'use': 'Bacterial infections'},
            'Ciprofloxacin': {'class': 'Fluoroquinolone', 'use': 'Bacterial infections'},
            'Doxycycline': {'class': 'Tetracycline', 'use': 'Bacterial infections'},
            'Cephalexin': {'class': 'Cephalosporin', 'use': 'Bacterial infections'},
            'Metronidazole': {'class':  'Antibiotic', 'use': 'Bacterial and protozoal infections'},

            # Cardiovascular
            'Atenolol': {'class': 'Beta Blocker', 'use': 'High blood pressure, angina'},
            'Metoprolol': {'class': 'Beta Blocker', 'use': 'High blood pressure, heart failure'},
            'Amlodipine': {'class':  'Calcium Channel Blocker', 'use': 'High blood pressure, angina'},
            'Lisinopril': {'class': 'ACE Inhibitor', 'use': 'High blood pressure, heart failure'},
            'Enalapril': {'class': 'ACE Inhibitor', 'use': 'High blood pressure, heart failure'},
            'Losartan': {'class': 'ARB', 'use': 'High blood pressure'},
            'Hydrochlorothiazide': {'class': 'Diuretic', 'use': 'High blood pressure, edema'},
            'Furosemide': {'class': 'Loop Diuretic', 'use': 'Edema, heart failure'},
            'Atorvastatin': {'class': 'Statin', 'use': 'High cholesterol'},
            'Simvastatin': {'class': 'Statin', 'use': 'High cholesterol'},
            'Rosuvastatin': {'class': 'Statin', 'use': 'High cholesterol'},
            'Clopidogrel': {'class':  'Antiplatelet', 'use': 'Blood clot prevention'},
            'Warfarin': {'class': 'Anticoagulant', 'use': 'Blood clot prevention'},

            # Diabetes
            'Metformin':  {'class': 'Biguanide', 'use': 'Type 2 diabetes'},
            'Glipizide': {'class': 'Sulfonylurea', 'use': 'Type 2 diabetes'},
            'Glyburide': {'class': 'Sulfonylurea', 'use': 'Type 2 diabetes'},
            'Insulin': {'class': 'Hormone', 'use': 'Diabetes mellitus'},

            # Gastrointestinal
            'Omeprazole': {'class':  'Proton Pump Inhibitor', 'use': 'GERD, ulcers, acid reflux'},
            'Pantoprazole': {'class': 'Proton Pump Inhibitor', 'use': 'GERD, ulcers'},
            'Esomeprazole': {'class':  'Proton Pump Inhibitor', 'use': 'GERD, ulcers'},
            'Ranitidine': {'class': 'H2 Blocker', 'use': 'GERD, ulcers'},
            'Ondansetron': {'class': 'Antiemetic', 'use': 'Nausea, vomiting'},
            'Metoclopramide': {'class':  'Antiemetic', 'use': 'Nausea, GERD'},

            # Respiratory
            'Salbutamol': {'class': 'Beta-2 Agonist', 'use': 'Asthma, COPD'},
            'Albuterol': {'class': 'Beta-2 Agonist', 'use': 'Asthma, COPD'},
            'Montelukast': {'class': 'Leukotriene Receptor Antagonist', 'use': 'Asthma, allergies'},
            'Budesonide': {'class': 'Corticosteroid', 'use': 'Asthma, allergic rhinitis'},

            # Antihistamines & Allergies
            'Cetirizine': {'class': 'Antihistamine', 'use':  'Allergies, hay fever'},
            'Loratadine': {'class': 'Antihistamine', 'use':  'Allergies, hay fever'},
            'Fexofenadine': {'class': 'Antihistamine', 'use': 'Allergies'},
            'Diphenhydramine': {'class':  'Antihistamine', 'use': 'Allergies, sleep aid'},

            # Corticosteroids
            'Prednisone': {'class': 'Corticosteroid', 'use': 'Inflammation, autoimmune conditions'},
            'Prednisolone': {'class': 'Corticosteroid', 'use': 'Inflammation, allergies'},
            'Dexamethasone': {'class':  'Corticosteroid', 'use': 'Inflammation, edema'},
            'Hydrocortisone': {'class': 'Corticosteroid', 'use': 'Inflammation, skin conditions'},

            # Thyroid
            'Levothyroxine': {'class': 'Thyroid Hormone', 'use': 'Hypothyroidism'},

            # Psychiatric
            'Sertraline': {'class': 'SSRI Antidepressant', 'use': 'Depression, anxiety'},
            'Fluoxetine': {'class': 'SSRI Antidepressant', 'use': 'Depression, anxiety'},
            'Escitalopram': {'class': 'SSRI Antidepressant', 'use': 'Depression, anxiety'},
            'Amitriptyline': {'class': 'Tricyclic Antidepressant', 'use': 'Depression, neuropathic pain'},
            'Alprazolam': {'class': 'Benzodiazepine', 'use': 'Anxiety, panic disorder'},
            'Diazepam': {'class': 'Benzodiazepine', 'use': 'Anxiety, seizures, muscle spasms'},
            'Lorazepam': {'class': 'Benzodiazepine', 'use': 'Anxiety'},

            # Pain (Opioids)
            'Tramadol': {'class': 'Opioid Analgesic', 'use':  'Moderate to severe pain'},
            'Codeine': {'class': 'Opioid Analgesic', 'use': 'Pain, cough'},
            'Morphine': {'class': 'Opioid Analgesic', 'use': 'Severe pain'},

            # Vitamins & Supplements
            'Vitamin D':  {'class': 'Vitamin', 'use': 'Bone health, immune function'},
            'Vitamin B12': {'class': 'Vitamin', 'use': 'Anemia, nerve function'},
            'Folic Acid': {'class': 'Vitamin', 'use':  'Anemia prevention, pregnancy'},
            'Calcium': {'class': 'Mineral', 'use': 'Bone health'},
            'Iron':  {'class': 'Mineral', 'use': 'Anemia'},

            # Others
            'Gabapentin': {'class': 'Anticonvulsant', 'use': 'Neuropathic pain, seizures'},
            'Pregabalin': {'class': 'Anticonvulsant', 'use': 'Neuropathic pain, fibromyalgia'},
            'Allopurinol': {'class':  'Xanthine Oxidase Inhibitor', 'use': 'Gout'},
            'Tamsulosin': {'class': 'Alpha Blocker', 'use': 'Benign prostatic hyperplasia'},
        }

        # Convert to DataFrame
        fallback_data = []
        for drug, info in common_drugs.items():
            fallback_data.append({
                'drug_name': drug,
                'generic_name': drug,
                'drug_class': info['class'],
                'therapeutic_use': info['use'],
                'source': 'fallback'
            })

        df = pd.DataFrame(fallback_data)
        print(f"   ‚úÖ Created fallback database with {len(df)} common drugs")

        return {'fallback': df}

    def normalize_drug_name(self, name):
        """Normalize drug name for matching"""
        if pd.isna(name) or not name:
            return None
        return str(name).lower().strip().replace('¬Æ', '').replace('‚Ñ¢', '').replace('-', ' ')

    def merge_databases(self, databases):
        """
        Merge all databases into master knowledge base
        """
        print("\nüî® Merging databases into master knowledge base...")

        all_drugs = []

        for source_name, df in databases.items():
            print(f"   Processing {source_name}:   {len(df)} drugs")

            # Standardize column names
            df_copy = df.copy()

            # Ensure required columns exist
            if 'drug_name' not in df_copy.columns:
                if 'name' in df_copy.columns:
                    df_copy['drug_name'] = df_copy['name']
                else:
                    continue

            # Add source
            df_copy['data_source'] = source_name

            # Add normalized name
            df_copy['drug_name_normalized'] = df_copy['drug_name'].apply(self.normalize_drug_name)

            # Ensure other columns
            for col in ['generic_name', 'drug_class', 'therapeutic_use']:
                if col not in df_copy.columns:
                    df_copy[col] = None

            all_drugs.append(df_copy)

        # Combine all
        master = pd.concat(all_drugs, ignore_index=True)

        # Remove entries without drug name
        master = master[master['drug_name_normalized']. notna()]

        print(f"\n   üìä Before deduplication: {len(master)} entries")

        # Deduplicate - keep entry with most information
        master['info_completeness'] = (
            master['generic_name'].notna().astype(int) +
            master['drug_class'].notna().astype(int) +
            master['therapeutic_use'].notna().astype(int)
        )

        master = master.sort_values('info_completeness', ascending=False)
        master = master.drop_duplicates(subset=['drug_name_normalized'], keep='first')

        print(f"   üìä After deduplication:   {len(master)} unique drugs")

        self.master_db = master
        return master

    def create_brand_name_mappings(self):
        """
        Create brand-to-generic name mappings
        """
        print("\nüìù Creating brand name mappings...")

        # Common brand-to-generic mappings
        brand_to_generic = {
            # Pain relievers
            'tylenol': 'paracetamol',
            'panadol': 'paracetamol',
            'crocin': 'paracetamol',
            'advil':  'ibuprofen',
            'motrin': 'ibuprofen',
            'brufen': 'ibuprofen',
            'nurofen': 'ibuprofen',
            'aleve': 'naproxen',
            'bayer': 'aspirin',
            'ecotrin': 'aspirin',
            'disprin': 'aspirin',
            'voltaren': 'diclofenac',

            # Cardiovascular
            'lipitor': 'atorvastatin',
            'crestor': 'rosuvastatin',
            'zocor': 'simvastatin',
            'norvasc': 'amlodipine',
            'prinivil': 'lisinopril',
            'zestril': 'lisinopril',
            'lopressor': 'metoprolol',
            'toprol': 'metoprolol',
            'tenormin': 'atenolol',
            'coumadin': 'warfarin',
            'plavix': 'clopidogrel',
            'lasix': 'furosemide',

            # Gastrointestinal
            'prilosec': 'omeprazole',
            'nexium':  'esomeprazole',
            'prevacid': 'lansoprazole',
            'protonix': 'pantoprazole',
            'zantac': 'ranitidine',
            'pepcid': 'famotidine',
            'zofran': 'ondansetron',

            # Antibiotics
            'augmentin': 'amoxicillin',
            'amoxil': 'amoxicillin',
            'zithromax': 'azithromycin',
            'cipro': 'ciprofloxacin',
            'flagyl': 'metronidazole',
            'keflex': 'cephalexin',

            # Diabetes
            'glucophage': 'metformin',

            # Respiratory
            'ventolin': 'salbutamol',
            'proventil': 'albuterol',
            'singulair': 'montelukast',

            # Antihistamines
            'zyrtec': 'cetirizine',
            'claritin': 'loratadine',
            'allegra': 'fexofenadine',
            'benadryl':  'diphenhydramine',

            # Thyroid
            'synthroid': 'levothyroxine',
            'levoxyl': 'levothyroxine',

            # Psychiatric
            'zoloft': 'sertraline',
            'prozac': 'fluoxetine',
            'lexapro':  'escitalopram',
            'xanax': 'alprazolam',
            'valium': 'diazepam',
            'ativan': 'lorazepam',

            # Pain (opioids)
            'ultram': 'tramadol',

            # Others
            'neurontin': 'gabapentin',
            'lyrica': 'pregabalin',
            'flomax': 'tamsulosin',
        }

        # Create mappings DataFrame
        mappings = []

        for brand, generic in brand_to_generic.items():
            # Check if generic exists in master database
            if self.master_db is not None:
                generic_normalized = self.normalize_drug_name(generic)

                matches = self.master_db[
                    self.master_db['drug_name_normalized'] == generic_normalized
                ]

                if not matches.empty:
                    mappings.append({
                        'brand_name': brand,
                        'brand_name_normalized': self.normalize_drug_name(brand),
                        'maps_to_generic': generic,
                        'maps_to_normalized': generic_normalized
                    })

        self.brand_mappings = pd.DataFrame(mappings)

        print(f"   ‚úÖ Created {len(self.brand_mappings)} brand name mappings")

        return self.brand_mappings

    def save_knowledge_base(self, output_dir='. '):
        """
        Save master knowledge base and mappings
        """
        print("\nüíæ Saving knowledge base...")

        if self.master_db is not None:
            kb_path = os.path.join(output_dir, 'medicine_knowledge_base.csv')
            self.master_db.to_csv(kb_path, index=False)
            print(f"   ‚úÖ Master KB saved:  {kb_path}")
            print(f"      Total drugs: {len(self.master_db)}")

        if self.brand_mappings is not None:
            mappings_path = os.path. join(output_dir, 'medicine_knowledge_base_brand_mappings.csv')
            self.brand_mappings.to_csv(mappings_path, index=False)
            print(f"   ‚úÖ Brand mappings saved:   {mappings_path}")
            print(f"      Total mappings: {len(self.brand_mappings)}")

    def get_statistics(self):
        """
        Get knowledge base statistics
        """
        if self.master_db is None:
            return None

        stats = {
            'total_drugs': len(self.master_db),
            'unique_classes': self.master_db['drug_class'].nunique(),
            'sources': self.master_db['data_source'].value_counts().to_dict(),
            'with_generic_name': self.master_db['generic_name'].notna().sum(),
            'with_drug_class': self.master_db['drug_class'].notna().sum(),
            'with_therapeutic_use': self.master_db['therapeutic_use'].notna().sum(),
            'brand_mappings': len(self. brand_mappings) if self.brand_mappings is not None else 0
        }

        return stats

# ============================================================================
# Execute Step 17
# ============================================================================

print("\nüöÄ Building master knowledge base...")

# Pipeline: load the source tables, merge them, derive the brand mappings,
# write both tables to disk, then collect the summary numbers.
kb_builder = DrugKnowledgeBaseBuilder()
databases = kb_builder.load_all_databases()
master_kb = kb_builder.merge_databases(databases)
brand_mappings = kb_builder.create_brand_name_mappings()
kb_builder.save_knowledge_base()
stats = kb_builder.get_statistics()

print("", "=" * 70, "KNOWLEDGE BASE STATISTICS", "=" * 70, sep="\n")

for key, value in stats.items():
    if key != 'sources':
        # Scalar entries print as "Title Cased Key: value".
        print(f"   {key.replace('_', ' ').title()}: {value}")
        continue

    # The per-source counts get their own indented sub-list.
    print("\n   Data Sources:")
    for source, count in value.items():
        print(f"      {source}: {count} drugs")

print("=" * 70)

# Show sample drugs
print("\nüìã Sample Drugs in Knowledge Base:")
print("=" * 70)

if master_kb is not None:
    # Only rows with a drug class are shown, at most ten of them.
    sample = master_kb[master_kb['drug_class'].notna()].head(10)

    for idx, row in sample.iterrows():
        use_preview = str(row.get('therapeutic_use', 'N/A'))[:80]
        print(f"\n{row['drug_name']}")
        print(f"   Class: {row.get('drug_class', 'N/A')}")
        print(f"   Use: {use_preview}...")
        print(f"   Source: {row.get('data_source', 'N/A')}")

print("\n‚úÖ Step 17 Complete:   Master knowledge base built with real data!")