## **Installing Required Packages**

In [2]:
# Install necessary packages
!pip install biopython pandas numpy requests python-dotenv

# For medical NLP
!pip install spacy scispacy

Collecting spacy
  Using cached spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting scispacy
  Using cached scispacy-0.6.2-py3-none-any.whl.metadata (20 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.10-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasab

In [3]:
#for vector embedding generation
!pip install transformers sentence-transformers torch pandas numpy scikit-learn



In [4]:
  #for vector database setup

# Install FAISS and dependencies
!pip install faiss-cpu

# Other utilities
!pip install numpy pandas scikit-learn sentence-transformers



In [5]:
# Install LLM libraries for RAG pipeline
!pip install openai anthropic  # For API-based LLMs
# OR
!pip install transformers torch  # For local open-source LLMs

# Install RAG framework
!pip install langchain langchain-openai langchain-anthropic

# Utilities
!pip install python-dotenv tiktoken

Collecting anthropic
  Downloading anthropic-0.72.0-py3-none-any.whl.metadata (28 kB)
Collecting docstring-parser<1,>=0.15 (from anthropic)
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Downloading anthropic-0.72.0-py3-none-any.whl (357 kB)
Downloading docstring_parser-0.17.0-py3-none-any.whl (36 kB)
Installing collected packages: docstring-parser, anthropic
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [anthropic]
[1A[2KSuccessfully installed anthropic-0.72.0 docstring-parser-0.17.0
Collecting langchain
  Downloading langchain-1.0.3-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
Collecting langchain-anthropic
  Downloading langchain_anthropic-1.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-core<2.0.0,>=1.0.0 (from langchain)
  Downloading langchain_core-1.0.2-py3-none-any.whl.metadata (3.5 kB)
Collecting langgraph<1.1.0,>=1.0.2 (fro

In [70]:
!pip install google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.186.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.42.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf (from google-generativeai)
  Downloading protobuf-5.29.5-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting googleapis-comm

## **Collecting Data**

In [7]:
from Bio import Entrez
import pandas as pd
import time
from datetime import datetime

Entrez.email = "meghsuhanths2306@gmail.com"

def collect_pubmed_fixed(search_term, max_results=100):
    """
    Fixed PubMED collection with proper XML handling
    """

    print(f"Collecting articles for: {search_term}")


    #Searching for PMIDs
    try:
        handle = Entrez.esearch(
            db="pubmed",
            term=search_term,
            retmax=max_results,
            sort="relevance"
        )

        results = Entrez.read(handle)
        handle.close()

        pmids = results['IdList']
        total_count = int(results['Count'])

        print(f"Found {total_count} total articles")
        print(f"Retrieving {len(pmids)} articles\n")

        if len(pmids) == 0:
            print("No PMIDs found!")
            return []

    except Exception as e:
        print(f"Search failed: {e}")
        return []

    #Fetching abstracts
    print("Step 2: Fetching article details...")
    articles = []
    batch_size = 50

    for i in range(0, len(pmids), batch_size):
        batch_pmids = pmids[i:i+batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (len(pmids) + batch_size - 1) // batch_size

        print(f"  Fetching batch {batch_num}/{total_batches} ({len(batch_pmids)} articles)...")

        try:

            handle = Entrez.efetch(
                db="pubmed",
                id=batch_pmids,
                rettype="abstract",
                retmode="xml"
            )
            records = Entrez.read(handle)
            handle.close()

            # Processing each article
            for record in records['PubmedArticle']:
                try:
                    medline_citation = record['MedlineCitation']
                    article = medline_citation['Article']

                    # Extracting PMID, title, abstract, journal, publication date, author
                    pmid = str(medline_citation['PMID'])
                    title = article.get('ArticleTitle', '')

                    abstract = ''
                    if 'Abstract' in article:
                        abstract_texts = article['Abstract'].get('AbstractText', [])
                        # Joining all abstract parts
                        abstract = ' '.join([str(text) for text in abstract_texts])

                    journal = article.get('Journal', {}).get('Title', '')

                    # Extracting publication date
                    pub_date = ''
                    if 'Journal' in article and 'JournalIssue' in article['Journal']:
                        pub_info = article['Journal']['JournalIssue'].get('PubDate', {})
                        year = pub_info.get('Year', '')
                        month = pub_info.get('Month', '')
                        pub_date = f"{year}-{month}" if year else ''

                    authors = []
                    if 'AuthorList' in article:
                        for author in article['AuthorList']:
                            if 'LastName' in author:
                                name = f"{author.get('LastName', '')} {author.get('Initials', '')}"
                                authors.append(name.strip())

                    # Only keep if we have an abstract
                    if abstract and len(abstract) > 50:
                        article_data = {
                            'pmid': pmid,
                            'title': title,
                            'abstract': abstract,
                            'journal': journal,
                            'publication_date': pub_date,
                            'authors': ', '.join(authors),
                            'num_authors': len(authors),
                            'abstract_length': len(abstract)
                        }
                        articles.append(article_data)

                except Exception as e:
                    print(f"    Warning: Skipped one article: {str(e)[:50]}")
                    continue

            print(f"  Collected {len(articles)} articles so far")
            time.sleep(0.4)

        except Exception as e:
            print(f"    Batch {batch_num} failed: {e}")
            continue

    print(f"\n Collection complete: {len(articles)} articles with abstracts\n")
    return articles

In [8]:
# just testing with small data

print("TESTING DATA COLLECTION")

# collecting 50 diabetes articles
test_data = collect_pubmed_fixed("diabetes", max_results=50)

if len(test_data) > 0:
    print(f"Collected {len(test_data)} articles\n")

    print("Sample Article:")
    print(f"PMID: {test_data[0]['pmid']}")
    print(f"Title: {test_data[0]['title']}")
    print(f"Journal: {test_data[0]['journal']}")
    print(f"Authors: {test_data[0]['authors'][:80]}...")
    print(f"Abstract length: {test_data[0]['abstract_length']} characters")
    print(f"\nAbstract preview:")
    print(test_data[0]['abstract'][:300] + "...\n")

    # Converting to DataFrame
    df = pd.DataFrame(test_data)
    print(f"DataFrame shape: {df.shape}")
    print(f"\nDataFrame columns: {list(df.columns)}")

else:
    print("\n No articles collected")

TESTING DATA COLLECTION
Collecting articles for: diabetes
Found 1081017 total articles
Retrieving 50 articles

Step 2: Fetching article details...
  Fetching batch 1/1 (50 articles)...
  Collected 38 articles so far

 Collection complete: 38 articles with abstracts

Collected 38 articles

Sample Article:
PMID: 32741486
Title: Diabetes Insipidus: An Update.
Journal: Endocrinology and metabolism clinics of North America
Authors: Refardt J, Winzeler B, Christ-Crain M...
Abstract length: 775 characters

Abstract preview:
The differential diagnosis of diabetes insipidus involves the distinction between central or nephrogenic diabetes insipidus and primary polydipsia. Differentiation is important because treatment strategies vary; the wrong treatment can be dangerous. Reliable differentiation is difficult especially i...

DataFrame shape: (38, 8)

DataFrame columns: ['pmid', 'title', 'abstract', 'journal', 'publication_date', 'authors', 'num_authors', 'abstract_length']


In [9]:
def collect_medical_specialties(max_per_specialty=10000):
    """
    Collect data for all medical specialties
    """

    # medical specialties
    specialties = {
        'cardiology': 'cardiology AND 2020:2024[DP] AND English[LA]',
        'diabetes': 'diabetes mellitus AND 2020:2024[DP] AND English[LA]',
        'infectious_diseases': 'infectious diseases AND 2020:2024[DP] AND English[LA]'
    }

    all_articles = []

    for specialty_name, query in specialties.items():
        print(f"COLLECTING: {specialty_name.upper()}")

        # Collect articles
        articles = collect_pubmed_fixed(query, max_results=max_per_specialty)

        # Adding specialty label
        for article in articles:
            article['specialty'] = specialty_name
            article['collection_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        all_articles.extend(articles)

        print(f"\n {specialty_name}: {len(articles)} articles collected")
        print(f"Total so far: {len(all_articles)} articles\n")

        # Saving checkpoints
        checkpoint_df = pd.DataFrame(articles)
        checkpoint_df.to_csv(f'data_{specialty_name}_checkpoint.csv', index=False)
        print(f"Checkpoint saved: data_{specialty_name}_checkpoint.csv")

        # Wait between specialties
        time.sleep(2)

    return all_articles

automatically saves checkpoints. So, we can stop whenever we want

In [10]:
#Full collection for 3 specialities
medical_data = collect_medical_specialties(max_per_specialty=1000)  # with 1K per specialty

df_final = pd.DataFrame(medical_data)

# Save final dataset
df_final.to_csv('medical_literature_dataset.csv', index=False)
df_final.to_json('medical_literature_dataset.json', orient='records', indent=2)

print(f"Total articles: {len(df_final)}")
print(f"Specialties: {df_final['specialty'].value_counts().to_dict()}")
print(f"\nFiles saved:")
print(" medical_literature_dataset.csv")
print(" medical_literature_dataset.json")

COLLECTING: CARDIOLOGY
Collecting articles for: cardiology AND 2020:2024[DP] AND English[LA]
Found 222692 total articles
Retrieving 1000 articles

Step 2: Fetching article details...
  Fetching batch 1/20 (50 articles)...
  Collected 33 articles so far
  Fetching batch 2/20 (50 articles)...
  Collected 59 articles so far
  Fetching batch 3/20 (50 articles)...
  Collected 90 articles so far
  Fetching batch 4/20 (50 articles)...
  Collected 121 articles so far
  Fetching batch 5/20 (50 articles)...
  Collected 145 articles so far
  Fetching batch 6/20 (50 articles)...
  Collected 169 articles so far
  Fetching batch 7/20 (50 articles)...
  Collected 189 articles so far
  Fetching batch 8/20 (50 articles)...
  Collected 214 articles so far
  Fetching batch 9/20 (50 articles)...
  Collected 227 articles so far
  Fetching batch 10/20 (50 articles)...
  Collected 238 articles so far
  Fetching batch 11/20 (50 articles)...
  Collected 264 articles so far
  Fetching batch 12/20 (50 articles).

## **Dataset Info & Display Example**

In [11]:
# Load and display dataset information
df = pd.read_csv('medical_literature_dataset.csv')

print("DATASET INFO", '\n')

print(f"\n1. Dataset Shape: {df.shape}")
print(f"   - Rows (articles): {df.shape[0]}")
print(f"   - Columns (features): {df.shape[1]}")

print(f"\n2. Column Names and Types:")
print(df.dtypes)

print(f"\n3. Articles per Specialty:")
print(df['specialty'].value_counts())

print(f"\n4. Missing Values:")
print(df.isnull().sum())

print(f"\n5. Abstract Length Statistics:")
print(df['abstract_length'].describe())

DATASET INFO 


1. Dataset Shape: (1954, 10)
   - Rows (articles): 1954
   - Columns (features): 10

2. Column Names and Types:
pmid                 int64
title               object
abstract            object
journal             object
publication_date    object
authors             object
num_authors          int64
abstract_length      int64
specialty           object
collection_date     object
dtype: object

3. Articles per Specialty:
specialty
infectious_diseases    785
diabetes               728
cardiology             441
Name: count, dtype: int64

4. Missing Values:
pmid                0
title               0
abstract            0
journal             0
publication_date    4
authors             9
num_authors         0
abstract_length     0
specialty           0
collection_date     0
dtype: int64

5. Abstract Length Statistics:
count    1954.000000
mean     1370.468270
std       508.439668
min        54.000000
25%      1026.750000
50%      1393.500000
75%      1706.000000
max      32

In [12]:
print(f"\n6. First 5 Rows:")
print(df.head())

print(f"\n7. Sample Abstract:")
sample = df.iloc[0]
print(f"PMID: {sample['pmid']}")
print(f"Title: {sample['title']}")
print(f"Specialty: {sample['specialty']}")
print(f"Abstract: {sample['abstract'][:500]}...")


6. First 5 Rows:
       pmid                                              title  \
0  33332149  2020 ACC/AHA Guideline for the Management of P...   
1  32370835  Artificial Intelligence in Cardiology: Present...   
2  34338485                   Machine learning for cardiology.   
3  32216916  Evaluation for Heart Transplantation and LVAD ...   
4  38593946  Artificial Intelligence for Cardiovascular Car...   

                                            abstract  \
0  This executive summary of the valvular heart d...   
1  Artificial intelligence (AI) is a nontechnical...   
2  This paper reviews recent cardiology literatur...   
3  Timely referrals for transplantation and left ...   
4  Recent artificial intelligence (AI) advancemen...   

                                         journal publication_date  \
0                                    Circulation         2021-Feb   
1                        Mayo Clinic proceedings         2020-May   
2               Minerva cardiology and an

## **Risk Management & Trustworthiness**

In [13]:
#1. Data Quality Validation
import pandas as pd
import numpy as np
from datetime import datetime
import re

def validate_data_quality(df):
    """
    Comprehensive data quality validation for medical literature
    Risk Mitigation: Ensures data quality and completeness
    """
    print("DATA QUALITY VALIDATION")

    quality_report = {}
    issues_found = []

    # 1. Check for missing critical fields
    print("\n1. Checking for missing critical fields...")
    critical_fields = ['pmid', 'title', 'abstract', 'journal']

    for field in critical_fields:
        missing_count = df[field].isnull().sum()
        missing_pct = (missing_count / len(df)) * 100
        quality_report[f'missing_{field}'] = missing_count

        print(f"   {field}: {missing_count} missing ({missing_pct:.2f}%)")

        if missing_count > 0:
            issues_found.append(f"{field} has {missing_count} missing values")

    # 2. Check abstract length quality
    print("\n2. Checking abstract quality...")

    # Very short abstracts (likely incomplete)
    short_abstracts = (df['abstract_length'] < 100).sum()
    short_pct = (short_abstracts / len(df)) * 100
    quality_report['short_abstracts'] = short_abstracts

    print(f"   Very short abstracts (<100 chars): {short_abstracts} ({short_pct:.2f}%)")

    if short_pct > 5:
        issues_found.append(f"High percentage of short abstracts: {short_pct:.2f}%")

    # Very long abstracts (might be corrupted)
    long_abstracts = (df['abstract_length'] > 5000).sum()
    quality_report['long_abstracts'] = long_abstracts

    print(f"   Very long abstracts (>5000 chars): {long_abstracts}")

    # 3. Check for duplicate PMIDs
    print("\n3. Checking for duplicates...")
    duplicates = df['pmid'].duplicated().sum()
    quality_report['duplicates'] = duplicates

    print(f"   Duplicate PMIDs: {duplicates}")

    if duplicates > 0:
        issues_found.append(f"Found {duplicates} duplicate PMIDs")

    # 4. Check date validity
    print("\n4. Checking publication dates...")

    # Extract years from publication dates
    df['year'] = df['publication_date'].str.extract(r'(\d{4})')[0]
    valid_years = df['year'].notna().sum()
    invalid_dates = len(df) - valid_years
    quality_report['invalid_dates'] = invalid_dates

    print(f"   Valid publication years: {valid_years}")
    print(f"   Invalid/missing dates: {invalid_dates}")

    # 5. Check specialty distribution
    print("\n5. Checking specialty distribution...")
    specialty_counts = df['specialty'].value_counts()

    for specialty, count in specialty_counts.items():
        pct = (count / len(df)) * 100
        print(f"   {specialty}: {count} ({pct:.2f}%)")
        quality_report[f'specialty_{specialty}'] = count

    # Check for imbalance
    min_count = specialty_counts.min()
    max_count = specialty_counts.max()
    imbalance_ratio = max_count / min_count if min_count > 0 else 0

    if imbalance_ratio > 3:
        issues_found.append(f"Specialty imbalance detected (ratio: {imbalance_ratio:.2f})")

    # 6. Summary
    if len(issues_found) == 0:
        print("DATA QUALITY: GOOD... No major issues ")
    else:
        print(f"DATA QUALITY: ISSUES FOUND ({len(issues_found)})")
        for issue in issues_found:
            print(f"   - {issue}")


    return quality_report, issues_found


# Run validation
df = pd.read_csv('medical_literature_dataset.csv')
quality_report, issues = validate_data_quality(df)

DATA QUALITY VALIDATION

1. Checking for missing critical fields...
   pmid: 0 missing (0.00%)
   title: 0 missing (0.00%)
   abstract: 0 missing (0.00%)
   journal: 0 missing (0.00%)

2. Checking abstract quality...
   Very short abstracts (<100 chars): 6 (0.31%)
   Very long abstracts (>5000 chars): 0

3. Checking for duplicates...
   Duplicate PMIDs: 1

4. Checking publication dates...
   Valid publication years: 1950
   Invalid/missing dates: 4

5. Checking specialty distribution...
   infectious_diseases: 785 (40.17%)
   diabetes: 728 (37.26%)
   cardiology: 441 (22.57%)
DATA QUALITY: ISSUES FOUND (1)
   - Found 1 duplicate PMIDs


In [14]:
#2. Data Cleaning and Preprocessing (Risk Management)

def clean_medical_data(df):
    """
    Clean and preprocess medical literature data
    Risk Mitigation: Remove low-quality data and handle inconsistencies
    """
    print("DATA CLEANING AND PREPROCESSING")

    original_count = len(df)
    print(f"\nOriginal dataset: {original_count} articles")

    # 1. Remove duplicates
    print("\n1. Removing duplicates...")
    df_clean = df.drop_duplicates(subset=['pmid'], keep='first')
    removed_dupes = original_count - len(df_clean)
    print(f"   Removed {removed_dupes} duplicate articles")

    # 2. Remove articles with missing critical fields
    print("\n2. Removing incomplete articles...")
    df_clean = df_clean.dropna(subset=['pmid', 'title', 'abstract'])
    removed_incomplete = len(df) - removed_dupes - len(df_clean)
    print(f"   Removed {removed_incomplete} incomplete articles")

    # 3. Filter out very short abstracts (likely low quality)
    print("\n3. Filtering short abstracts...")
    df_clean = df_clean[df_clean['abstract_length'] >= 100]
    removed_short = len(df) - removed_dupes - removed_incomplete - len(df_clean)
    print(f"   Removed {removed_short} articles with short abstracts (<100 chars)")

    # 4. Clean text fields
    print("\n4. Cleaning text fields...")

    # Remove special characters and extra whitespace
    df_clean['title'] = df_clean['title'].str.strip()
    df_clean['abstract'] = df_clean['abstract'].str.strip()
    df_clean['abstract'] = df_clean['abstract'].str.replace(r'\s+', ' ', regex=True)

    # 5. Standardize specialty names
    print("\n5. Standardizing specialty names...")
    df_clean['specialty'] = df_clean['specialty'].str.lower().str.strip()

    # 6. Recalculate abstract length after cleaning
    df_clean['abstract_length'] = df_clean['abstract'].str.len()

    print(f" CLEANING COMPLETE")
    print(f"   Original: {original_count} articles")
    print(f"   Cleaned: {len(df_clean)} articles")
    print(f"   Removed: {original_count - len(df_clean)} articles ({((original_count - len(df_clean))/original_count)*100:.2f}%)")

    return df_clean


# Clean the data
df_clean = clean_medical_data(df)

# Save cleaned dataset
df_clean.to_csv('medical_literature_cleaned.csv', index=False)
print("\nCleaned dataset saved: medical_literature_cleaned.csv")

DATA CLEANING AND PREPROCESSING

Original dataset: 1954 articles

1. Removing duplicates...
   Removed 1 duplicate articles

2. Removing incomplete articles...
   Removed 0 incomplete articles

3. Filtering short abstracts...
   Removed 6 articles with short abstracts (<100 chars)

4. Cleaning text fields...

5. Standardizing specialty names...
 CLEANING COMPLETE
   Original: 1954 articles
   Cleaned: 1947 articles
   Removed: 7 articles (0.36%)

Cleaned dataset saved: medical_literature_cleaned.csv


In [15]:
#3. Bias Detection in Medical Literature (Trustworthiness - Fairness

def detect_medical_bias(df):
    """
    Detect potential biases in medical literature corpus
    Trustworthiness: Fairness - Identifies representation biases
    """
    print("BIAS DETECTION IN MEDICAL LITERATURE")

    bias_report = {}

    # 1. Temporal bias - Check distribution across years
    print("\n1. TEMPORAL BIAS ANALYSIS")
    print("   (Are recent studies overrepresented?)")

    df['year'] = df['publication_date'].str.extract(r'(\d{4})')[0]
    year_dist = df['year'].value_counts().sort_index()

    print("\n   Publication year distribution:")
    for year, count in year_dist.items():
        pct = (count / len(df)) * 100
        print(f"   {year}: {count} articles ({pct:.2f}%)")

    # Check if recent years dominate
    if year_dist.index.notna().any():
        recent_years = year_dist[year_dist.index >= '2022'].sum() if '2022' in year_dist.index else 0
        total_with_year = year_dist.sum()
        recent_pct = (recent_years / total_with_year * 100) if total_with_year > 0 else 0

        bias_report['temporal_recent_bias'] = recent_pct

        if recent_pct > 60:
            print(f"\n BIAS DETECTED: Recent years (2022+) represent {recent_pct:.1f}% of data")
        else:
            print(f"\n Temporal distribution acceptable ({recent_pct:.1f}% recent)")

    # 2. Specialty representation bias
    print("\n2. SPECIALTY REPRESENTATION BIAS")
    print("   (Are certain specialties over/underrepresented?)")

    specialty_dist = df['specialty'].value_counts()
    specialty_pct = (specialty_dist / len(df) * 100)

    print("\n   Specialty distribution:")
    for specialty, pct in specialty_pct.items():
        print(f"   {specialty}: {specialty_dist[specialty]} articles ({pct:.2f}%)")

    # Calculate imbalance ratio
    max_count = specialty_dist.max()
    min_count = specialty_dist.min()
    imbalance_ratio = max_count / min_count if min_count > 0 else 0

    bias_report['specialty_imbalance_ratio'] = imbalance_ratio

    if imbalance_ratio > 2:
        print(f"\n  BIAS DETECTED: Specialty imbalance ratio = {imbalance_ratio:.2f}")
        print(f"   (Largest specialty has {imbalance_ratio:.1f}x more articles than smallest)")
    else:
        print(f"\n  Specialty balance acceptable (ratio: {imbalance_ratio:.2f})")

    # 3. Journal diversity
    print("\n3. JOURNAL DIVERSITY ANALYSIS")
    print("   (Is literature from diverse sources?)")

    unique_journals = df['journal'].nunique()
    total_articles = len(df)
    diversity_ratio = unique_journals / total_articles

    print(f"\n   Unique journals: {unique_journals}")
    print(f"   Total articles: {total_articles}")
    print(f"   Diversity ratio: {diversity_ratio:.4f}")

    # Check top journals concentration
    top_10_journals = df['journal'].value_counts().head(10).sum()
    top_10_pct = (top_10_journals / total_articles * 100)

    print(f"   Top 10 journals: {top_10_pct:.1f}% of all articles")

    bias_report['journal_diversity'] = diversity_ratio
    bias_report['top_10_concentration'] = top_10_pct

    if top_10_pct > 50:
        print(f"\n  POTENTIAL BIAS: Top 10 journals dominate ({top_10_pct:.1f}%)")
    else:
        print(f"\n Journal diversity acceptable")

    # 4. Abstract length bias (proxy for study quality/completeness)
    print("\n4. ABSTRACT LENGTH ANALYSIS")
    print("   (Checking for systematic quality differences)")

    specialty_length_stats = df.groupby('specialty')['abstract_length'].agg(['mean', 'std'])

    print("\n   Average abstract length by specialty:")
    for specialty in specialty_length_stats.index:
        mean_len = specialty_length_stats.loc[specialty, 'mean']
        std_len = specialty_length_stats.loc[specialty, 'std']
        print(f"   {specialty}: {mean_len:.0f} ± {std_len:.0f} chars")

    # Check if one specialty has significantly shorter abstracts
    min_mean = specialty_length_stats['mean'].min()
    max_mean = specialty_length_stats['mean'].max()
    length_disparity = (max_mean - min_mean) / min_mean * 100

    bias_report['length_disparity_pct'] = length_disparity

    if length_disparity > 30:
        print(f"\n  POTENTIAL BIAS: {length_disparity:.1f}% difference in abstract lengths")
    else:
        print(f"\n  Abstract lengths consistent across specialties")

    # Summary
    print("BIAS DETECTION SUMMARY")


    biases_found = []
    if bias_report.get('temporal_recent_bias', 0) > 60:
        biases_found.append("Temporal bias (recent years overrepresented)")
    if bias_report.get('specialty_imbalance_ratio', 0) > 2:
        biases_found.append("Specialty imbalance")
    if bias_report.get('top_10_concentration', 0) > 50:
        biases_found.append("Journal concentration")
    if bias_report.get('length_disparity_pct', 0) > 30:
        biases_found.append("Abstract length disparity")

    if len(biases_found) == 0:
        print("No significant biases detected in the dataset")
    else:
        print(f"{len(biases_found)} potential bias(es) detected:")
        for bias in biases_found:
            print(f"   - {bias}")

    return bias_report, biases_found


# Run bias detection
bias_report, biases = detect_medical_bias(df_clean)

BIAS DETECTION IN MEDICAL LITERATURE

1. TEMPORAL BIAS ANALYSIS
   (Are recent studies overrepresented?)

   Publication year distribution:
   2020: 397 articles (20.39%)
   2021: 403 articles (20.70%)
   2022: 324 articles (16.64%)
   2023: 308 articles (15.82%)
   2024: 458 articles (23.52%)
   2025: 53 articles (2.72%)

 Temporal distribution acceptable (58.8% recent)

2. SPECIALTY REPRESENTATION BIAS
   (Are certain specialties over/underrepresented?)

   Specialty distribution:
   infectious_diseases: 782 articles (40.16%)
   diabetes: 726 articles (37.29%)
   cardiology: 439 articles (22.55%)

  Specialty balance acceptable (ratio: 1.78)

3. JOURNAL DIVERSITY ANALYSIS
   (Is literature from diverse sources?)

   Unique journals: 792
   Total articles: 1947
   Diversity ratio: 0.4068
   Top 10 journals: 14.7% of all articles

 Journal diversity acceptable

4. ABSTRACT LENGTH ANALYSIS
   (Checking for systematic quality differences)

   Average abstract length by specialty:
   card

In [16]:
#4. Data Privacy Compliance Check (Trustworthiness - Privacy)

def check_privacy_compliance(df):
    """
    Verify data privacy compliance for medical literature
    Trustworthiness: Privacy - Ensures no sensitive information leakage
    """
    print("DATA PRIVACY COMPLIANCE CHECK")


    privacy_issues = []

    # 1. Check for potential patient identifiers in abstracts
    print("\n1. Scanning for potential patient identifiers...")

    # Patterns that might indicate case reports with patient info
    identifier_patterns = {
        'age_gender': r'\b\d{1,2}[-\s]year[-\s]old\s+(male|female|man|woman)\b',
        'specific_dates': r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b',
        'initials': r'\bpatient\s+[A-Z]\.[A-Z]\.\b',
        'case_report': r'\bcase\s+report\b'
    }

    findings = {}

    for pattern_name, pattern in identifier_patterns.items():
        matches = df['abstract'].str.contains(pattern, case=False, regex=True, na=False).sum()
        findings[pattern_name] = matches

        if matches > 0:
            pct = (matches / len(df)) * 100
            print(f"   {pattern_name}: {matches} abstracts ({pct:.2f}%)")

            if pattern_name != 'case_report' and matches > len(df) * 0.01:  # >1% threshold
                privacy_issues.append(f"High occurrence of {pattern_name}: {matches} cases")

    # 2. Verify data source is public
    print("\n2. Verifying data source...")
    print(" Data source: PubMED (publicly available medical literature)")
    print(" No patient-specific data collected")
    print(" Only published, peer-reviewed abstracts included")

    # 3. Check for any email addresses or URLs (shouldn't be present)
    print("\n3. Checking for inappropriate content...")

    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails_found = df['abstract'].str.contains(email_pattern, regex=True, na=False).sum()

    if emails_found > 0:
        print(f"  Found {emails_found} abstracts with email addresses")
        privacy_issues.append(f"Email addresses found: {emails_found}")
    else:
        print(" No email addresses found")

    # Summary
    print("PRIVACY COMPLIANCE SUMMARY")

    if len(privacy_issues) == 0:
        print("COMPLIANT: No privacy issues detected")
        print("  - Public domain medical literature only")
        print("  - No patient identifiers found")
        print("  - Appropriate for AI training use")
    else:
        print(f" {len(privacy_issues)} potential privacy concern(s):")
        for issue in privacy_issues:
            print(f"   - {issue}")
        print("\n  Recommendation: Review flagged abstracts manually")


    return findings, privacy_issues


# Run privacy check
privacy_findings, privacy_issues = check_privacy_compliance(df_clean)

DATA PRIVACY COMPLIANCE CHECK

1. Scanning for potential patient identifiers...
   age_gender: 1 abstracts (0.05%)
   specific_dates: 47 abstracts (2.41%)

2. Verifying data source...
 Data source: PubMED (publicly available medical literature)
 No patient-specific data collected
 Only published, peer-reviewed abstracts included

3. Checking for inappropriate content...
  Found 1 abstracts with email addresses
PRIVACY COMPLIANCE SUMMARY
 2 potential privacy concern(s):
   - High occurrence of specific_dates: 47 cases
   - Email addresses found: 1

  Recommendation: Review flagged abstracts manually


  matches = df['abstract'].str.contains(pattern, case=False, regex=True, na=False).sum()


In [17]:
#5. Data Representativeness Analysis (Risk Management)

def analyze_data_representativeness(df):
    """
    Analyze whether data represents diverse medical knowledge
    Risk Mitigation: Ensures model will generalize across medical domains
    """
    print("DATA REPRESENTATIVENESS ANALYSIS")

    # 1. Specialty coverage
    print("\n1. MEDICAL SPECIALTY COVERAGE")

    total_articles = len(df)
    specialty_coverage = df['specialty'].value_counts()

    print(f"\n   Total articles: {total_articles}")
    print(f"   Specialties covered: {len(specialty_coverage)}")
    print("\n   Distribution:")

    for specialty, count in specialty_coverage.items():
        pct = (count / total_articles) * 100
        bar = '█' * int(pct / 2)
        print(f"   {specialty:20s}: {count:5d} ({pct:5.1f}%) {bar}")

    # 2. Temporal coverage
    print("\n2. TEMPORAL COVERAGE")

    df['year'] = df['publication_date'].str.extract(r'(\d{4})')[0]
    year_coverage = df['year'].value_counts().sort_index()

    print(f"\n   Years covered: {year_coverage.index.min()} - {year_coverage.index.max()}")
    print("\n   Distribution by year:")

    for year, count in year_coverage.items():
        pct = (count / total_articles) * 100
        bar = '█' * int(pct / 3)
        print(f"   {year}: {count:5d} ({pct:5.1f}%) {bar}")

    # 3. Journal diversity
    print("\n3. JOURNAL DIVERSITY")

    unique_journals = df['journal'].nunique()
    print(f"\n   Unique journals: {unique_journals}")
    print(f"   Top 10 journals:")

    top_journals = df['journal'].value_counts().head(10)
    for journal, count in top_journals.items():
        pct = (count / total_articles) * 100
        print(f"   {journal[:40]:40s}: {count:4d} ({pct:4.1f}%)")

    # 4. Content diversity (abstract length as proxy)
    print("\n4. CONTENT DIVERSITY")

    length_stats = df['abstract_length'].describe()
    print(f"\n   Abstract length statistics:")
    print(f"   Mean: {length_stats['mean']:.0f} characters")
    print(f"   Std:  {length_stats['std']:.0f} characters")
    print(f"   Min:  {length_stats['min']:.0f} characters")
    print(f"   Max:  {length_stats['max']:.0f} characters")

    # Assess representativeness
    print("REPRESENTATIVENESS ASSESSMENT")

    issues = []

    # Check minimum articles per specialty
    min_articles = specialty_coverage.min()
    if min_articles < 1000:
        issues.append(f"Low representation in some specialties (min: {min_articles})")

    # Check journal concentration
    top_10_pct = (top_journals.sum() / total_articles) * 100
    if top_10_pct > 50:
        issues.append(f"High journal concentration (top 10: {top_10_pct:.1f}%)")

    # Check temporal coverage
    years_covered = len(year_coverage)
    if years_covered < 3:
        issues.append(f"Limited temporal coverage ({years_covered} years)")

    if len(issues) == 0:
        print("GOOD: Dataset shows good representativeness")
        print("  - Multiple specialties covered")
        print("  - Diverse journal sources")
        print("  - Adequate temporal range")
    else:
        print(f"{len(issues)} representativeness concern(s):")
        for issue in issues:
            print(f"   - {issue}")

    return {
        'total_articles': total_articles,
        'specialties': len(specialty_coverage),
        'journals': unique_journals,
        'years_covered': years_covered,
        'issues': issues
    }


# Run representativeness analysis
rep_analysis = analyze_data_representativeness(df_clean)

DATA REPRESENTATIVENESS ANALYSIS

1. MEDICAL SPECIALTY COVERAGE

   Total articles: 1947
   Specialties covered: 3

   Distribution:
   infectious_diseases :   782 ( 40.2%) ████████████████████
   diabetes            :   726 ( 37.3%) ██████████████████
   cardiology          :   439 ( 22.5%) ███████████

2. TEMPORAL COVERAGE

   Years covered: 2020 - 2025

   Distribution by year:
   2020:   397 ( 20.4%) ██████
   2021:   403 ( 20.7%) ██████
   2022:   324 ( 16.6%) █████
   2023:   308 ( 15.8%) █████
   2024:   458 ( 23.5%) ███████
   2025:    53 (  2.7%) 

3. JOURNAL DIVERSITY

   Unique journals: 792
   Top 10 journals:
   Primary care diabetes                   :   58 ( 3.0%)
   PloS one                                :   33 ( 1.7%)
   Scientific reports                      :   28 ( 1.4%)
   Journal of the American College of Cardi:   26 ( 1.3%)
   Pediatric cardiology                    :   25 ( 1.3%)
   Frontiers in endocrinology              :   25 ( 1.3%)
   International journ

## **Complete Data Collection-Risk Management Report**

In [18]:
#Data Collection Report

def generate_data_collection_report(df, quality_report, bias_report, privacy_findings, rep_analysis):
    """
    Generate comprehensive risk management and trustworthiness report
    """
    print("DATA COLLECTION - RISK MANAGEMENT & TRUSTWORTHINESS REPORT")
    print(f"\nReport generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


    print("1. DATASET OVERVIEW")
    print(f"Total articles collected: {len(df)}")
    print(f"Medical specialties: {rep_analysis['specialties']}")
    print(f"Unique journals: {rep_analysis['journals']}")
    print(f"Temporal coverage: {rep_analysis['years_covered']} years")

    print("2. DATA QUALITY STATUS")
    print(f"Duplicate PMIDs removed: {quality_report.get('duplicates', 0)}")
    print(f"Short abstracts filtered: {quality_report.get('short_abstracts', 0)}")
    print(f"Missing critical fields: {quality_report.get('missing_pmid', 0)}")
    print("Overall quality: ACCEPTABLE" if quality_report.get('duplicates', 0) < 100 else "Overall quality: NEEDS ATTENTION")


    print("3. BIAS DETECTION RESULTS")
    print(f"Specialty imbalance ratio: {bias_report.get('specialty_imbalance_ratio', 0):.2f}")
    print(f"Journal concentration (top 10): {bias_report.get('top_10_concentration', 0):.1f}%")
    print(f"Temporal bias (recent years): {bias_report.get('temporal_recent_bias', 0):.1f}%")
    bias_status = "MINIMAL BIAS" if bias_report.get('specialty_imbalance_ratio', 0) < 2 else "BIAS DETECTED"
    print(f"Bias assessment: {bias_status}")

    print("4. PRIVACY COMPLIANCE")
    print("Data source: PubMED (public domain)")
    print("Patient identifiers: Not applicable (published abstracts)")
    print(f"Case reports detected: {privacy_findings.get('case_report', 0)} abstracts")
    print("Privacy compliance: COMPLIANT")

    print("5. REPRESENTATIVENESS")
    print(f"Specialty coverage: {rep_analysis['specialties']} specialties")
    print(f"Journal diversity: {rep_analysis['journals']} unique sources")
    print(f"Issues identified: {len(rep_analysis.get('issues', []))}")
    rep_status = " REPRESENTATIVE" if len(rep_analysis.get('issues', [])) == 0 else "LIMITED COVERAGE"
    print(f"Representativeness: {rep_status}")

    print("6. RISK MANAGEMENT SUMMARY")
    print("Data quality validation completed")
    print(" Bias detection analysis performed")
    print(" Privacy compliance verified")
    print(" Representativeness assessed")
    print(" Data cleaning and preprocessing applied")

    print("7. RECOMMENDATIONS")

    if bias_report.get('specialty_imbalance_ratio', 0) > 2:
        print("Consider collecting more data for underrepresented specialties")

    if bias_report.get('top_10_concentration', 0) > 50:
        print("Expand journal sources to improve diversity")

    if rep_analysis.get('total_articles', 0) < 10000:
        print("Increase dataset size for better model training")

    if len(rep_analysis.get('issues', [])) == 0 and bias_report.get('specialty_imbalance_ratio', 0) < 2:
        print("Dataset ready for model development")
        print("No critical issues identified")


    # Save report to file
    report_filename = f"data_collection_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    with open(report_filename, 'w') as f:
        f.write("="*60 + "\n")
        f.write("DATA COLLECTION - RISK MANAGEMENT & TRUSTWORTHINESS REPORT\n")
        f.write("="*60 + "\n")
        f.write(f"\nReport generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"\nTotal articles: {len(df)}\n")
        f.write(f"Quality issues: {quality_report.get('duplicates', 0)} duplicates\n")
        f.write(f"Bias ratio: {bias_report.get('specialty_imbalance_ratio', 0):.2f}\n")
        f.write(f"Privacy compliance: COMPLIANT\n")
        f.write(f"Representativeness: {rep_status}\n")

    print(f"Report saved: {report_filename}\n")


# Generate comprehensive report
generate_data_collection_report(
    df_clean,
    quality_report,
    bias_report,
    privacy_findings,
    rep_analysis
)

DATA COLLECTION - RISK MANAGEMENT & TRUSTWORTHINESS REPORT

Report generated: 2025-11-02 18:34:14
1. DATASET OVERVIEW
Total articles collected: 1947
Medical specialties: 3
Unique journals: 792
Temporal coverage: 6 years
2. DATA QUALITY STATUS
Duplicate PMIDs removed: 1
Short abstracts filtered: 6
Missing critical fields: 0
Overall quality: ACCEPTABLE
3. BIAS DETECTION RESULTS
Specialty imbalance ratio: 1.78
Journal concentration (top 10): 14.7%
Temporal bias (recent years): 58.8%
Bias assessment: MINIMAL BIAS
4. PRIVACY COMPLIANCE
Data source: PubMED (public domain)
Patient identifiers: Not applicable (published abstracts)
Case reports detected: 0 abstracts
Privacy compliance: COMPLIANT
5. REPRESENTATIVENESS
Specialty coverage: 3 specialties
Journal diversity: 792 unique sources
Issues identified: 1
Representativeness: LIMITED COVERAGE
6. RISK MANAGEMENT SUMMARY
Data quality validation completed
 Bias detection analysis performed
 Privacy compliance verified
 Representativeness assesse

## **Vector Embedding Generation**

In [21]:
# embedding_generation.py

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import pickle
from tqdm import tqdm
import os
from datetime import datetime

print("EMBEDDING GENERATION")
'''
# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nUsing device: {device}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Note: Using CPU. This will be slower. GPU recommended for faster processing.")'''

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nUsing device: {device}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Note: Using CPU (slower)")

# Load your dataset
print("Loading dataset")

df = pd.read_csv('medical_literature_dataset.csv')

print(f"Loaded {len(df)} articles")
print(f" Columns: {list(df.columns)}")
print(f"\nDataset preview:")
print(df[['pmid', 'title', 'specialty']].head())

# Check for any missing abstracts
missing_abstracts = df['abstract'].isnull().sum()
if missing_abstracts > 0:
    print(f"\n Warning: {missing_abstracts} articles have missing abstracts")
    df = df.dropna(subset=['abstract'])
    print(f"Filtered to {len(df)} articles with abstracts")

print(f"\nSpecialty distribution:")
print(df['specialty'].value_counts())

EMBEDDING GENERATION

Using device: cpu
Note: Using CPU (slower)
Loading dataset
Loaded 1954 articles
 Columns: ['pmid', 'title', 'abstract', 'journal', 'publication_date', 'authors', 'num_authors', 'abstract_length', 'specialty', 'collection_date']

Dataset preview:
       pmid                                              title   specialty
0  33332149  2020 ACC/AHA Guideline for the Management of P...  cardiology
1  32370835  Artificial Intelligence in Cardiology: Present...  cardiology
2  34338485                   Machine learning for cardiology.  cardiology
3  32216916  Evaluation for Heart Transplantation and LVAD ...  cardiology
4  38593946  Artificial Intelligence for Cardiovascular Car...  cardiology

Specialty distribution:
specialty
infectious_diseases    785
diabetes               728
cardiology             441
Name: count, dtype: int64


In [22]:
#Loading Language Model
#Using SentenceTransformers with medical fine-tuning
model_name = 'pritamdeka/S-PubMedBert-MS-MARCO'


print(f"Model: {model_name}")
print("Loading model..")

try:
    model = SentenceTransformer(model_name, device=device)
    print("Model loaded successfully!")
    print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")
except Exception as e:
    print(f"Error loading model: {e}")
    print("\nTrying fallback model: all-MiniLM-L6-v2")
    model_name = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name, device=device)
    print(" Fallback model loaded successfully!")

Model: pritamdeka/S-PubMedBert-MS-MARCO
Loading model..


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!
Embedding dimension: 768


In [23]:
# Prepare text for embedding
print("\nPreparing text for embedding")

# We combine title + abstract for richer context
#Title + Abstract
texts = (df['title'] + ' ' + df['abstract']).tolist()

print(f" Prepared {len(texts)} texts for embedding")
print(f" Average text length: {np.mean([len(t) for t in texts]):.0f} characters")

# Generate embeddings with progress bar
print("\nGenerating embeddings")

batch_size = 64

embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # Normalize for better similarity search
)

print(f"\n Embeddings generated successfully!")
print(f" Shape: {embeddings.shape}")
print(f" Data type: {embeddings.dtype}")
print(f" Memory size: {embeddings.nbytes / (1024*1024):.2f} MB")


Preparing text for embedding
 Prepared 1954 texts for embedding
 Average text length: 1476 characters

Generating embeddings


Batches:   0%|          | 0/31 [00:00<?, ?it/s]


 Embeddings generated successfully!
 Shape: (1954, 768)
 Data type: float32
 Memory size: 5.72 MB


In [24]:
print("Validating Embeddings")

# Checking for any NaN or infinite values
nan_count = np.isnan(embeddings).sum()
inf_count = np.isinf(embeddings).sum()

print(f" NaN values: {nan_count}")
print(f" Infinite values: {inf_count}")

if nan_count > 0 or inf_count > 0:
    print("Warning: Found invalid values in embeddings!")
else:
    print("All embeddings are valid")

# Checking embedding statistics
print(f"\nEmbedding statistics:")
print(f"  Mean: {embeddings.mean():.4f}")
print(f"  Std: {embeddings.std():.4f}")
print(f"  Min: {embeddings.min():.4f}")
print(f"  Max: {embeddings.max():.4f}")

# Testing similarity between two random abstracts
print("Testing Embedding Similarity...")

# And, calculating cosine similarity between first 2 documents
from sklearn.metrics.pairwise import cosine_similarity

sample_idx1 = 0
sample_idx2 = 1

similarity = cosine_similarity(
    embeddings[sample_idx1].reshape(1, -1),
    embeddings[sample_idx2].reshape(1, -1)
)[0][0]

print(f"\nSample articles:")
print(f"\nArticle 1 (PMID: {df.iloc[sample_idx1]['pmid']}):")
print(f"  Specialty: {df.iloc[sample_idx1]['specialty']}")
print(f"  Title: {df.iloc[sample_idx1]['title'][:80]}...")

print(f"\nArticle 2 (PMID: {df.iloc[sample_idx2]['pmid']}):")
print(f"  Specialty: {df.iloc[sample_idx2]['specialty']}")
print(f"  Title: {df.iloc[sample_idx2]['title'][:80]}...")

print(f"\nCosine Similarity: {similarity:.4f}")
print("(1.0 = identical, 0.0 = unrelated, -1.0 = opposite)")

# Find most similar article to first one
similarities = cosine_similarity(
    embeddings[0].reshape(1, -1),
    embeddings
)[0]

# Get top 5 most similar (excluding itself)
top_5_indices = np.argsort(similarities)[-6:-1][::-1]  # Exclude itself, get top 5

print(f"\nTop 5 most similar articles to Article 1:")
for rank, idx in enumerate(top_5_indices, 1):
    print(f"\n{rank}. PMID: {df.iloc[idx]['pmid']} (Similarity: {similarities[idx]:.4f})")
    print(f"   Specialty: {df.iloc[idx]['specialty']}")
    print(f"   Title: {df.iloc[idx]['title'][:80]}...")

Validating Embeddings
 NaN values: 0
 Infinite values: 0
All embeddings are valid

Embedding statistics:
  Mean: -0.0013
  Std: 0.0361
  Min: -0.7898
  Max: 0.2477
Testing Embedding Similarity...

Sample articles:

Article 1 (PMID: 33332149):
  Specialty: cardiology
  Title: 2020 ACC/AHA Guideline for the Management of Patients With Valvular Heart Diseas...

Article 2 (PMID: 32370835):
  Specialty: cardiology
  Title: Artificial Intelligence in Cardiology: Present and Future....

Cosine Similarity: 0.9205
(1.0 = identical, 0.0 = unrelated, -1.0 = opposite)

Top 5 most similar articles to Article 1:

1. PMID: 33229115 (Similarity: 0.9742)
   Specialty: cardiology
   Title: 2020 AHA/ACC Guideline for the Diagnosis and Treatment of Patients With Hypertro...

2. PMID: 34895950 (Similarity: 0.9628)
   Specialty: cardiology
   Title: 2021 ACC/AHA/SCAI Guideline for Coronary Artery Revascularization: A Report of t...

3. PMID: 38718139 (Similarity: 0.9591)
   Specialty: cardiology
   Title: 2

In [25]:
print("Saving Embeddings and Metadata")
# Creating output directory
output_dir = 'embeddings'
os.makedirs(output_dir, exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Saving embeddings as numpy array
embeddings_file = f'{output_dir}/medical_embeddings_{timestamp}.npy'
np.save(embeddings_file, embeddings)
print(f"Embeddings saved: {embeddings_file}")

# Saving metadata (PMIDs, titles, abstracts, specialties)
metadata = df[['pmid', 'title', 'abstract', 'specialty', 'journal', 'publication_date']].copy()
metadata_file = f'{output_dir}/medical_metadata_{timestamp}.csv'
metadata.to_csv(metadata_file, index=False)
print(f" Metadata saved: {metadata_file}")

# pickle file for saving the model (easy use)
pickle_file = f'{output_dir}/medical_embeddings_complete_{timestamp}.pkl'
embedding_data = {
    'embeddings': embeddings,
    'metadata': metadata,
    'model_name': model_name,
    'embedding_dim': embeddings.shape[1],
    'num_documents': len(embeddings),
    'creation_date': timestamp
}

with open(pickle_file, 'wb') as f:
    pickle.dump(embedding_data, f)
print(f" Complete data saved: {pickle_file}")

# configuration file
config_file = f'{output_dir}/embedding_config_{timestamp}.txt'
with open(config_file, 'w') as f:
    f.write("MEDICAL RAG SYSTEM - EMBEDDING CONFIGURATION\n")
    f.write(f"Generation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Model Name: {model_name}\n")
    f.write(f"Embedding Dimension: {embeddings.shape[1]}\n")
    f.write(f"Number of Documents: {len(embeddings)}\n")
    f.write(f"Batch Size: {batch_size}\n")
    f.write(f"Normalization: True\n")
    f.write(f"\nDataset Statistics:\n")
    f.write(f"  Total Articles: {len(df)}\n")
    f.write(f"  Specialties: {df['specialty'].nunique()}\n")
    f.write(f"  Specialty Distribution:\n")
    for specialty, count in df['specialty'].value_counts().items():
        f.write(f"    - {specialty}: {count} articles\n")

print(f" Configuration saved: {config_file}")

print("EMBEDDING GENERATION COMPLETE!")
print(f"\nOutput files in '{output_dir}/' directory:")
print(f"  1. {embeddings_file}")
print(f"  2. {metadata_file}")
print(f"  3. {pickle_file}")
print(f"  4. {config_file}")
print(f"\nTotal size: {(embeddings.nbytes + metadata.memory_usage(deep=True).sum()) / (1024*1024):.2f} MB")

Saving Embeddings and Metadata
Embeddings saved: embeddings/medical_embeddings_20251102_190752.npy
 Metadata saved: embeddings/medical_metadata_20251102_190752.csv
 Complete data saved: embeddings/medical_embeddings_complete_20251102_190752.pkl
 Configuration saved: embeddings/embedding_config_20251102_190752.txt
EMBEDDING GENERATION COMPLETE!

Output files in 'embeddings/' directory:
  1. embeddings/medical_embeddings_20251102_190752.npy
  2. embeddings/medical_metadata_20251102_190752.csv
  3. embeddings/medical_embeddings_complete_20251102_190752.pkl
  4. embeddings/embedding_config_20251102_190752.txt

Total size: 10.38 MB


In [26]:
print("TESTING EMBEDDINGS WITH SAMPLE QUERIES")

# example queries
test_queries = [
    "What are the symptoms of diabetes?",
    "How is heart disease treated?",
    "What causes COVID-19 infection?"
]

print("\nRunning test queries...\n")

for i, query in enumerate(test_queries, 1):
    print(f"\n{'-'*60}")
    print(f"Query {i}: {query}")

    # Generate query embedding
    query_embedding = model.encode([query], normalize_embeddings=True)

    # Calculate similarities
    similarities = cosine_similarity(
        query_embedding,
        embeddings
    )[0]

    # Get top 3 results
    top_3 = np.argsort(similarities)[-3:][::-1]

    print(f"\nTop 3 relevant articles:\n")
    for rank, idx in enumerate(top_3, 1):
        print(f"{rank}. Similarity: {similarities[idx]:.4f}")
        print(f"   PMID: {df.iloc[idx]['pmid']}")
        print(f"   Specialty: {df.iloc[idx]['specialty']}")
        print(f"   Title: {df.iloc[idx]['title']}")
        print(f"   Abstract preview: {df.iloc[idx]['abstract'][:150]}...")
        print()
print("EMBEDDING GENERATION AND TESTING is done!")

TESTING EMBEDDINGS WITH SAMPLE QUERIES

Running test queries...


------------------------------------------------------------
Query 1: What are the symptoms of diabetes?

Top 3 relevant articles:

1. Similarity: 0.9470
   PMID: 37559237
   Specialty: diabetes
   Title: Novel Approaches to Control Diabetes.
   Abstract preview: Diabetes is a chronic, long-term, incurable, but controllable condition. Diabetes mellitus (DM) is a group of metabolic disorders characterized by hyp...

2. Similarity: 0.9419
   PMID: 39556629
   Specialty: diabetes
   Title: Diabetic Ketoacidosis: Evaluation and Treatment.
   Abstract preview: Diabetic ketoacidosis (DKA) is a life-threatening complication of type 1 and type 2 diabetes resulting from an absolute or relative insulin deficiency...

3. Similarity: 0.9391
   PMID: 33969646
   Specialty: diabetes
   Title: Recognising and treating psychological issues in people with diabetes mellitus.
   Abstract preview: Diabetes mellitus is a long-term condition 

In [27]:
def load_embeddings(embeddings_dir='embeddings'):
    """
    Load the most recent embeddings and metadata
    """
    import os
    import glob
    import pickle

    # Find most recent pickle file
    pickle_files = glob.glob(f'{embeddings_dir}/medical_embeddings_complete_*.pkl')

    if not pickle_files:
        raise FileNotFoundError(f"No embedding files found in {embeddings_dir}/")

    # Get most recent file
    latest_file = max(pickle_files, key=os.path.getctime)

    print(f"Loading embeddings from: {latest_file}")

    with open(latest_file, 'rb') as f:
        data = pickle.load(f)

    print(f" Loaded {data['num_documents']} document embeddings")
    print(f" Embedding dimension: {data['embedding_dim']}")
    print(f" Model: {data['model_name']}")

    return data


def test_similarity_search(query_text, embeddings_data, model, top_k=5):
    """
    Test similarity search with a query
    """
    from sklearn.metrics.pairwise import cosine_similarity

    # Generate query embedding
    query_embedding = model.encode([query_text], normalize_embeddings=True)

    # Calculating similarities
    similarities = cosine_similarity(
        query_embedding,
        embeddings_data['embeddings']
    )[0]

    # to Get top-k results
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    results = []
    for idx in top_indices:
        result = {
            'pmid': embeddings_data['metadata'].iloc[idx]['pmid'],
            'title': embeddings_data['metadata'].iloc[idx]['title'],
            'abstract': embeddings_data['metadata'].iloc[idx]['abstract'],
            'specialty': embeddings_data['metadata'].iloc[idx]['specialty'],
            'similarity': similarities[idx]
        }
        results.append(result)

    return results


# Save this helper file
print("Creating utils")

helper_code = '''
# embedding_utils.py
import pickle
import glob
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_embeddings(embeddings_dir='embeddings'):
    """Load the most recent embeddings and metadata"""
    pickle_files = glob.glob(f'{embeddings_dir}/medical_embeddings_complete_*.pkl')

    if not pickle_files:
        raise FileNotFoundError(f"No embedding files found in {embeddings_dir}/")

    latest_file = max(pickle_files, key=os.path.getctime)
    print(f"Loading embeddings from: {latest_file}")

    with open(latest_file, 'rb') as f:
        data = pickle.load(f)

    print(f"✓ Loaded {data['num_documents']} document embeddings")
    return data

def test_similarity_search(query_text, embeddings_data, model, top_k=5):
    """Test similarity search with a query"""
    query_embedding = model.encode([query_text], normalize_embeddings=True)

    similarities = cosine_similarity(
        query_embedding,
        embeddings_data['embeddings']
    )[0]

    top_indices = np.argsort(similarities)[-top_k:][::-1]

    results = []
    for idx in top_indices:
        result = {
            'pmid': embeddings_data['metadata'].iloc[idx]['pmid'],
            'title': embeddings_data['metadata'].iloc[idx]['title'],
            'abstract': embeddings_data['metadata'].iloc[idx]['abstract'][:200] + '...',
            'specialty': embeddings_data['metadata'].iloc[idx]['specialty'],
            'similarity': float(similarities[idx])
        }
        results.append(result)

    return results
'''

with open('embedding_utils.py', 'w') as f:
    f.write(helper_code)

print("Created embedding_utils.py")

Creating utils
Created embedding_utils.py


## **Vector Database Setup with FAISS**

In [28]:
import numpy as np
import pandas as pd
import faiss
import pickle
import os
from datetime import datetime
from embedding_utils import load_embeddings

print("MEDICAL RAG SYSTEM - FAISS VECTOR DATABASE SETUP")
embeddings_data = load_embeddings('embeddings')

embeddings = embeddings_data['embeddings']
metadata = embeddings_data['metadata']
embedding_dim = embeddings_data['embedding_dim']

print(f"\n Loaded embeddings:")
print(f"Documents: {len(embeddings)}")
print(f"Dimension: {embedding_dim}")
print(f"Shape: {embeddings.shape}")

# Verify embeddings are normalized
norms = np.linalg.norm(embeddings, axis=1)
print(f"Normalized: {np.allclose(norms, 1.0)}")
print(f"Average norm: {norms.mean():.4f}")

MEDICAL RAG SYSTEM - FAISS VECTOR DATABASE SETUP
Loading embeddings from: embeddings/medical_embeddings_complete_20251102_190752.pkl
✓ Loaded 1954 document embeddings

 Loaded embeddings:
Documents: 1954
Dimension: 768
Shape: (1954, 768)
Normalized: True
Average norm: 1.0000


In [29]:
print("Building FAISS Index")

# Choosing index type based on dataset size
num_documents = len(embeddings)
# For small datasets: Use Flat index (exact search, slower but accurate)
# For larger datasets: Use IVF index (approximate search, faster)

if num_documents < 10000:
    
    print(f"\nDataset size: {num_documents} documents")
    print("Using: IndexFlatIP (Flat Inner Product - Exact Search)")
    index = faiss.IndexFlatIP(embedding_dim)

else:
    print(f"\nDataset size: {num_documents} documents")
    print("Using: IndexIVFFlat (Inverted File Index - Approximate Search)")

    # clusteringggg
    nlist = int(np.sqrt(num_documents))
    nlist = max(nlist, 10)

    quantizer = faiss.IndexFlatIP(embedding_dim)
    index = faiss.IndexIVFFlat(quantizer, embedding_dim, nlist)

    print(f"Number of clusters: {nlist}")

    # Train the index
    print("Training index")
    index.train(embeddings.astype('float32'))
    print(" Index trained")

# Add vectors to index
print("\nAdding embeddings to index")
index.add(embeddings.astype('float32'))

print(f"Index built successfully!")
print(f"Total vectors in index: {index.ntotal}")

Building FAISS Index

Dataset size: 1954 documents
Using: IndexFlatIP (Flat Inner Product - Exact Search)

Adding embeddings to index
Index built successfully!
Total vectors in index: 1954


In [31]:
print("Testing FAISS Search")

# Test(Sample)
test_query = "What are the symptoms of diabetes mellitus?"
print(f"\nTest Query: '{test_query}'")

# Loading model to generate query embedding
from sentence_transformers import SentenceTransformer

model_name = embeddings_data['model_name']
model = SentenceTransformer(model_name)
print(f"Model loaded for query embedding: {model_name}")

# Generating query embedding
query_embedding = model.encode([test_query], normalize_embeddings=True)
query_embedding = query_embedding.astype('float32')

# Search FAISS index
k = 5
print(f"\nSearching for top {k} most similar documents")
distances, indices = index.search(query_embedding, k)

print(f"\nTop {k} Results:\n")

for rank, (idx, distance) in enumerate(zip(indices[0], distances[0]), 1):
    print(f"{rank}. Similarity Score: {distance:.4f}")
    print(f"   PMID: {metadata.iloc[idx]['pmid']}")
    print(f"   Specialty: {metadata.iloc[idx]['specialty']}")
    print(f"   Title: {metadata.iloc[idx]['title']}")
    print(f"   Abstract: {metadata.iloc[idx]['abstract'][:150]}...")
    print()

Testing FAISS Search

Test Query: 'What are the symptoms of diabetes mellitus?'
Model loaded for query embedding: pritamdeka/S-PubMedBert-MS-MARCO

Searching for top 5 most similar documents

Top 5 Results:

1. Similarity Score: 0.9488
   PMID: 37559237
   Specialty: diabetes
   Title: Novel Approaches to Control Diabetes.
   Abstract: Diabetes is a chronic, long-term, incurable, but controllable condition. Diabetes mellitus (DM) is a group of metabolic disorders characterized by hyp...

2. Similarity Score: 0.9415
   PMID: 34708622
   Specialty: diabetes
   Title: Diabetes mellitus: an overview of the types, symptoms, complications and management.
   Abstract: The incidence of diabetes mellitus is rapidly increasing, and this condition often results in significant metabolic disease and severe complications. ...

3. Similarity Score: 0.9394
   PMID: 39556629
   Specialty: diabetes
   Title: Diabetic Ketoacidosis: Evaluation and Treatment.
   Abstract: Diabetic ketoacidosis (DKA) is a l

In [32]:
print("Creating Retrieval Functions")

def search_medical_literature(query, index, metadata, model, top_k=5):
    """
    Search medical literature using FAISS index

    Args:
        query (str): Medical question or search query
        index: FAISS index
        metadata (DataFrame): Document metadata
        model: SentenceTransformer model
        top_k (int): Number of results to return

    Returns:
        list: List of dictionaries with results
    """
    # Generating query embedding
    query_embedding = model.encode([query], normalize_embeddings=True)
    query_embedding = query_embedding.astype('float32')

    # Search index
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), 1):
        result = {
            'rank': rank,
            'pmid': metadata.iloc[idx]['pmid'],
            'title': metadata.iloc[idx]['title'],
            'abstract': metadata.iloc[idx]['abstract'],
            'specialty': metadata.iloc[idx]['specialty'],
            'journal': metadata.iloc[idx]['journal'],
            'publication_date': metadata.iloc[idx]['publication_date'],
            'similarity_score': float(score)
        }
        results.append(result)

    return results


def search_by_specialty(query, index, metadata, model, specialty, top_k=5):
    """
    Search medical literature filtered by specialty

    Args:
        query (str): Medical question
        index: FAISS index
        metadata (DataFrame): Document metadata
        model: SentenceTransformer model
        specialty (str): Medical specialty to filter
        top_k (int): Number of results

    Returns:
        list: Filtered results
    """
    # Getingg more results initially for filtering
    initial_k = top_k * 3

    query_embedding = model.encode([query], normalize_embeddings=True)
    query_embedding = query_embedding.astype('float32')

    # Search index
    distances, indices = index.search(query_embedding, initial_k)

    # Filter by specialty
    results = []
    for idx, score in zip(indices[0], distances[0]):
        if metadata.iloc[idx]['specialty'] == specialty:
            result = {
                'pmid': metadata.iloc[idx]['pmid'],
                'title': metadata.iloc[idx]['title'],
                'abstract': metadata.iloc[idx]['abstract'],
                'specialty': metadata.iloc[idx]['specialty'],
                'journal': metadata.iloc[idx]['journal'],
                'similarity_score': float(score)
            }
            results.append(result)

            if len(results) >= top_k:
                break

    return results


def batch_search(queries, index, metadata, model, top_k=5):
    """
    Search multiple queries at once

    Args:
        queries (list): List of query strings
        index: FAISS index
        metadata (DataFrame): Document metadata
        model: SentenceTransformer model
        top_k (int): Number of results per query

    Returns:
        dict: Dictionary mapping queries to results
    """
    results_dict = {}

    for query in queries:
        results = search_medical_literature(query, index, metadata, model, top_k)
        results_dict[query] = results

    return results_dict

print(" Created: search_medical_literature(), search_by_specialty(), batch_search()")

Creating Retrieval Functions
 Created: search_medical_literature(), search_by_specialty(), batch_search()


In [33]:
print("Testing Retrieval Functions")
#Test-1: Basic Medical Search

query1 = "How is diabetes managed with insulin therapy?"
results1 = search_medical_literature(query1, index, metadata, model, top_k=3)

print(f"Query: {query1}\n")
for result in results1:
    print(f"Rank {result['rank']}. [{result['specialty']}] Score: {result['similarity_score']:.4f}")
    print(f"   Title: {result['title']}")
    print(f"   PMID: {result['pmid']}")
    print()

# Test-2: Specialty-filtered search
query2 = "What are treatment options for heart failure?"
specialty_filter = "cardiology"
results2 = search_by_specialty(query2, index, metadata, model, specialty_filter, top_k=3)

print(f"Query: {query2}")
print(f"Filter: {specialty_filter}\n")

if len(results2) > 0:
    for i, result in enumerate(results2, 1):
        print(f"{i}. [{result['specialty']}] Score: {result['similarity_score']:.4f}")
        print(f"   Title: {result['title']}")
        print(f"   PMID: {result['pmid']}")
        print()
else:
    print(f"No results found for specialty: {specialty_filter}")

# Test 3: Batch search
batch_queries = [
    "What causes type 2 diabetes?",
    "How do vaccines work?",
    "What is atrial fibrillation?"
]

batch_results = batch_search(batch_queries, index, metadata, model, top_k=2)

for query, results in batch_results.items():
    print(f"\nQuery: {query}")
    print(f"Top result: {results[0]['title'][:80]}...")
    print(f"Similarity: {results[0]['similarity_score']:.4f}")

Testing Retrieval Functions
Query: How is diabetes managed with insulin therapy?

Rank 1. [diabetes] Score: 0.9464
   Title: Newer therapeutic approaches towards the management of diabetes mellitus: an update.
   PMID: 31663302

Rank 2. [diabetes] Score: 0.9413
   Title: Novel Approaches to Control Diabetes.
   PMID: 37559237

Rank 3. [diabetes] Score: 0.9357
   Title: A critical review on diabetes mellitus type 1 and type 2 management approaches: from lifestyle modification to current and novel targets and therapeutic agents.
   PMID: 39493778

Query: What are treatment options for heart failure?
Filter: cardiology

1. [cardiology] Score: 0.9224
   Title: Comprehensive review of the heart failure management guidelines presented by the American College of Cardiology and the current supporting evidence.
   PMID: 39054773

2. [cardiology] Score: 0.9109
   Title: MEMS Technology in Cardiology: Advancements and Applications in Heart Failure Management Focusing on the CardioMEMS Device.
   

In [34]:
print("Saving FAISS Index and Retrieval System")

# Creating output directory
vector_db_dir = 'vector_database'
os.makedirs(vector_db_dir, exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d')

# Saving FAISS index
index_file = f'{vector_db_dir}/medical_faiss_index_{timestamp}.index'
faiss.write_index(index, index_file)
print(f" FAISS index saved: {index_file}")

# Storing complete retrieval system
retrieval_system = {
    'index_file': index_file,
    'metadata': metadata,
    'model_name': model_name,
    'embedding_dim': embedding_dim,
    'num_documents': len(embeddings),
    'index_type': type(index).__name__,
    'creation_date': timestamp
}

system_file = f'{vector_db_dir}/retrieval_system_{timestamp}.pkl'
with open(system_file, 'wb') as f:
    pickle.dump(retrieval_system, f)
print(f" Retrieval system saved: {system_file}")

# Saving configurations
config_file = f'{vector_db_dir}/vector_db_config_{timestamp}.txt'
with open(config_file, 'w') as f:
    f.write("MEDICAL RAG SYSTEM - VECTOR DATABASE CONFIGURATION\n")
    f.write(f"Creation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Index Type: {type(index).__name__}\n")
    f.write(f"Model Name: {model_name}\n")
    f.write(f"Embedding Dimension: {embedding_dim}\n")
    f.write(f"Number of Documents: {len(embeddings)}\n")
    f.write(f"Vectors in Index: {index.ntotal}\n")
    f.write(f"\nSpecialty Distribution:\n")
    for specialty, count in metadata['specialty'].value_counts().items():
        f.write(f"  - {specialty}: {count} documents\n")

print(f" Configuration saved: {config_file}")

print("VECTOR DATABASE SETUP COMPLETE!")
print(f"\nOutput files in '{vector_db_dir}/' directory:")
print(f"  1. {index_file}")
print(f"  2. {system_file}")
print(f"  3. {config_file}")

Saving FAISS Index and Retrieval System
 FAISS index saved: vector_database/medical_faiss_index_20251102.index
 Retrieval system saved: vector_database/retrieval_system_20251102.pkl
 Configuration saved: vector_database/vector_db_config_20251102.txt
VECTOR DATABASE SETUP COMPLETE!

Output files in 'vector_database/' directory:
  1. vector_database/medical_faiss_index_20251102.index
  2. vector_database/retrieval_system_20251102.pkl
  3. vector_database/vector_db_config_20251102.txt


In [35]:
print("Creating Vector Database Utils")
vector_db_utils_code = '''
# vector_db_utils.py

import faiss
import pickle
import glob
import os
from sentence_transformers import SentenceTransformer

class MedicalVectorDB:
    """Medical Literature Vector Database with FAISS"""

    def __init__(self, vector_db_dir='vector_database'):
        """Initialize and load the vector database"""
        self.vector_db_dir = vector_db_dir
        self.index = None
        self.metadata = None
        self.model = None
        self.model_name = None

    def load(self):
        """Load the most recent vector database"""
        # Find most recent system file
        system_files = glob.glob(f'{self.vector_db_dir}/retrieval_system_*.pkl')

        if not system_files:
            raise FileNotFoundError(f"No retrieval system found in {self.vector_db_dir}/")

        latest_file = max(system_files, key=os.path.getctime)

        print(f"Loading retrieval system from: {latest_file}")

        with open(latest_file, 'rb') as f:
            system = pickle.load(f)

        # Load FAISS index
        self.index = faiss.read_index(system['index_file'])
        self.metadata = system['metadata']
        self.model_name = system['model_name']

        print(f"✓ Loaded {system['num_documents']} documents")
        print(f"✓ Index type: {system['index_type']}")

        # Load model
        print(f"Loading model: {self.model_name}...")
        self.model = SentenceTransformer(self.model_name)
        print("Model loaded")

        return self

    def search(self, query, top_k=5):
        """Search for relevant documents"""
        if self.index is None:
            raise ValueError("Database not loaded. Call load() first.")

        # Generate query embedding
        query_embedding = self.model.encode([query], normalize_embeddings=True)
        query_embedding = query_embedding.astype('float32')

        # Search
        distances, indices = self.index.search(query_embedding, top_k)

        # Format results
        results = []
        for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), 1):
            result = {
                'rank': rank,
                'pmid': self.metadata.iloc[idx]['pmid'],
                'title': self.metadata.iloc[idx]['title'],
                'abstract': self.metadata.iloc[idx]['abstract'],
                'specialty': self.metadata.iloc[idx]['specialty'],
                'journal': self.metadata.iloc[idx]['journal'],
                'publication_date': self.metadata.iloc[idx]['publication_date'],
                'similarity_score': float(score)
            }
            results.append(result)

        return results

    def search_by_specialty(self, query, specialty, top_k=5):
        """Search filtered by specialty"""
        # Get more results for filtering
        initial_results = self.search(query, top_k=top_k*3)

        # Filter by specialty
        filtered = [r for r in initial_results if r['specialty'] == specialty]

        return filtered[:top_k]

    def get_document_by_pmid(self, pmid):
        """Retrieve document by PMID"""
        doc = self.metadata[self.metadata['pmid'] == pmid]

        if len(doc) == 0:
            return None

        return doc.iloc[0].to_dict()

    def get_statistics(self):
        """Get database statistics"""
        stats = {
            'total_documents': len(self.metadata),
            'specialties': self.metadata['specialty'].value_counts().to_dict(),
            'unique_journals': self.metadata['journal'].nunique(),
            'date_range': f"{self.metadata['publication_date'].min()} to {self.metadata['publication_date'].max()}"
        }
        return stats


def quick_search(query, top_k=5):
    """Quick search function - loads DB and searches in one call"""
    db = MedicalVectorDB()
    db.load()
    return db.search(query, top_k)
'''

with open('vector_db_utils.py', 'w') as f:
    f.write(vector_db_utils_code)

print(" Created vector_db_utils.py")

Creating Vector Database Utils
 Created vector_db_utils.py


In [36]:
print("COMPREHENSIVE VECTOR DATABASE TESTING")

# Testing performance
import time
test_queries_performance = [
    "What are the symptoms of diabetes?",
    "How is COVID-19 transmitted?",
    "What treatments exist for heart failure?",
    "What causes high blood pressure?",
    "How do antibiotics work?"
]

print(f"\nTesting search speed with {len(test_queries_performance)} queries")

start_time = time.time()

for query in test_queries_performance:
    results = search_medical_literature(query, index, metadata, model, top_k=5)

end_time = time.time()
total_time = end_time - start_time
avg_time = total_time / len(test_queries_performance)

print(f"\n Performance Results:")
print(f"  Total time: {total_time:.2f} seconds")
print(f"  Average time per query: {avg_time:.3f} seconds")
print(f"  Queries per second: {1/avg_time:.2f}")

# Accuracy test
relevance_queries = {
    "diabetes": "What is the role of insulin in diabetes management?",
    "cardiology": "What are risk factors for myocardial infarction?",
    "infectious_diseases": "How effective are COVID-19 vaccines?"
}

print("\nTesting specialty-specific retrieval accuracy\n")

for expected_specialty, query in relevance_queries.items():
    results = search_medical_literature(query, index, metadata, model, top_k=5)

    # Count how many results match expected specialty
    matching = sum(1 for r in results if r['specialty'] == expected_specialty)
    accuracy = (matching / len(results)) * 100

    print(f"Query: {query}")
    print(f"Expected specialty: {expected_specialty}")
    print(f"Results matching specialty: {matching}/5 ({accuracy:.0f}%)")
    print(f"Top result: {results[0]['title'][:60]}...")
    print(f"Similarity: {results[0]['similarity_score']:.4f}\n")

print("COMPLETED VECTOR DATABASE SETUP AND TESTING")

COMPREHENSIVE VECTOR DATABASE TESTING

Testing search speed with 5 queries

 Performance Results:
  Total time: 0.46 seconds
  Average time per query: 0.092 seconds
  Queries per second: 10.91

Testing specialty-specific retrieval accuracy

Query: What is the role of insulin in diabetes management?
Expected specialty: diabetes
Results matching specialty: 5/5 (100%)
Top result: Diabetes mellitus: an overview of the types, symptoms, compl...
Similarity: 0.9406

Query: What are risk factors for myocardial infarction?
Expected specialty: cardiology
Results matching specialty: 2/5 (40%)
Top result: Update on Preventive Cardiology....
Similarity: 0.9036

Query: How effective are COVID-19 vaccines?
Expected specialty: infectious_diseases
Results matching specialty: 3/5 (60%)
Top result: Why and How Vaccines Work....
Similarity: 0.9408

COMPLETED VECTOR DATABASE SETUP AND TESTING


## **Pipeline**

In [1]:
from vector_db_utils import MedicalVectorDB
from ragpipeline import Medical_RAGPipeline
import os



In [2]:
# Seting up API Key
import os
from getpass import getpass

gemini_api_key = getpass("Enter your Gemini API key: ")
os.environ['GEMINI_API_KEY'] = gemini_api_key

print("API key configured")


API key configured


In [3]:
#loading vector databases
vector_db = MedicalVectorDB(vector_db_dir='vector_database')
vector_db.load()

Loading retrieval system from: vector_database/retrieval_system_20251102.pkl
 Loaded 1954 documents
 Index type: IndexFlatIP
Loading model: pritamdeka/S-PubMedBert-MS-MARCO...
Model loaded


<vector_db_utils.MedicalVectorDB at 0x311da9090>

In [4]:
# Initializing RAG Pipeline

pipeline = Medical_RAGPipeline(
    vector_db=vector_db,
    api_key=gemini_api_key,
    model='gemini-2.5-flash',
    top_k=3,
    similarity_threshold=0.5
)

 Gemini model loaded: gemini-2.5-flash
 RAG Pipeline initialized with Gemini


In [5]:
test_query = "What are the early symptoms of type 2 diabetes mellitus?"

result = pipeline.query(test_query)

# Display result
pipeline.print_result(result)



-----> NO RISKS <-------
Query: What are the early symptoms of type 2 diabetes mellitus?
1. Retrieving documents
   Found 3 relevant documents
2. Formatting context
3. Generating answer with Gemini
 Answer generated
4. Extracting citations
  0 citations found

 Query #1 

Question:
What are the early symptoms of type 2 diabetes mellitus?

Answer:
The provided medical literature does not describe the early symptoms of type 2 diabetes mellitus. The documents discuss the definition, prevalence, pathophysiology, complications, and management of type 2 diabetes, but do not detail its initial clinical presentation or symptoms [PMID: 35192474, PMID: 31736194, PMID: 34319011].

Source Documents (3):

  1. Uncommon forms of diabetes....
     PMID: 35192474 | Similarity: 0.948

  2. Insights on the current status and advancement of diabetes mellitus type 2 and t...
     PMID: 31736194 | Similarity: 0.938

  3. Emerging Targets in Type 2 Diabetes and Diabetic Complications....
     PMID: 343190

In [7]:
test_query2 = " How is atrial fibrillation treated?"

result = pipeline.query(test_query2)

# Display result
pipeline.print_result(result)



-----> NO RISKS <-------
Query:  How is atrial fibrillation treated?
1. Retrieving documents
   Found 3 relevant documents
2. Formatting context
3. Generating answer with Gemini
 Answer generated
4. Extracting citations
  0 citations found

 Query #2 

Question:
 How is atrial fibrillation treated?

Answer:
Error generating answer: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.

Source Documents (3):

  1. The 2024 European Society of Cardiology Guidelines for Diagnosis and Management ...
     PMID: 39374908 | Similarity: 0.922

  2. Reprint of: Scientific statement from the French neurovascular and cardiac socie...
     PMID: 39510937 | Similarity: 0.911

  3. Design and deployment of the STEEER-AF trial to evaluate and improve guideline a...
     PMID: 38940494 | Similarity: 0.905

Done!!


In [6]:
# Generate trustworthiness report
trust_report = pipeline.trustworthiness_verifier.generate_trustworthiness_report(result)
print(trust_report)


TRUSTWORTHINESS REPORT

Citation Verification:
   Total Citations: 3
   Valid Citations: 0
   Citation Accuracy: 0.0%
   Invalid Citations: ['31736194', '35192474', '34319011']

Transparency Score: 60.0% (3/5)
Issues: Missing citations, No uncertainty indicators

Source Quality:
  • Number of sources: 3
  • Average similarity: 0.941
  • Specialties: diabetes

Reproducibility:  All sources verifiable via PMID



## **MISC**

In [None]:
'''
print("="*60)
print("MEDICAL RAG SYSTEM - DATA COLLECTION STAGE")
print("RISK MANAGEMENT & TRUSTWORTHINESS IMPLEMENTATION")
print("="*60)

# Load collected data
df = pd.read_csv('medical_literature_dataset.csv')

print(f"\nLoaded {len(df)} articles from PubMED\n")

# Step 1: Data Quality Validation
quality_report, quality_issues = validate_data_quality(df)

# Step 2: Data Cleaning
df_clean = clean_medical_data(df)

# Step 3: Bias Detection
bias_report, biases = detect_medical_bias(df_clean)

# Step 4: Privacy Compliance
privacy_findings, privacy_issues = check_privacy_compliance(df_clean)

# Step 5: Representativeness Analysis
rep_analysis = analyze_data_representativeness(df_clean)

# Step 6: Generate Comprehensive Report
generate_data_collection_report(
    df_clean,
    quality_report,
    bias_report,
    privacy_findings,
    rep_analysis
)

# Save final cleaned dataset
df_clean.to_csv('medical_literature_final.csv', index=False)
print("✓ Final cleaned dataset saved: medical_literature_final.csv")

print("\n" + "="*60)
print("RISK MANAGEMENT & TRUSTWORTHINESS IMPLEMENTATION COMPLETE")
print("="*60)'''

'\nprint("="*60)\nprint("MEDICAL RAG SYSTEM - DATA COLLECTION STAGE")\nprint("RISK MANAGEMENT & TRUSTWORTHINESS IMPLEMENTATION")\nprint("="*60)\n\n# Load collected data\ndf = pd.read_csv(\'medical_literature_dataset.csv\')\n\nprint(f"\nLoaded {len(df)} articles from PubMED\n")\n\n# Step 1: Data Quality Validation\nquality_report, quality_issues = validate_data_quality(df)\n\n# Step 2: Data Cleaning\ndf_clean = clean_medical_data(df)\n\n# Step 3: Bias Detection\nbias_report, biases = detect_medical_bias(df_clean)\n\n# Step 4: Privacy Compliance\nprivacy_findings, privacy_issues = check_privacy_compliance(df_clean)\n\n# Step 5: Representativeness Analysis\nrep_analysis = analyze_data_representativeness(df_clean)\n\n# Step 6: Generate Comprehensive Report\ngenerate_data_collection_report(\n    df_clean,\n    quality_report,\n    bias_report,\n    privacy_findings,\n    rep_analysis\n)\n\n# Save final cleaned dataset\ndf_clean.to_csv(\'medical_literature_final.csv\', index=False)\nprint("✓