<a href="https://colab.research.google.com/github/Ismail-therap/Agentic-AI-application-Validate-PubMed-Research/blob/main/Agentic_AI_application_in_Medical_Field_research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project title:

An agentic LLM generates hypotheses about how air pollutants or diet may influence depression and validates them using NHANES data.

# PubMed search (NCBI Entrez): literature for generating the hypotheses

In [2]:
!pip install biopython

from Bio import Entrez
import pandas as pd
import time

# Step 1: Set your email (required by NCBI)
Entrez.email = "statistician71@gmail.com"

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
# Step 2: Search PubMed for matching article IDs
def search_pubmed(query, max_results=10):
    """Run an Entrez ``esearch`` against the ``pubmed`` database.

    Parameters
    ----------
    query : str
        Search expression, passed to ``esearch`` as ``term``.
    max_results : int, optional
        Upper bound on returned IDs, passed to ``esearch`` as ``retmax``.

    Returns
    -------
    The ``"IdList"`` entry of the parsed ``esearch`` response.

    Raises
    ------
    Whatever ``Entrez.esearch`` or ``Entrez.read`` raise; the handle is
    closed before the exception propagates.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()
    return record["IdList"]

In [4]:
# Step 3: Fetch article metadata
def fetch_details(id_list):
    """Fetch title, abstract, journal and year for a list of PubMed IDs.

    Parameters
    ----------
    id_list : iterable of str
        PubMed IDs; they are comma-joined into a single ``efetch`` request.

    Returns
    -------
    pandas.DataFrame
        One row per usable article, with columns ``Title``, ``Abstract``,
        ``Journal`` and ``Year`` (always present, even when there are no
        rows). Articles missing any required field, most commonly the
        abstract, are skipped. ``Year`` is ``'N/A'`` when the publication
        date has no ``Year`` entry. All ``AbstractText`` sections are joined
        with a single space, so structured abstracts are kept whole.
    """
    columns = ["Title", "Abstract", "Journal", "Year"]

    ids = ",".join(id_list)
    if not ids:
        # Nothing to look up: skip the request instead of sending an empty id.
        return pd.DataFrame(columns=columns)

    handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

    paper_data = []
    for article in records['PubmedArticle']:
        try:
            citation = article['MedlineCitation']['Article']
            title = citation['ArticleTitle']
            sections = citation['Abstract']['AbstractText']
            journal = citation['Journal']['Title']
            year = citation['Journal']['JournalIssue']['PubDate'].get('Year', 'N/A')
        except (KeyError, IndexError, TypeError):
            # Best effort: a record without the expected fields is skipped,
            # but unrelated errors (and Ctrl-C) are no longer swallowed.
            continue

        if isinstance(sections, str):
            abstract = sections
        else:
            abstract = " ".join(str(part) for part in sections)
        if not abstract:
            # An empty abstract carries no information; skip the record.
            continue

        paper_data.append({
            "Title": title,
            "Abstract": abstract,
            "Journal": journal,
            "Year": year
        })

    return pd.DataFrame(paper_data, columns=columns)

In [5]:
# Step 4: Run pipeline
def get_pubmed_papers(query="depression AND environmental exposure", max_results=20):
    """Search PubMed for ``query`` and return the details of the hits.

    Calls ``search_pubmed`` for the IDs, waits one second, then hands the
    IDs to ``fetch_details`` and returns its result unchanged.
    """
    pmids = search_pubmed(query, max_results)
    # Pause between the search and the fetch to respect NCBI API rate limits.
    time.sleep(1)
    papers = fetch_details(pmids)
    return papers

In [8]:

# Example run: request up to 1000 PubMed hits for the PM2.5 / depression query
# and preview the first rows of the resulting table.
df = get_pubmed_papers("depression AND PM2.5", 1000)
print(df.head(n=5))

                                               Title  \
0  Depressed nestling growth during exposure to s...   
1  Frailty mediated the associations of fine part...   
2  PM2.5 Exposure Triggers Hypothalamic Oxidative...   
3  Research Progress on the Correlation Between A...   
4  Ecological analysis of air particulate matter ...   

                                            Abstract  \
0  Human and animal populations increasingly enco...   
1  The role of frailty in the associations of fin...   
2  Epidemiological studies have linked fine dust ...   
3  Autism spectrum disorder (ASD) is a neurodevel...   
4  Adolescent depression is a health issue influe...   

                                             Journal  Year  
0                                 Scientific reports  2025  
1  The journals of gerontology. Series A, Biologi...  2025  
2        International journal of molecular sciences  2024  
3                Journal of applied toxicology : JAT  2024  
4                    

## To do:

Use an LLM to generate hypotheses on how air pollutants or diet may influence depression.

(97, 4)

# Validate the hypothesis using NHANES Data

In [12]:
!pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Downloading pyreadstat-1.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.2.8


In [14]:
import pandas as pd
import pyreadstat  # or use pandas' read_sas

# NHANES XPT files are expected in the working directory (download them first).
demo_file = "DEMO_L.xpt"  # demographics
# Not loaded yet: the depression screener (PHQ-9) file.
#dpq_file = "DPQ_J.XPT"

# Read the demographics table, decoding text as 'latin1'; the second value
# returned by read_xport is discarded.
demo_df, _ = pyreadstat.read_xport(demo_file, encoding="latin1")
# The screener file could be read the same way (consider 'latin1' there too):
#dpq_df, _ = pyreadstat.read_xport(dpq_file, encoding="latin1")

# Show the first rows as a quick sanity check.
print(demo_df.head(n=5))
#print(dpq_df.head())

       SEQN  SDDSRVYR  RIDSTATR  RIAGENDR  RIDAGEYR  RIDAGEMN  RIDRETH1  \
0  130378.0      12.0       2.0       1.0      43.0       NaN       5.0   
1  130379.0      12.0       2.0       1.0      66.0       NaN       3.0   
2  130380.0      12.0       2.0       2.0      44.0       NaN       2.0   
3  130381.0      12.0       2.0       2.0       5.0       NaN       5.0   
4  130382.0      12.0       2.0       1.0       2.0       NaN       3.0   

   RIDRETH3  RIDEXMON  RIDEXAGM  ...  DMDHRGND  DMDHRAGZ  DMDHREDZ  DMDHRMAZ  \
0       6.0       2.0       NaN  ...       NaN       NaN       NaN       NaN   
1       3.0       2.0       NaN  ...       NaN       NaN       NaN       NaN   
2       2.0       1.0       NaN  ...       NaN       NaN       NaN       NaN   
3       7.0       1.0      71.0  ...       2.0       2.0       2.0       3.0   
4       3.0       2.0      34.0  ...       2.0       2.0       3.0       1.0   

   DMDHSEDZ      WTINT2YR      WTMEC2YR  SDMVSTRA  SDMVPSU  INDFMPIR