# PubMed Abstract Downloader

This notebook retrieves PubMed abstracts for the 1000 GWAS studies (PubMed IDs) that have the largest number of rows in the significant-associations table, using the NCBI Entrez API.


## 1. Load GWAS summary data and select top PMIDs


In [None]:
import pandas as pd 
import os 

# Input table (relative to the project root) and number of studies to keep.
DATA_FILE = "data/significant_associations.csv"
TOP_N = 1000

# Make sure to be in the right directory.
# Step up one level only when the data file is not reachable from the current
# directory. The guard keeps this cell idempotent: re-running it no longer
# climbs one directory higher each time. (os.chdir returns None, so its result
# is not printed.)
print(os.getcwd())
if not os.path.exists(DATA_FILE):
    os.chdir("..")
print(os.getcwd())

df = pd.read_csv(DATA_FILE)

# Rank PubMed IDs by how many rows reference them and keep the TOP_N most
# frequent ones as a plain list of ints.
top_pmids = (
    df["PUBMEDID"]
    .dropna()
    .value_counts()
    .head(TOP_N)
    .index
    .astype(int)
    .tolist()
)


  df=pd.read_csv(r"data/significant_associations.csv")


## 2. Download abstracts from PubMed using Entrez


In [None]:
from Bio import Entrez
import time
import json

# Entrez API configuration.
# The contact e-mail and API key are read from the environment variables
# ENTREZ_EMAIL and ENTREZ_API_KEY so that credentials never have to be typed
# into (and committed with) the notebook. Both fall back to the empty-string
# placeholders when the variables are unset.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "")
Entrez.api_key = os.environ.get("ENTREZ_API_KEY", "")

def fetch_abstract(pmid):
    """Fetch the plain-text abstract record for a single PubMed ID.

    Args:
        pmid: PubMed identifier; it is converted with ``str()`` before being
            sent to ``Entrez.efetch``.

    Returns:
        The text returned by the request with leading/trailing whitespace
        stripped, or ``None`` if fetching or reading raised an exception.
        Failures are printed instead of propagated so that one bad PMID does
        not abort a batch download.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=str(pmid), rettype="abstract", retmode="text")
        try:
            abstract = handle.read()
        finally:
            # Always release the handle, even when read() fails; otherwise
            # every call leaves an open handle behind.
            handle.close()
        return abstract.strip()
    except Exception as e:
        print(f"❌ PMID {pmid} failed: {e}")
        return None


## 3. Loop through PMIDs and fetch abstracts


In [None]:
# Collect abstracts keyed by PMID; PMIDs whose fetch failed or came back
# empty are left out of the mapping.
abstracts = {}
total = len(top_pmids)
for position, pmid in enumerate(top_pmids, start=1):
    print(f"🔄 Fetching PMID {pmid} ({position}/{total})")
    text = fetch_abstract(pmid)
    if text:
        abstracts[pmid] = text
    # Short pause between requests to stay polite to the API, even with a key.
    time.sleep(0.1)

print(f"✅ Retrieved {len(abstracts)} abstracts.")


## 4. Save abstract data as JSON


In [None]:
output_file = "data/raw/pubmed_abstracts_top1000.json"

# Create the target directory first: if data/raw/ is missing, open() would
# raise FileNotFoundError and the save would fail after the long download.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Note: JSON object keys are always strings, so the integer PMIDs used as
# dictionary keys are written out as strings.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(abstracts, f, indent=2)

print(f"✅ Abstracts saved to: {output_file}")
