# Publications

In [1]:
from Bio import Entrez
import pandas as pd
from tqdm import tqdm

df = pd.read_csv('./pubmed_health_articles.csv')
Entrez.email = "myrzakunbekovis@gmail.com"

In [3]:
import time

HEALTH_TOPICS = {
    "health": "longevity OR aging OR healthy aging OR biohacking OR anti-aging",
}

# Date ranges for comparison
DATE_RANGES = {
     "10-year data": ("2015/01/01", "2025/07/11"),
#     "pandemic": ("2020/01/01", "2022/12/31"),
#     "post_pandemic": ("2023/01/01", "2025/06/21")
}

def fetch_pubmed_data(topic_name, query, date_range, max_results=500):
    """Fetches PubMed articles with metadata and abstracts"""
    # search_term = f"({query}) AND {date_range[0]}:{date_range[1]}[PDAT]" #we can set the conditions in search term

    search_term  = f"({query}) AND {date_range[0]}:{date_range[1]}[PDAT]"
    
    # Step 1: Search PubMed
    # handles stores the raw XML data stream
    handle = Entrez.esearch(db="pubmed",
                            term=search_term,
                            retmax=max_results,
                            sort="relevance",  # Get most representative articles
                            usehistory="y") #saves the search on PubMed’s history server, so you can fetch results in pages.
    search_results = Entrez.read(handle) #reads and parses the search results
    handle.close()

    #metadata about the search
    #needed later to retreive search results without rerunning the code
    webenv = search_results["WebEnv"] #an identifier for my search session on pubMed's server
    query_key = search_results["QueryKey"] #a reference to specific query within session
    total = int(search_results["Count"]) #total number of results matching my query
    
    # print(f"Found {total} articles for {topic_name} ({date_range[0]} to {date_range[1]})")

    print(f"Found {total} articles for {topic_name}")

    # Step 2: Batch fetching (NCBI recommends batches of 100)
    batch_size = 100
    articles = []

    #loop runs from 0 up to the minimum between what the search found and the max number of results we wanted
    #tqdm is needed for graphical bar progress
    #the loop runs by fetching by 100 articles per one loop run
    for start in tqdm(range(0, min(total, max_results), batch_size)): 
        end = min(total, start + batch_size)
        handle = Entrez.efetch(db="pubmed",
                               retstart=start,
                               retmax=batch_size,
                               webenv=webenv,
                               query_key=query_key,
                               retmode="xml")
        data = Entrez.read(handle)
        articles += parse_articles(data)
        handle.close()
        time.sleep(0.3)  # Avoid overloading server
    
    return articles

def parse_articles(data):
    """Extracts key information from PubMed XML"""
    parsed = []
    for article in data['PubmedArticle']:
        try:
            pmid = article['MedlineCitation']['PMID'].strip()
            title = article['MedlineCitation']['Article']['ArticleTitle'].strip()
            
            # Abstract handling (some articles lack abstracts)
            abstract = ""
            if 'Abstract' in article['MedlineCitation']['Article']:
                abstract_parts = article['MedlineCitation']['Article']['Abstract']['AbstractText']
                abstract = " ".join([text for text in abstract_parts if isinstance(text, str)])
            
            # Journal and date info
            journal = article['MedlineCitation']['Article']['Journal']['Title']
            pub_date = article['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
            year = pub_date.get('Year', '')
            
            # MeSH terms for topic validation
            #Medical subject headings assigned to this article
            mesh_terms = []
            if 'MeshHeadingList' in article['MedlineCitation']:
                for item in article['MedlineCitation']['MeshHeadingList']:
                    mesh_terms.append(item['DescriptorName'])
            
            parsed.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "journal": journal,
                "year": year,
                "pub_date": str(pub_date),
                "mesh_terms": "; ".join(mesh_terms),
                "source": "PubMed"
            })
        except KeyError as e:
            continue
    
    return parsed

# Collect data for all combinations
all_data = []

for topic_name, query in HEALTH_TOPICS.items():
    for period, date_range in DATE_RANGES.items():
        articles = fetch_pubmed_data(f"{topic_name}_{period}", 
                                     query, 
                                     date_range,
                                     max_results=500)
        for article in articles:
            article["topic"] = topic_name
            article["period"] = period
        all_data.extend(articles)

# Convert to DataFrame and save
df = pd.DataFrame(all_data)
df.to_csv("pubmed_health_articles_500.csv", index=False)
print(f"Saved {len(df)} articles with {len(df) - df['abstract'].isna().sum()} abstracts")

Found 377010 articles for health_10-year data


100%|█████████████████████████████████████████████| 5/5 [00:38<00:00,  7.66s/it]

Saved 500 articles with 500 abstracts





In [None]:
data = {
    'PubmedArticle': [
        {
            'MedlineCitation': {
                'PMID': '12345678',
                'Article': {
                    'ArticleTitle': 'Example title',
                    'Abstract': {
                        'AbstractText': ['This is the abstract text.']
                    },
                    'Journal': {
                        'Title': 'Example Journal',
                        'JournalIssue': {
                            'PubDate': {
                                'Year': '2022',
                                'Month': 'Nov',
                                'Day': '15'
                            }
                        }
                    }
                },
                'MeshHeadingList': [
                    {'DescriptorName': 'Vaccination'},
                    {'DescriptorName': 'Public Health'}
                ]
            },
            'PubmedData': {
                
            }
        },
        ...
    ]
}

In [2]:
df = df.drop(columns = ['topic', 'period'])

In [4]:
df.to_csv('./pubmed_health_articles.csv', index=False)

# News

In [6]:
"""
nyt_longevity_scraper.py
------------------------
NYT Archive API pull (2010‑present) filtered for longevity / healthy‑aging
keywords.   Requires: python-dotenv, requests, pandas, tqdm.
"""
import os, re, time, datetime as dt, requests, pandas as pd
from dotenv import load_dotenv                         # pip install python-dotenv
from tqdm import tqdm                                  # pip install tqdm

# ── 0.  CONFIG ────────────────────────────────────────────────────────────────
API_KEY = "9ivauV5adGK29eJ5Tq9q0ImuQ75HGnt2"                 # put your key in .env file
if not API_KEY:
    raise RuntimeError("Set NYT_API_KEY in your .env file")

BASE  = "https://api.nytimes.com/svc"
OUT_DIR = "nyt_monthly_csv"                            # where monthly files live
os.makedirs(OUT_DIR, exist_ok=True)

# keyword regex (add more if desired)
KEYWORDS = re.compile(
    r"\b(longevity|healthy[ -]ag(?:ing|e?ing)|biohacking|anti[ -]ag(?:ing|e?ing))\b",
    flags=re.I
)

session = requests.Session()
session.params = {"api-key": API_KEY}                  # append key automatically


# ── 1.  HELPERS ───────────────────────────────────────────────────────────────
def archive_month(year: int, month: int) -> pd.DataFrame:
    """Fetch one month; return *raw* DataFrame."""
    url = f"{BASE}/archive/v1/{year}/{month}.json"
    r = session.get(url, timeout=60)
    r.raise_for_status()
    docs = r.json()["response"]["docs"]
    return pd.json_normalize(docs)

def filter_keywords(df: pd.DataFrame) -> pd.DataFrame:
    """Keep rows whose text fields match KEYWORDS regex."""
    text = (
        df["abstract"].fillna("") + " " +
        df["lead_paragraph"].fillna("") + " " +
        df["snippet"].fillna("")
    )
    return df[text.str.contains(KEYWORDS, regex=True)]

def month_file(year: int, month: int) -> str:
    """Path for monthly CSV."""
    return f"{OUT_DIR}/nyt_{year}_{month:02d}.csv"


# ── 2.  MAIN LOOP ─────────────────────────────────────────────────────────────
def collect_nyt_aging(start_year: int = 2010):
    today = dt.datetime.today()
    months_total = (today.year - start_year) * 12 + today.month
    progress = tqdm(total=months_total, desc="NYT months")

    for year in range(start_year, today.year + 1):
        max_month = today.month if year == today.year else 12
        for month in range(1, max_month + 1):
            progress.update(1)
            fn = month_file(year, month)
            if os.path.exists(fn):                     # resume support
                continue

            try:
                df_raw  = archive_month(year, month)
                df_keep = filter_keywords(df_raw)

                if not df_keep.empty:
                    df_keep.to_csv(fn, index=False)
            except requests.HTTPError as e:
                print(f"HTTP error {year}-{month:02d}: {e}")
            except Exception as ex:
                print(f"Other error {year}-{month:02d}: {ex}")

            time.sleep(12.5)  # 5 calls/min ⇒ 12 s per call keeps us safe

    progress.close()


# ── 3.  CONCAT AFTER RUN ─────────────────────────────────────────────────────
def concat_all() -> pd.DataFrame:
    frames = [
        pd.read_csv(os.path.join(OUT_DIR, f))
        for f in os.listdir(OUT_DIR)
        if f.endswith(".csv")
    ]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# ── 4.  RUN ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    collect_nyt_aging(start_year=2010)
    df_all = concat_all()
    df_all.to_csv("nyt_longevity_2010‑present.csv", index=False)
    print(f"✔  Saved {len(df_all):,} filtered NYT articles to nyt_longevity_2010‑present.csv")


  return df[text.str.contains(KEYWORDS, regex=True)]
NYT months:  99%|██████████████████████████▊| 186/187 [1:32:44<00:27, 27.38s/it]

HTTP error 2025-06: 403 Client Error: Forbidden for url: https://api.nytimes.com/svc/archive/v1/2025/6.json?api-key=9ivauV5adGK29eJ5Tq9q0ImuQ75HGnt2


NYT months: 100%|███████████████████████████| 187/187 [1:32:57<00:00, 22.99s/it]

HTTP error 2025-07: 403 Client Error: Forbidden for url: https://api.nytimes.com/svc/archive/v1/2025/7.json?api-key=9ivauV5adGK29eJ5Tq9q0ImuQ75HGnt2


NYT months: 100%|███████████████████████████| 187/187 [1:33:10<00:00, 29.89s/it]


✔  Saved 386 filtered NYT articles to nyt_longevity_2010‑present.csv
