# MR Literature Corpus Analysis

This notebook visualizes the temporal characteristics of the Mendelian
Randomization (MR) literature dataset used in MR-KG.

**Data sources:**
- PubMed metadata from `vector_store.db` (mr_pubmed_data table)
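- Temporal statistics from the summary statistics pipeline (`temporal-statistics.csv`)
- Raw PubMed records from `data/raw/mr-pubmed-data/mr-pubmed-data.json` (used in the sample-period section)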

**Coverage:** ~15,635 papers from the MR literature

## Setup

In [8]:
import json
from pathlib import Path

import altair as alt
import duckdb
import pandas as pd
from yiutils.project_utils import find_project_root

# ---- Project paths ----
# NOTE(review): find_project_root presumably walks up to the directory that
# contains docker-compose.yml -- confirm against yiutils.
PROJECT_ROOT = find_project_root("docker-compose.yml")
DATA_DIR = PROJECT_ROOT / "data"
DB_DIR = DATA_DIR / "db"
PROCESSED_DIR = DATA_DIR / "processed"
OVERALL_STATS_DIR = PROCESSED_DIR / "overall-stats"

# ---- Altair configuration ----
# max_rows=None disables the row cap on datasets embedded in a chart.
alt.data_transformers.enable("default", max_rows=None)
alt.themes.enable("default")

# Echo the resolved locations so a misconfigured checkout is obvious early.
for label, location in (
    ("Project root", PROJECT_ROOT),
    ("Data directory", DATA_DIR),
    ("Database directory", DB_DIR),
):
    print(f"{label}: {location}")


Project root: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg
Data directory: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg/data
Database directory: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg/data/db


## Data Loading

In [10]:
# ---- Load temporal statistics from summary stats ----
temporal_stats_file = OVERALL_STATS_DIR / "temporal-statistics.csv"

# Fail fast, pointing at the command that generates the missing file.
if not temporal_stats_file.exists():
    missing_stats_msg = (
        f"Temporal statistics file not found: {temporal_stats_file}\n"
        "Run 'just generate-all-summary-stats' in the processing directory first."
    )
    raise FileNotFoundError(missing_stats_msg)

temporal_stats = pd.read_csv(temporal_stats_file)
n_stat_years = len(temporal_stats)  # one row per publication year
print(f"Loaded temporal statistics: {n_stat_years} years")

print("\nFirst few rows:")
display(temporal_stats.head())

print("\nSummary statistics:")
display(temporal_stats.describe())


Loaded temporal statistics: 24 years

First few rows:


Unnamed: 0,publication_year,paper_count,cumulative_papers,model_results_count,unique_models_count
0,2003,2,2.0,10,5
1,2004,1,3.0,5,5
2,2005,11,14.0,48,6
3,2006,8,22.0,22,5
4,2007,16,38.0,60,6



Summary statistics:


Unnamed: 0,publication_year,paper_count,cumulative_papers,model_results_count,unique_models_count
count,24.0,24.0,24.0,24.0,24.0
mean,2014.5,651.458333,3135.833333,2100.083333,5.625
std,7.071068,1304.098022,5195.847863,3922.775259,0.875388
min,2003.0,1.0,2.0,5.0,2.0
25%,2008.75,16.75,81.25,60.0,5.75
50%,2014.5,107.5,530.5,380.5,6.0
75%,2020.25,614.5,3115.25,1836.25,6.0
max,2026.0,5462.0,15635.0,14622.0,6.0


In [4]:
# ---- Query vector store database directly ----
vector_db_path = DB_DIR / "vector_store.db"

if not vector_db_path.exists():
    raise FileNotFoundError(
        f"Vector store database not found: {vector_db_path}\n"
        "Ensure the database has been created via the processing pipeline."
    )

# Papers per publication year, taking the year from the first four characters
# of pub_date. Rows whose year falls outside 1990-2025 are excluded by the
# WHERE clause, so the total here can be smaller than the raw corpus size.
query = """
SELECT 
    CAST(SUBSTR(pub_date, 1, 4) AS INTEGER) as year,
    COUNT(*) as paper_count
FROM mr_pubmed_data
WHERE pub_date IS NOT NULL 
    AND LENGTH(pub_date) >= 4
    AND CAST(SUBSTR(pub_date, 1, 4) AS INTEGER) BETWEEN 1990 AND 2025
GROUP BY year
ORDER BY year
"""

# Open read-only and always release the handle, even if the query raises;
# otherwise a failed run leaves the database file held open by the kernel.
conn = duckdb.connect(str(vector_db_path), read_only=True)
try:
    pub_year_data = conn.execute(query).df()
finally:
    conn.close()

print(f"Loaded publication year data: {len(pub_year_data)} years")
print("\nFirst few rows:")
display(pub_year_data.head())

print("\nLast few rows:")
display(pub_year_data.tail())


Loaded publication year data: 23 years

First few rows:


Unnamed: 0,year,paper_count
0,2003,2
1,2004,1
2,2005,11
3,2006,8
4,2007,16



Last few rows:


Unnamed: 0,year,paper_count
18,2021,1161
19,2022,1649
20,2023,3653
21,2024,5462
22,2025,878


## Temporal Distribution - Line Chart (Log Scale)

In [19]:
# Ensure sorted and remove zeros for log scale (log of 0 is undefined)
pub_year_data = (
    pub_year_data.sort_values("year").query("paper_count > 0").copy()
)

# Min/max for the x-domain, taken from the data so the axis starts and ends
# exactly on the first and last observed years
min_year = int(pub_year_data["year"].min())
max_year = int(pub_year_data["year"].max())

# Build log tick values like 1, 3, 10, 30, 100, 300, ...
# The 1.2 factor leaves headroom so the decade just above the maximum count
# is still included when the maximum sits close to it.
ymax = float(pub_year_data["paper_count"].max())
log_ticks = []
k = 0
while (10**k) <= ymax * 1.2:
    base = 10**k
    log_ticks.extend((base, 3 * base))
    k += 1

base_chart = alt.Chart(pub_year_data).properties(width=800, height=420)

line_raw = base_chart.mark_line(point=True, color="#356bb3").encode(
    x=alt.X(
        "year:Q",
        title="Publication Year",
        scale=alt.Scale(domain=[min_year, max_year]),
        # Integer formatting with a minimum step of 1 avoids fractional years
        axis=alt.Axis(tickMinStep=1, format="d"),
    ),
    y=alt.Y(
        "paper_count:Q",
        title="Number of Papers (log scale)",
        scale=alt.Scale(type="log", base=10, nice=False, domainMin=1),
        axis=alt.Axis(values=log_ticks, format=","),
    ),
    tooltip=[
        alt.Tooltip("year:Q", title="Year", format="d"),
        alt.Tooltip("paper_count:Q", title="Papers", format=","),
    ],
)

# Labels for every point, drawn below the marker (dy=26). The 1px black
# stroke outlines the glyphs.
# NOTE(review): a true halo would need a contrasting stroke colour -- confirm
# the intended look.
labels_all = base_chart.mark_text(
    align="center",
    dy=26,
    # color="#356bb3",
    stroke="black",
    strokeWidth=1,
    fontSize=10,
).encode(
    x="year:Q",
    y="paper_count:Q",
    text=alt.Text("paper_count:Q", format=","),
)

# Enable x-axis zoom/pan (Altair 5: add_params)
xzoom = alt.selection_interval(bind="scales", encodings=["x"])

line_chart = (
    alt.layer(line_raw, labels_all)
    .add_params(xzoom)
    .properties(
        title="MR papers by publication year (log scale)",
    )
    .configure_axis(gridColor="#e6e6e6", gridOpacity=0.7)
)

line_chart


## Temporal Distribution - Cumulative Growth

In [5]:
# ---- Calculate cumulative count ----
# assign() returns a new frame, so pub_year_data itself is left untouched.
# The running total follows row order.
pub_year_data_cumulative = pub_year_data.assign(
    cumulative_count=pub_year_data["paper_count"].cumsum()
)

print("Cumulative statistics:")
display(pub_year_data_cumulative.tail(10))


Cumulative statistics:


Unnamed: 0,year,paper_count,cumulative_count
13,2016,221,828
14,2017,260,1088
15,2018,369,1457
16,2019,545,2002
17,2020,823,2825
18,2021,1161,3986
19,2022,1649,5635
20,2023,3653,9288
21,2024,5462,14750
22,2025,878,15628


In [6]:
# ---- Create cumulative line chart ----
# `year` holds plain integer years. Encoding it as temporal (":T") makes
# Vega-Lite read each value as milliseconds since the Unix epoch, so every
# point would be placed in 1970. Encode it as quantitative with integer
# formatting instead, as the log-scale line chart does.
cumulative_year_domain = [
    int(pub_year_data_cumulative["year"].min()),
    int(pub_year_data_cumulative["year"].max()),
]

cumulative_chart = (
    alt.Chart(pub_year_data_cumulative)
    .mark_line(point=True, strokeWidth=2)
    .encode(
        x=alt.X(
            "year:Q",
            title="Publication Year",
            # An explicit domain stops the quantitative scale from being
            # stretched to include zero.
            scale=alt.Scale(domain=cumulative_year_domain),
            axis=alt.Axis(tickMinStep=1, format="d"),
        ),
        y=alt.Y("cumulative_count:Q", title="Cumulative Paper Count"),
        tooltip=[
            alt.Tooltip("year:Q", title="Year", format="d"),
            alt.Tooltip("cumulative_count:Q", title="Cumulative Papers"),
        ],
    )
    .properties(
        width=700,
        height=400,
        title="Cumulative Growth of MR Literature",
    )
)

cumulative_chart


## Summary Statistics

In [7]:
# ---- Calculate summary statistics ----
paper_counts = pub_year_data["paper_count"]
years = pub_year_data["year"]

total_papers = paper_counts.sum()
earliest_year = years.min()
latest_year = years.max()
year_span = latest_year - earliest_year + 1  # inclusive of both end years
avg_papers_per_year = total_papers / year_span

# Papers in last 5 years vs earlier: the window is the latest year plus the
# four years before it.
recent_cutoff = latest_year - 4
recent_papers = pub_year_data.loc[years >= recent_cutoff, "paper_count"].sum()
earlier_papers = pub_year_data.loc[years < recent_cutoff, "paper_count"].sum()

# ---- Display formatted summary ----
# Each (metric, value) pair is kept together so labels and values cannot
# drift out of alignment.
summary_rows = [
    ("Total Papers", f"{total_papers:,}"),
    ("Year Range", f"{earliest_year} - {latest_year}"),
    ("Year Span", f"{year_span} years"),
    ("Average Papers per Year", f"{avg_papers_per_year:.1f}"),
    (
        f"Papers in Last 5 Years ({recent_cutoff}-{latest_year})",
        f"{recent_papers:,}",
    ),
    (f"Papers Before {recent_cutoff}", f"{earlier_papers:,}"),
    (
        "Percentage in Last 5 Years",
        f"{(recent_papers / total_papers * 100):.1f}%",
    ),
]
summary_data = {
    "Metric": [metric for metric, _ in summary_rows],
    "Value": [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
display(summary_df)


Unnamed: 0,Metric,Value
0,Total Papers,15628
1,Year Range,2003 - 2025
2,Year Span,23 years
3,Average Papers per Year,679.5
4,Papers in Last 5 Years (2021-2025),12803
5,Papers Before 2021,2825
6,Percentage in Last 5 Years,81.9%


## Export Visualizations (Optional)

Run the cells below to save plots for manuscript use. The cumulative-chart export is commented out; uncomment it before running.

In [22]:
# ---- Save line chart ----
# Figures are written under the processed-data tree; create it on first use.
output_dir = PROCESSED_DIR / "figures" / "literature"
output_dir.mkdir(parents=True, exist_ok=True)

line_chart_pdf = output_dir / "temporal_distribution_line.pdf"
line_chart.save(str(line_chart_pdf), scale_factor=2.0)


  pdf = vlc.vegalite_to_pdf(


In [None]:
# # ---- Save cumulative chart ----
# # Optional export, disabled by default. Uses `output_dir`, which is created
# # in the line-chart export cell, so run that cell first.
# cumulative_chart.save(str(output_dir / "temporal_distribution_cumulative.json"))
# print(f"Saved cumulative chart to {output_dir / 'temporal_distribution_cumulative.json'}")


## Other Topics

### Sample Period

In [10]:
# ---- Load MR PubMed data from JSON ----
json_path = (
    PROJECT_ROOT / "data" / "raw" / "mr-pubmed-data" / "mr-pubmed-data.json"
)

if not json_path.exists():
    raise FileNotFoundError(f"JSON file not found: {json_path}")

# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which would mis-decode non-ASCII text on some systems.
with open(json_path, "r", encoding="utf-8") as f:
    mr_data = json.load(f)

print(f"Loaded {len(mr_data)} papers from JSON")

# ---- Extract publication dates and find sample period ----
# Keep only records that carry a non-empty pub_date.
pub_dates = [paper["pub_date"] for paper in mr_data if paper.get("pub_date")]

# Always define dates_series (possibly empty) so later cells that use it do
# not hit a NameError when no dates are present. Unparsable values become NaT
# and are dropped.
dates_series = pd.to_datetime(pub_dates, errors="coerce").dropna()

# Test the parsed result rather than the raw list: if every value failed to
# parse, min()/max() would be NaT and strftime would raise.
if len(dates_series) > 0:
    earliest_date = dates_series.min()
    latest_date = dates_series.max()

    earliest_month = earliest_date.strftime("%Y-%m")
    latest_month = latest_date.strftime("%Y-%m")

    print("\nSample Period:")
    print(f"  Earliest: {earliest_month}")
    print(f"  Latest: {latest_month}")
    print(f"  Total papers with dates: {len(dates_series):,}")
    print(f"\nDataset covers: {earliest_month} to {latest_month}")
else:
    print("No valid publication dates found")


Loaded 15635 papers from JSON

Sample Period:
  Earliest: 2003-04
  Latest: 2026-03
  Total papers with dates: 15,635

Dataset covers: 2003-04 to 2026-03


In [12]:
# ---- Count publications by year-month ----
# Label every parsed date with its "YYYY-MM" bucket.
year_month_labels = dates_series.strftime("%Y-%m")
dates_with_ym = pd.DataFrame(
    {"pub_date": dates_series, "year_month": year_month_labels}
)

# Tally papers per bucket; sorting the "YYYY-MM" labels puts them in
# chronological order.
month_counts = dates_with_ym["year_month"].value_counts().sort_index()
pub_by_month = month_counts.rename_axis("year_month").reset_index(
    name="paper_count"
)

n_months = len(pub_by_month)
monthly_counts = pub_by_month["paper_count"]
peak_month = pub_by_month.loc[monthly_counts.idxmax(), "year_month"]

print(f"Publications by month: {n_months} months")
print("\nFirst 10 months:")
display(pub_by_month.head(10))

print("\nLast 10 months:")
display(pub_by_month.tail(10))

print("\nSummary statistics:")
print(f"  Total months covered: {n_months}")
print(f"  Average papers per month: {monthly_counts.mean():.1f}")
print(f"  Median papers per month: {monthly_counts.median():.1f}")
print(f"  Max papers in a month: {monthly_counts.max()}")
print(f"  Month with most papers: {peak_month}")


Publications by month: 239 months

First 10 months:


Unnamed: 0,year_month,paper_count
0,2003-04,1
1,2003-12,1
2,2004-08,1
3,2005-01,2
4,2005-02,1
5,2005-05,3
6,2005-08,2
7,2005-10,1
8,2005-11,1
9,2005-12,1



Last 10 months:


Unnamed: 0,year_month,paper_count
229,2025-05,75
230,2025-06,4
231,2025-07,1
232,2025-08,2
233,2025-09,1
234,2025-10,1
235,2025-11,4
236,2025-12,1
237,2026-01,4
238,2026-03,3



Summary statistics:
  Total months covered: 239
  Average papers per month: 65.4
  Median papers per month: 13.0
  Max papers in a month: 582
  Month with most papers: 2024-03


In [14]:
# ---- Filter publications after 2025-05 ----
cutoff_date = pd.Timestamp("2025-05-31")

# Pair each dated record with its parsed timestamp. Dates are parsed one
# record at a time, so an unparsable value only affects its own record.
dated_papers = (
    (paper, pd.to_datetime(paper["pub_date"], errors="coerce"))
    for paper in mr_data
    if paper.get("pub_date")
)

# Keep the records strictly later than the cutoff.
recent_papers = [
    {
        "pub_date": paper["pub_date"],
        "pmid": paper.get("pmid", "N/A"),
        "title": paper.get("title", "N/A"),
    }
    for paper, parsed_date in dated_papers
    if pd.notna(parsed_date) and parsed_date > cutoff_date
]

# Create DataFrame and sort by date (skipped when nothing matched, because an
# empty frame has no pub_date column to sort on)
recent_df = pd.DataFrame(recent_papers)
if len(recent_df):
    recent_df = recent_df.sort_values("pub_date")

print(f"Found {len(recent_df)} publications after 2025-05")
print("\nPublications with pub_date after 2025-05:")
print("=" * 100)

for record in recent_df.to_dict("records"):
    print(f"\nDate: {record['pub_date']}")
    print(f"PMID: {record['pmid']}")
    print(f"Title: {record['title']}")
    print("-" * 100)


Found 21 publications after 2025-05

Publications with pub_date after 2025-05:

Date: 2025-06-06
PMID: 40188416
Title: Application of Human Genetics to Prioritize Coagulation Cascade Protein Targets for Ischemic Stroke Prevention.
----------------------------------------------------------------------------------------------------

Date: 2025-06-07
PMID: 38848523
Title: Genetically Informed Study Highlights Income-Independent Effect of Schizophrenia Liability on Mental and Physical Health.
----------------------------------------------------------------------------------------------------

Date: 2025-06-12
PMID: 38865108
Title: Plasma Proteomics of Exercise Blood Pressure and Incident Hypertension.
----------------------------------------------------------------------------------------------------

Date: 2025-06-13
PMID: 38869147
Title: Unraveling NEK4 as a Potential Drug Target in Schizophrenia and Bipolar I Disorder: A Proteomic and Genomic Approach.
----------------------------------

In [15]:
# ---- Filter publications from 2003 ----
start_date = pd.Timestamp("2003-01-01")
end_date = pd.Timestamp("2003-12-31")

papers_2003 = []
for paper in mr_data:
    raw_date = paper.get("pub_date")
    if not raw_date:
        # Missing or empty pub_date: nothing to compare.
        continue

    parsed_date = pd.to_datetime(raw_date, errors="coerce")
    if pd.isna(parsed_date):
        # Unparsable date string.
        continue
    if not (start_date <= parsed_date <= end_date):
        continue

    papers_2003.append(
        {
            "pub_date": raw_date,
            "pmid": paper.get("pmid", "N/A"),
            "title": paper.get("title", "N/A"),
        }
    )

# Create DataFrame and sort by date (skipped when nothing matched, because an
# empty frame has no pub_date column to sort on)
papers_2003_df = pd.DataFrame(papers_2003)
if len(papers_2003_df):
    papers_2003_df = papers_2003_df.sort_values("pub_date")

print(f"Found {len(papers_2003_df)} publications from 2003")
print("\nPublications with pub_date in 2003:")
print("=" * 100)

for record in papers_2003_df.to_dict("records"):
    print(f"\nDate: {record['pub_date']}")
    print(f"PMID: {record['pmid']}")
    print(f"Title: {record['title']}")
    print("-" * 100)


Found 2 publications from 2003

Publications with pub_date in 2003:

Date: 2003-04-12
PMID: 12689998
Title: 'Mendelian randomization': can genetic epidemiology contribute to understanding environmental determinants of disease?
----------------------------------------------------------------------------------------------------

Date: 2003-12-06
PMID: 14656017
Title: Hyperhomocysteinaemia as a risk factor for venous thrombosis: an update of the current evidence.
----------------------------------------------------------------------------------------------------
