# MR Literature Corpus Analysis

This notebook visualizes the temporal characteristics of the Mendelian
Randomization (MR) literature dataset used in MR-KG.

**Data sources:**
- PubMed metadata from `vector_store.db` (mr_pubmed_data table)
- Temporal statistics from the summary statistics pipeline (`temporal-statistics.csv`)

**Coverage:** ~15,635 papers from the MR literature

## Setup

In [None]:
from pathlib import Path

import altair as alt
import duckdb
import pandas as pd
from yiutils.project_utils import find_project_root

# ---- Project paths ----
# Every data location is derived from the root returned by find_project_root
# (presumably the directory containing docker-compose.yml -- confirm in yiutils).
PROJECT_ROOT = find_project_root("docker-compose.yml")
DATA_DIR = PROJECT_ROOT / "data"
DB_DIR, PROCESSED_DIR = DATA_DIR / "db", DATA_DIR / "processed"
OVERALL_STATS_DIR = PROCESSED_DIR / "overall-stats"

# ---- Altair configuration ----
# max_rows=None lifts the row limit of the default data transformer.
alt.data_transformers.enable("default", max_rows=None)
alt.themes.enable("default")

# Echo the resolved locations so a misdetected root is obvious at a glance.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Data directory: {DATA_DIR}",
    f"Database directory: {DB_DIR}",
    sep="\n",
)

## Data Loading

In [None]:
# ---- Load temporal statistics from summary stats ----
temporal_stats_file = OVERALL_STATS_DIR / "temporal-statistics.csv"

# Read the CSV when present; otherwise fail early and name the command that
# generates it.
if temporal_stats_file.exists():
    temporal_stats = pd.read_csv(temporal_stats_file)
else:
    raise FileNotFoundError(
        f"Temporal statistics file not found: {temporal_stats_file}\n"
        "Run 'just generate-all-summary-stats' in the processing directory first."
    )

# The row count is reported as "years" (presumably one row per year -- confirm
# against the summary-stats pipeline output).
print(f"Loaded temporal statistics: {len(temporal_stats)} years")

# Preview the top of the table, then its per-column descriptive statistics.
print("\nFirst few rows:")
display(temporal_stats.head())
print("\nSummary statistics:")
display(temporal_stats.describe())

In [None]:
# ---- Query vector store database directly ----
vector_db_path = DB_DIR / "vector_store.db"

if not vector_db_path.exists():
    raise FileNotFoundError(
        f"Vector store database not found: {vector_db_path}\n"
        "Ensure the database has been created via the processing pipeline."
    )

# Count papers per publication year, taking the year from the first four
# characters of pub_date and keeping only years in the 1990-2025 window.
# TRY_CAST yields NULL for a non-numeric prefix instead of raising, so a single
# malformed pub_date is dropped by the BETWEEN filter rather than aborting the
# whole query (the LENGTH check is not guaranteed to be evaluated first).
query = """
SELECT
    TRY_CAST(SUBSTR(pub_date, 1, 4) AS INTEGER) AS year,
    COUNT(*) AS paper_count
FROM mr_pubmed_data
WHERE pub_date IS NOT NULL
    AND LENGTH(pub_date) >= 4
    AND TRY_CAST(SUBSTR(pub_date, 1, 4) AS INTEGER) BETWEEN 1990 AND 2025
GROUP BY year
ORDER BY year
"""

# The context manager closes the read-only connection even when the query
# raises, so a failed cell run does not leave the database file held open.
with duckdb.connect(str(vector_db_path), read_only=True) as conn:
    pub_year_data = conn.execute(query).df()

print(f"Loaded publication year data: {len(pub_year_data)} years")
print("\nFirst few rows:")
display(pub_year_data.head())

print("\nLast few rows:")
display(pub_year_data.tail())

## Temporal Distribution - Bar Chart

In [None]:
# Bar chart of papers per publication year.
#
# `year` is an integer column (e.g. 2020). Encoding it as temporal (`:T`) makes
# Vega-Lite read the number as a millisecond timestamp, which places every bar
# in 1970. An ordinal encoding gives one discrete, correctly labelled band per
# year; years with no rows in the data simply have no band.
bar_chart = (
    alt.Chart(pub_year_data)
    .mark_bar()
    .encode(
        x=alt.X("year:O", title="Publication Year"),
        y=alt.Y("paper_count:Q", title="Number of Papers"),
        color=alt.Color(
            "paper_count:Q",
            scale=alt.Scale(scheme="viridis"),
            legend=None,
        ),
        tooltip=[
            alt.Tooltip("year:O", title="Year"),
            alt.Tooltip("paper_count:Q", title="Papers"),
        ],
    )
    .properties(
        width=700,
        height=400,
        title="MR Papers by Publication Year",
    )
)

bar_chart

## Temporal Distribution - Cumulative Growth

In [None]:
# ---- Calculate cumulative count ----
# assign() returns a new frame, so pub_year_data itself is left unmodified.
# The running total follows the row order of pub_year_data.
pub_year_data_cumulative = pub_year_data.assign(
    cumulative_count=pub_year_data["paper_count"].cumsum()
)

print("Cumulative statistics:")
display(pub_year_data_cumulative.tail(10))

In [None]:
# ---- Create cumulative line chart ----
# `year` is an integer column. Encoding it as temporal (`:T`) makes Vega-Lite
# read the number as a millisecond timestamp, collapsing every point onto 1970.
# A quantitative axis keeps the spacing between years proportional; format="d"
# prints plain integers (no thousands separator), tickMinStep=1 avoids
# fractional-year ticks, and zero=False stops the axis from starting at year 0.
cumulative_chart = (
    alt.Chart(pub_year_data_cumulative)
    .mark_line(point=True, strokeWidth=2)
    .encode(
        x=alt.X(
            "year:Q",
            title="Publication Year",
            axis=alt.Axis(format="d", tickMinStep=1),
            scale=alt.Scale(zero=False),
        ),
        y=alt.Y("cumulative_count:Q", title="Cumulative Paper Count"),
        tooltip=[
            alt.Tooltip("year:Q", title="Year", format="d"),
            alt.Tooltip("cumulative_count:Q", title="Cumulative Papers"),
        ],
    )
    .properties(
        width=700,
        height=400,
        title="Cumulative Growth of MR Literature",
    )
)

cumulative_chart

## Summary Statistics

In [None]:
# ---- Calculate summary statistics ----
# With no rows, min()/max() return NaN and the percentage below becomes 0/0,
# which would render a table of meaningless values; stop with a clear error.
if pub_year_data.empty:
    raise ValueError(
        "Publication year data is empty; cannot compute summary statistics."
    )

# Convert numpy scalars to plain ints so arithmetic and formatting are uniform.
total_papers = int(pub_year_data["paper_count"].sum())
earliest_year = int(pub_year_data["year"].min())
latest_year = int(pub_year_data["year"].max())

# Inclusive calendar span; years without any papers still count toward it.
year_span = latest_year - earliest_year + 1
avg_papers_per_year = total_papers / year_span

# Papers in last 5 years vs earlier (the window includes latest_year itself).
recent_cutoff = latest_year - 4
is_recent = pub_year_data["year"] >= recent_cutoff
recent_papers = int(pub_year_data.loc[is_recent, "paper_count"].sum())
earlier_papers = int(
    pub_year_data.loc[pub_year_data["year"] < recent_cutoff, "paper_count"].sum()
)

# Guard the ratio in case every row carries a zero count.
recent_share = recent_papers / total_papers * 100 if total_papers else 0.0

# ---- Display formatted summary ----
summary_data = {
    "Metric": [
        "Total Papers",
        "Year Range",
        "Year Span",
        "Average Papers per Year",
        f"Papers in Last 5 Years ({recent_cutoff}-{latest_year})",
        f"Papers Before {recent_cutoff}",
        "Percentage in Last 5 Years",
    ],
    "Value": [
        f"{total_papers:,}",
        f"{earliest_year} - {latest_year}",
        f"{year_span} years",
        f"{avg_papers_per_year:.1f}",
        f"{recent_papers:,}",
        f"{earlier_papers:,}",
        f"{recent_share:.1f}%",
    ],
}

summary_df = pd.DataFrame(summary_data)
display(summary_df)

## Export Visualizations (Optional)

Uncomment and run the cells below to save plots for manuscript use.

In [None]:
# # ---- Save bar chart ----
# output_dir = PROCESSED_DIR / "figures" / "literature"
# output_dir.mkdir(parents=True, exist_ok=True)

# bar_chart.save(str(output_dir / "temporal_distribution_bar.json"))
# print(f"Saved bar chart to {output_dir / 'temporal_distribution_bar.json'}")

# # Note: PNG export requires an extra dependency (vl-convert-python on Altair 5+; altair_saver on Altair 4)
# # bar_chart.save(str(output_dir / "temporal_distribution_bar.png"), scale_factor=2.0)

In [None]:
# # ---- Save cumulative chart ----
# cumulative_chart.save(str(output_dir / "temporal_distribution_cumulative.json"))
# print(f"Saved cumulative chart to {output_dir / 'temporal_distribution_cumulative.json'}")