# In [None]:
"""
Pipeline for biomedical research analysis using NIH RePORTER / ExPORTER data.

Goals:
- Load NIH project (grant) data from ExPORTER/RePORTER CSVs.
- Filter to a biomedical topic (e.g., asthma, HIV, cancer).
- Build summary tables: funding trends, IC distribution, mechanisms.
- (Optional) Link to publications for bibliometrics / NLP.
- Export processed datasets ready for further analysis or dashboards.

Usage (examples):
    python pipeline_nih_biomedical.py --fy-start 2019 --fy-end 2024 --keyword asthma
    python pipeline_nih_biomedical.py --fy-start 2015 --fy-end 2020 --rcdc "Asthma"

Assumptions:
- You have downloaded NIH ExPORTER / RePORTER project CSVs into data/raw.
- Filenames look like: RePORTER_PRJ_C_FY2019.csv, etc.
"""

from __future__ import annotations

import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import pandas as pd


# ---------------------------------------------------------------------
# Minimal config (works even if you don't have a separate config.py)
# ---------------------------------------------------------------------
try:
    # Prefer project-wide settings when a config.py is importable.
    from config import RAW_DATA_DIR, PROCESSED_DATA_DIR  # type: ignore
except ImportError:
    # No usable config.py: keep the data folders next to this file and make
    # sure both exist so later reads/writes do not fail on a missing folder.
    BASE_DIR = Path(__file__).resolve().parent
    RAW_DATA_DIR = BASE_DIR.joinpath("data", "raw")
    PROCESSED_DATA_DIR = BASE_DIR.joinpath("data", "processed")
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)


# ---------------------------------------------------------------------
# Config dataclass
# ---------------------------------------------------------------------
@dataclass
class PipelineConfig:
    """
    Settings for a single pipeline run.

    Only the fiscal-year range is required. The remaining fields are optional
    filters, column names to look up in the loaded CSVs, and naming settings
    for input and output files.
    """

    # First and last fiscal year to process; both ends are inclusive.
    fy_start: int
    fy_end: int
    # Case-insensitive text searched for in the title/abstract columns.
    # None (or empty) disables the keyword filter.
    keyword: Optional[str] = None
    # Case-insensitive text searched for in `rcdc_col`.
    # None (or empty) disables the RCDC/term filter.
    rcdc_term: Optional[str] = None
    # Input files are matched as "<exporter_prefix><FY>*.csv" in RAW_DATA_DIR.
    exporter_prefix: str = "RePORTER_PRJ_C_FY"  # adjust if names differ
    # Name of the project identifier column; not read by the functions
    # visible in this file.
    project_id_col: str = "PROJECT_NUMBER"
    # Columns searched by the keyword filter; a missing column is skipped.
    abstract_col: str = "ABSTRACT_TEXT"
    title_col: str = "PROJECT_TITLE"
    # Column searched by the RCDC/term filter.
    rcdc_col: str = "PROJECT_TERMS"  # or "RCDC" depending on file
    # Leading part of every output filename written to PROCESSED_DATA_DIR.
    out_prefix: str = "nih_biomed"


# ---------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------
def find_exporter_files(cfg: PipelineConfig) -> List[Path]:
    """
    Locate NIH RePORTER/ExPORTER project CSVs for the FY range.

    For every fiscal year from ``cfg.fy_start`` to ``cfg.fy_end`` (inclusive),
    looks in RAW_DATA_DIR for files named ``<cfg.exporter_prefix><FY>*.csv``,
    e.g. 'RePORTER_PRJ_C_FY2019.csv'. A year without any match only produces
    a warning; it does not abort the search.

    Returns:
        Matching paths ordered by fiscal year, then by path within a year.
        The list is empty when nothing matched or when fy_start > fy_end.
    """
    files: List[Path] = []
    for fy in range(cfg.fy_start, cfg.fy_end + 1):
        pattern = f"{cfg.exporter_prefix}{fy}"
        # glob() yields entries in whatever order the filesystem returns
        # them; sorting keeps the concatenated row order reproducible.
        matches = sorted(RAW_DATA_DIR.glob(f"{pattern}*.csv"))
        if not matches:
            print(f"[WARN] No file found for FY {fy} matching {pattern}*.csv")
        files.extend(matches)
    return files


def load_nih_projects(cfg: PipelineConfig) -> pd.DataFrame:
    """
    Load and concatenate NIH project-level CSVs for the requested FY range.

    Every column is read as text (``dtype=str``); numeric parsing is left to
    the later filter/summary steps. A file that is not valid UTF-8 is re-read
    as Latin-1 (with a warning) instead of aborting the whole run.

    Returns:
        One DataFrame holding the rows of all files, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: no project file was found for any year in the range.
    """
    files = find_exporter_files(cfg)
    if not files:
        raise FileNotFoundError(
            f"No NIH project files found in {RAW_DATA_DIR} for FY {cfg.fy_start}-{cfg.fy_end}.\n"
            "Download RePORTER/ExPORTER project files from NIH and place them in data/raw."
        )

    frames = []
    for path in files:
        print(f"[INFO] Reading {path.name}")
        try:
            frame = pd.read_csv(path, dtype=str, low_memory=False)
        except UnicodeDecodeError:
            # Latin-1 assigns a character to every byte value, so the retry
            # cannot fail on decoding; files that were already valid UTF-8
            # never reach this branch and are read exactly as before.
            print(f"[WARN] {path.name} is not valid UTF-8; retrying with latin-1.")
            frame = pd.read_csv(path, dtype=str, low_memory=False, encoding="latin-1")
        frames.append(frame)

    projects = pd.concat(frames, ignore_index=True)
    print(f"[INFO] Loaded {len(projects):,} project rows from {len(files)} file(s).")
    return projects


# ---------------------------------------------------------------------
# Filtering & text processing
# ---------------------------------------------------------------------
def filter_by_fy(projects: pd.DataFrame, cfg: PipelineConfig) -> pd.DataFrame:
    """
    Filter projects by fiscal year (if present in the data).
    ExPORTER often uses 'FY' or 'FISCAL_YEAR' column.

    The first column whose upper-cased name is FY or FISCAL_YEAR is parsed as
    a number; rows whose value is unparsable or outside
    ``cfg.fy_start``..``cfg.fy_end`` (inclusive) are dropped.

    Returns:
        ``projects`` itself when no FY column exists; otherwise a filtered
        copy whose FY column is numeric. The input frame is never modified.
    """
    fy_cols = [c for c in projects.columns if c.upper() in {"FY", "FISCAL_YEAR"}]
    if not fy_cols:
        print("[WARN] No FY/FISCAL_YEAR column found; skipping FY filter.")
        return projects

    fy_col = fy_cols[0]
    # Parse into a separate Series so the caller's frame keeps its raw values.
    fy_numeric = pd.to_numeric(projects[fy_col], errors="coerce")
    # Comparisons against NaN are False, so unparsable years drop out here.
    mask = (fy_numeric >= cfg.fy_start) & (fy_numeric <= cfg.fy_end)
    filtered = projects.loc[mask].copy()

    kept_years = fy_numeric.loc[mask]
    # One unparsable value anywhere in the input makes the parsed Series
    # float; the surviving rows have no NaN, so restore integers when exact
    # (otherwise exports would show "2019.0").
    if (kept_years % 1 == 0).all():
        kept_years = kept_years.astype("int64")
    filtered[fy_col] = kept_years

    print(f"[INFO] Filtered to FY {cfg.fy_start}-{cfg.fy_end}: {len(filtered):,} rows remain.")
    return filtered


def filter_by_keyword(projects: pd.DataFrame, cfg: PipelineConfig) -> pd.DataFrame:
    """
    Filter based on a keyword in project title or abstract (case-insensitive).

    The keyword is matched as a literal substring, not as a regular
    expression, so characters such as ``( ) . + ?`` in it match themselves.
    A row is kept when the keyword occurs in ``cfg.title_col`` or
    ``cfg.abstract_col``; a column missing from the data is skipped.

    Returns:
        ``projects`` itself when no keyword is set or neither column exists;
        otherwise a filtered copy.
    """
    if not cfg.keyword:
        return projects

    kw = cfg.keyword.lower()
    search_cols = [
        col for col in (cfg.title_col, cfg.abstract_col)
        if col and col in projects.columns
    ]
    if not search_cols:
        print("[WARN] No title/abstract columns found; skipping keyword filter.")
        return projects

    mask = pd.Series(False, index=projects.index)
    for col in search_cols:
        lowered = projects[col].fillna("").str.lower()
        # regex=False: a keyword like "(NSCLC)" must match the parentheses
        # literally instead of being compiled as a regex group (or raising).
        mask |= lowered.str.contains(kw, na=False, regex=False)

    filtered = projects.loc[mask].copy()
    print(f"[INFO] Filtered by keyword '{cfg.keyword}': {len(filtered):,} rows remain.")
    return filtered


def filter_by_rcdc(projects: pd.DataFrame, cfg: PipelineConfig) -> pd.DataFrame:
    """
    Filter based on RCDC or term column (depending on how your ExPORTER is structured).

    ``cfg.rcdc_term`` is matched case-insensitively as a literal substring of
    ``cfg.rcdc_col`` (not as a regular expression), so terms containing
    parentheses or other punctuation match exactly as written.

    Returns:
        ``projects`` itself when no term is set or the column is missing;
        otherwise a filtered copy.
    """
    if not cfg.rcdc_term:
        return projects

    if cfg.rcdc_col not in projects.columns:
        print(f"[WARN] RCDC/term column '{cfg.rcdc_col}' not found; skipping RCDC filter.")
        return projects

    term = cfg.rcdc_term.lower()
    lowered = projects[cfg.rcdc_col].fillna("").str.lower()
    # regex=False: a term like "(ME/CFS)" would otherwise be compiled as a
    # regex group and match text that lacks the parentheses.
    mask = lowered.str.contains(term, na=False, regex=False)
    filtered = projects.loc[mask].copy()
    print(f"[INFO] Filtered by RCDC term '{cfg.rcdc_term}': {len(filtered):,} rows remain.")
    return filtered


# ---------------------------------------------------------------------
# Summary tables
# ---------------------------------------------------------------------
def summarize_by_fy(projects: pd.DataFrame) -> pd.DataFrame:
    """
    Funding trends by fiscal year.
    Assumes a 'TOTAL_COST' or similar; adjust as needed for your ExPORTER schema.

    The fiscal-year column is the first one whose upper-cased name is FY or
    FISCAL_YEAR. The funding column is the first one whose upper-cased name
    contains TOTAL_COST (a substring match, so longer names also qualify).

    Returns:
        One row per fiscal year, sorted by year, with a ``total_cost`` column
        when a funding column exists and a ``project_count`` column (number
        of rows) otherwise. An empty frame when there is no FY column.
        The input frame is not modified.
    """
    fy_cols = [c for c in projects.columns if c.upper() in {"FY", "FISCAL_YEAR"}]
    if not fy_cols:
        print("[WARN] No FY column for summary; returning empty frame.")
        return pd.DataFrame()

    fy_col = fy_cols[0]
    # Work on parsed Series rather than copying the whole frame.
    fiscal_year = pd.to_numeric(projects[fy_col], errors="coerce")

    funding_cols = [c for c in projects.columns if "TOTAL_COST" in c.upper()]
    if funding_cols:
        cost = pd.to_numeric(projects[funding_cols[0]], errors="coerce")
        agg = cost.groupby(fiscal_year).sum().reset_index(name="total_cost")
    else:
        # size() counts rows per year. Counting non-null values of the first
        # column under-counted when that column had gaps and raised when the
        # first column was the FY column itself.
        agg = fiscal_year.groupby(fiscal_year).size().reset_index(name="project_count")

    return agg.sort_values(by=fy_col)


def summarize_by_ic(projects: pd.DataFrame) -> pd.DataFrame:
    """
    Summary by NIH Institute/Center.
    Assumes an 'IC_NAME' or 'IC' column.

    The grouping column is the first one whose upper-cased name is IC_NAME,
    IC or ADMIN_IC. The funding column is the first one whose upper-cased
    name contains TOTAL_COST (a substring match).

    Returns:
        One row per Institute/Center, largest first, with a ``total_cost``
        column when a funding column exists and a ``project_count`` column
        (number of rows) otherwise. An empty frame when there is no IC
        column. The input frame is not modified.
    """
    ic_cols = [c for c in projects.columns if c.upper() in {"IC_NAME", "IC", "ADMIN_IC"}]
    if not ic_cols:
        print("[WARN] No IC column found for summary; returning empty frame.")
        return pd.DataFrame()

    ic_col = ic_cols[0]
    funding_cols = [c for c in projects.columns if "TOTAL_COST" in c.upper()]
    if funding_cols:
        cost = pd.to_numeric(projects[funding_cols[0]], errors="coerce")
        agg = cost.groupby(projects[ic_col]).sum().reset_index(name="total_cost")
        sort_key = "total_cost"
    else:
        # size() counts rows per group. Counting non-null values of the first
        # column under-counted when that column had gaps and raised when the
        # first column was the IC column itself.
        agg = projects.groupby(ic_col).size().reset_index(name="project_count")
        sort_key = "project_count"

    return agg.sort_values(by=sort_key, ascending=False)


def summarize_by_mechanism(projects: pd.DataFrame) -> pd.DataFrame:
    """
    Summary by activity code / mechanism (e.g., R01, U01, K23).

    The grouping column is the first one whose upper-cased name is ACTIVITY
    or ACTIVITY_CODE. The funding column is the first one whose upper-cased
    name contains TOTAL_COST (a substring match).

    Returns:
        One row per activity code, largest first, with a ``total_cost``
        column when a funding column exists and a ``project_count`` column
        (number of rows) otherwise. An empty frame when there is no activity
        column. The input frame is not modified.
    """
    act_cols = [c for c in projects.columns if c.upper() in {"ACTIVITY", "ACTIVITY_CODE"}]
    if not act_cols:
        print("[WARN] No activity code column found; returning empty frame.")
        return pd.DataFrame()

    act_col = act_cols[0]
    funding_cols = [c for c in projects.columns if "TOTAL_COST" in c.upper()]
    if funding_cols:
        cost = pd.to_numeric(projects[funding_cols[0]], errors="coerce")
        agg = cost.groupby(projects[act_col]).sum().reset_index(name="total_cost")
        sort_key = "total_cost"
    else:
        # size() counts rows per group. Counting non-null values of the first
        # column under-counted when that column had gaps and raised when the
        # first column was the activity column itself.
        agg = projects.groupby(act_col).size().reset_index(name="project_count")
        sort_key = "project_count"

    return agg.sort_values(by=sort_key, ascending=False)


# ---------------------------------------------------------------------
# Optional: publications / text corpus (placeholder)
# ---------------------------------------------------------------------
def load_project_publication_links() -> pd.DataFrame:
    """
    Optional: load NIH ExPORTER project-publication link file if available, e.g.:
        RePORTER_PUB_C_FY2019.csv
    Place it in data/raw and adjust filename as needed.

    All files in RAW_DATA_DIR matching ``RePORTER_PUB_C_FY*.csv`` are read as
    text and concatenated in sorted path order. A file that is not valid
    UTF-8 is re-read as Latin-1 (with a warning) instead of aborting.

    Returns a DataFrame linking project numbers to PMIDs, or an empty
    DataFrame when no link file is present.
    """
    # NOTE(review): confirm this pattern against the names of the link files
    # you actually downloaded; adjust the filename to whatever you have locally.
    # Sorted because glob() order depends on the filesystem.
    link_files = sorted(RAW_DATA_DIR.glob("RePORTER_PUB_C_FY*.csv"))
    if not link_files:
        print("[INFO] No publication link files found; skipping publication step.")
        return pd.DataFrame()

    frames = []
    for path in link_files:
        print(f"[INFO] Reading pub link file {path.name}")
        try:
            frame = pd.read_csv(path, dtype=str, low_memory=False)
        except UnicodeDecodeError:
            # Latin-1 assigns a character to every byte value, so the retry
            # cannot fail on decoding; valid UTF-8 files never get here.
            print(f"[WARN] {path.name} is not valid UTF-8; retrying with latin-1.")
            frame = pd.read_csv(path, dtype=str, low_memory=False, encoding="latin-1")
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


# ---------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------
def run_pipeline(cfg: PipelineConfig) -> None:
    """
    Run the whole pipeline for one configuration.

    Steps, in order:

    1. Load the NIH project CSVs for the FY range.
    2. Narrow the rows by FY, then keyword, then RCDC term.
    3. Build the FY / IC / mechanism summaries and write them, together with
       the filtered projects, as CSVs into PROCESSED_DATA_DIR.
    4. (Optional) Load project-publication links for bibliometrics.

    Nothing is written when the filters leave no rows.
    """
    print(f"[INFO] Running NIH biomedical pipeline for FY {cfg.fy_start}-{cfg.fy_end}")

    # Each filter hands its result to the next one.
    selected = load_nih_projects(cfg)
    for narrow in (filter_by_fy, filter_by_keyword, filter_by_rcdc):
        selected = narrow(selected, cfg)

    if selected.empty:
        print("[WARN] No projects after filters; exiting.")
        return

    # Core summaries
    by_year = summarize_by_fy(selected)
    by_ic = summarize_by_ic(selected)
    by_mechanism = summarize_by_mechanism(selected)

    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Output locations; only the projects file carries the FY range in its name.
    projects_path = PROCESSED_DATA_DIR / f"{cfg.out_prefix}_projects_fy{cfg.fy_start}_{cfg.fy_end}.csv"
    fy_path = PROCESSED_DATA_DIR / f"{cfg.out_prefix}_fy_summary.csv"
    ic_path = PROCESSED_DATA_DIR / f"{cfg.out_prefix}_ic_summary.csv"
    mech_path = PROCESSED_DATA_DIR / f"{cfg.out_prefix}_mechanism_summary.csv"

    # Write everything first, then report, so a failed write prints nothing.
    exports = (
        (selected, projects_path),
        (by_year, fy_path),
        (by_ic, ic_path),
        (by_mechanism, mech_path),
    )
    for frame, target in exports:
        frame.to_csv(target, index=False)

    for message in (
        f"[INFO] Exported filtered projects to: {projects_path}",
        f"[INFO] Exported FY summary to:     {fy_path}",
        f"[INFO] Exported IC summary to:     {ic_path}",
        f"[INFO] Exported mechanism summary to: {mech_path}",
    ):
        print(message)

    # Optional: project-publication links
    links = load_project_publication_links()
    if links.empty:
        return
    # TODO: join links to projects, build corpus, run topic modeling, etc.
    # Example join key may be "PROJECT_NUMBER" or "APPLICATION_ID"
    print(f"[INFO] Loaded {len(links):,} project-publication links (implement join in a notebook).")


def parse_args() -> PipelineConfig:
    """
    Read the command-line flags and turn them into a PipelineConfig.

    ``--fy-start`` and ``--fy-end`` are mandatory integers; ``--keyword``,
    ``--rcdc`` and ``--out-prefix`` are optional strings. Config fields
    without a matching flag keep their dataclass defaults.
    """
    cli = argparse.ArgumentParser(description="NIH Biomedical Data Science Pipeline")

    # Declared as data so the flag list reads as one table; the order here is
    # the order shown in --help.
    option_table = (
        ("--fy-start", {"type": int, "required": True, "help": "Start fiscal year (e.g., 2019)"}),
        ("--fy-end", {"type": int, "required": True, "help": "End fiscal year (e.g., 2024)"}),
        ("--keyword", {"type": str, "default": None, "help": "Text keyword for title/abstract filter"}),
        ("--rcdc", {"type": str, "default": None, "help": "RCDC term or project term substring filter"}),
        ("--out-prefix", {"type": str, "default": "nih_biomed", "help": "Prefix for output files"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    ns = cli.parse_args()
    return PipelineConfig(
        fy_start=ns.fy_start,
        fy_end=ns.fy_end,
        keyword=ns.keyword,
        rcdc_term=ns.rcdc,
        out_prefix=ns.out_prefix,
    )


if __name__ == "__main__":
    # Script entry point: build the config from the command-line flags, then
    # run the pipeline once with it.
    cfg_ = parse_args()
    run_pipeline(cfg_)