## Setup

In [10]:
from pathlib import Path
import altair as alt
import pandas as pd
from yiutils.project_utils import find_project_root

PROJECT_ROOT = find_project_root("docker-compose.yml")
DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
TRAIT_ANALYSIS_DIR = PROCESSED_DIR / "trait-profiles" / "analysis"
EVIDENCE_ANALYSIS_DIR = PROCESSED_DIR / "evidence-profiles" / "analysis"

MODEL_FOCUS = "gpt-5"  # primary model of interest

alt.data_transformers.enable("default", max_rows=None)
alt.themes.enable("default")

print("Project root:", PROJECT_ROOT)
print("Trait analysis dir:", TRAIT_ANALYSIS_DIR)
print("Evidence analysis dir:", EVIDENCE_ANALYSIS_DIR)
print("Model focus:", MODEL_FOCUS)


Project root: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg
Trait analysis dir: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg/data/processed/trait-profiles/analysis
Evidence analysis dir: /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/mr-kg/data/processed/evidence-profiles/analysis
Model focus: gpt-5


## Data Loading Utilities

In [11]:
def load_csv(path: Path, desc: str) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(
            f"{desc} not found: {path}\nRun generation steps first."
        )
    return pd.read_csv(path)


# Primary inputs (mirrors earlier notebook naming but we only use needed columns)
trait_similarity_dist = load_csv(
    TRAIT_ANALYSIS_DIR / "similarity-distributions.csv",
    "Trait similarity distributions",
)
trait_count_dist = load_csv(
    TRAIT_ANALYSIS_DIR / "trait-count-distributions.csv",
    "Trait count distributions",
)
trait_metric_corr = load_csv(
    TRAIT_ANALYSIS_DIR / "metric-correlations.csv", "Trait metric correlations"
)
evidence_similarity_dist = load_csv(
    EVIDENCE_ANALYSIS_DIR / "similarity-distributions.csv",
    "Evidence similarity distributions",
)
evidence_summary = load_csv(
    EVIDENCE_ANALYSIS_DIR / "summary-stats-by-model.csv",
    "Evidence summary stats",
)

print("Loaded data sets:")
for name, df in [
    ("trait_similarity_dist", trait_similarity_dist),
    ("trait_count_dist", trait_count_dist),
    ("trait_metric_corr", trait_metric_corr),
    ("evidence_similarity_dist", evidence_similarity_dist),
    ("evidence_summary", evidence_summary),
]:
    print(f"  {name}: {len(df)} rows, columns={list(df.columns)[:8]}...")


Loaded data sets:
  trait_similarity_dist: 6 rows, columns=['model', 'n_pairs', 'mean_semantic_similarity', 'median_semantic_similarity', 'sd_semantic_similarity', 'p25_semantic_similarity', 'p50_semantic_similarity', 'p75_semantic_similarity']...
  trait_count_dist: 6 rows, columns=['model', 'n_studies', 'mean_trait_count', 'median_trait_count', 'sd_trait_count', 'min_trait_count', 'max_trait_count', 'p25_trait_count']...
  trait_metric_corr: 6 rows, columns=['model', 'n_pairs', 'corr_semantic_jaccard', 'corr_semantic_query_count', 'corr_semantic_similar_count', 'corr_jaccard_query_count', 'corr_jaccard_similar_count']...
  evidence_similarity_dist: 6 rows, columns=['model', 'n_pairs', 'mean_direction_concordance', 'median_direction_concordance', 'sd_direction_concordance', 'p25_direction_concordance', 'p75_direction_concordance', 'p95_direction_concordance']...
  evidence_summary: 6 rows, columns=['model', 'total_combinations', 'avg_result_count', 'avg_completeness', 'min_result_coun

## Filter to Focus Model

In [12]:
trait_sim_focus = (
    trait_similarity_dist[trait_similarity_dist["model"] == MODEL_FOCUS].copy()
    if "model" in trait_similarity_dist.columns
    else pd.DataFrame()
)
trait_count_focus = (
    trait_count_dist[trait_count_dist["model"] == MODEL_FOCUS].copy()
    if "model" in trait_count_dist.columns
    else pd.DataFrame()
)
trait_corr_focus = (
    trait_metric_corr[trait_metric_corr["model"] == MODEL_FOCUS].copy()
    if "model" in trait_metric_corr.columns
    else pd.DataFrame()
)
evidence_sim_focus = (
    evidence_similarity_dist[
        evidence_similarity_dist["model"] == MODEL_FOCUS
    ].copy()
    if "model" in evidence_similarity_dist.columns
    else pd.DataFrame()
)
evidence_summary_focus = (
    evidence_summary[evidence_summary["model"] == MODEL_FOCUS].copy()
    if "model" in evidence_summary.columns
    else pd.DataFrame()
)

print("Focus subsets sizes:")
print("  trait_sim_focus:", len(trait_sim_focus))
print("  trait_count_focus:", len(trait_count_focus))
print("  trait_corr_focus:", len(trait_corr_focus))
print("  evidence_sim_focus:", len(evidence_sim_focus))
print("  evidence_summary_focus:", len(evidence_summary_focus))


Focus subsets sizes:
  trait_sim_focus: 1
  trait_count_focus: 1
  trait_corr_focus: 1
  evidence_sim_focus: 1
  evidence_summary_focus: 1


---
# Semantic Similarity (Trait Profiles)

### Plot 1: Distribution of Semantic Similarity Scores (gpt-5)
If per-study values not available (only per-row aggregate), density may be trivial.

In [20]:
# The similarity-distributions.csv contains ONE ROW PER MODEL with aggregated statistics
# across all study pairs, not per-study distributions.
# For gpt-5: 156,060 study pairs summarized into mean, median, sd, and percentiles.

print("‚ö†Ô∏è  DATA STRUCTURE NOTE:")
print(
    "similarity-distributions.csv contains aggregated summary statistics (one row per model),"
)
print(
    "not raw per-study similarity scores. Cannot create a distribution plot from this data.\n"
)

# Display the available summary statistics for gpt-5
if not trait_sim_focus.empty:
    print("Summary Statistics for gpt-5 Trait Profile Similarities:")
    print(
        f"  Number of study pairs: {int(trait_sim_focus['n_pairs'].iloc[0]):,}"
    )
    print(
        f"  Mean semantic similarity: {trait_sim_focus['mean_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  Median semantic similarity: {trait_sim_focus['median_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  SD semantic similarity: {trait_sim_focus['sd_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  25th percentile: {trait_sim_focus['p25_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  50th percentile: {trait_sim_focus['p50_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  75th percentile: {trait_sim_focus['p75_semantic_similarity'].iloc[0]:.4f}"
    )
    print(
        f"  95th percentile: {trait_sim_focus['p95_semantic_similarity'].iloc[0]:.4f}"
    )

    # Create a box plot representation using the percentile data
    percentile_data = pd.DataFrame(
        {
            "percentile": ["p25", "p50", "p75"],
            "value": [
                trait_sim_focus["p25_semantic_similarity"].iloc[0],
                trait_sim_focus["p50_semantic_similarity"].iloc[0],
                trait_sim_focus["p75_semantic_similarity"].iloc[0],
            ],
        }
    )

    # Show the distribution range visually
    summary_chart = (
        alt.Chart(percentile_data)
        .mark_bar()
        .encode(
            x=alt.X(
                "percentile:N", title="Percentile", axis=alt.Axis(labelAngle=0)
            ),
            y=alt.Y(
                "value:Q",
                title="Semantic Similarity",
                scale=alt.Scale(domain=[0.6, 0.9]),
            ),
            tooltip=["percentile", alt.Tooltip("value:Q", format=".4f")],
        )
        .properties(
            width=300,
            height=300,
            title="Semantic Similarity Percentiles (gpt-5)",
        )
    )

    display(summary_chart)

    print("\nüí° TO CREATE A PROPER DISTRIBUTION PLOT:")
    print(
        "   Re-run the trait profile similarity calculation with per-pair output enabled,"
    )
    print(
        "   or query the database directly for individual study-pair similarity scores."
    )
else:
    print("No trait similarity data available for gpt-5")


‚ö†Ô∏è  DATA STRUCTURE NOTE:
similarity-distributions.csv contains aggregated summary statistics (one row per model),
not raw per-study similarity scores. Cannot create a distribution plot from this data.

Summary Statistics for gpt-5 Trait Profile Similarities:
  Number of study pairs: 156,060
  Mean semantic similarity: 0.7237
  Median semantic similarity: 0.7527
  SD semantic similarity: 0.1836
  25th percentile: 0.6686
  50th percentile: 0.7527
  75th percentile: 0.8278
  95th percentile: 0.9371



üí° TO CREATE A PROPER DISTRIBUTION PLOT:
   Re-run the trait profile similarity calculation with per-pair output enabled,
   or query the database directly for individual study-pair similarity scores.


### Plot 2: Semantic Similarity vs Trait Count per Study (if possible)

In [14]:
# Heuristic merge: look for a common key (pmid or study identifier)
potential_keys = [
    k
    for k in ["pmid", "study_id", "paper_id"]
    if k in trait_sim_focus.columns and k in trait_count_focus.columns
]
print("Potential merge keys:", potential_keys)
if not potential_keys:
    print("No common key for merging semantic similarity with trait counts.")
else:
    key = potential_keys[0]
    sem_col = (
        [
            c
            for c in trait_sim_focus.columns
            if "semantic" in c and "similarity" in c
        ][0]
        if [
            c
            for c in trait_sim_focus.columns
            if "semantic" in c and "similarity" in c
        ]
        else None
    )
    if sem_col is None:
        print("Semantic similarity column not found for scatter plot.")
    else:
        merged = (
            trait_sim_focus[[key, sem_col]]
            .merge(
                trait_count_focus[[key, "trait_count"]], on=key, how="inner"
            )
            .rename(columns={sem_col: "semantic_similarity"})
        )
        print("Merged rows:", len(merged))
        scatter_sem_traits = (
            alt.Chart(merged)
            .mark_circle(opacity=0.5)
            .encode(
                x=alt.X("trait_count:Q", title="Trait Count"),
                y=alt.Y("semantic_similarity:Q", title="Semantic Similarity"),
                tooltip=["trait_count", "semantic_similarity"],
            )
            .properties(
                width=600,
                height=350,
                title="Semantic Similarity vs Trait Count (gpt-5)",
            )
        )
        display(scatter_sem_traits)


Potential merge keys: []
No common key for merging semantic similarity with trait counts.


### Plot 3: Semantic Similarity vs Jaccard (Contextual Reference, Still gpt-5)

In [15]:
if trait_corr_focus.empty:
    print("No trait correlation data for gpt-5.")
else:
    print(
        "Available columns in trait_corr_focus:",
        list(trait_corr_focus.columns),
    )
    needed = [
        c
        for c in ["semantic_similarity", "jaccard_similarity"]
        if c in trait_corr_focus.columns
    ]
    if len(needed) < 2:
        print(
            f"Missing required columns for semantic vs jaccard plot. Found: {needed}"
        )
        print("Trying case-insensitive search...")
        # Try case-insensitive matching
        col_lower = {c.lower(): c for c in trait_corr_focus.columns}
        needed_actual = []
        for target in ["semantic_similarity", "jaccard_similarity"]:
            if target.lower() in col_lower:
                needed_actual.append(col_lower[target.lower()])
        if len(needed_actual) >= 2:
            print(f"Found columns: {needed_actual}")
            corr_df = trait_corr_focus[needed_actual].dropna()
            corr_df.columns = ["semantic_similarity", "jaccard_similarity"]
            print(f"Rows for correlation plot: {len(corr_df)}")
            corr_scatter = (
                alt.Chart(corr_df)
                .mark_circle(size=60, opacity=0.5)
                .encode(
                    x=alt.X(
                        "jaccard_similarity:Q", title="Jaccard Similarity"
                    ),
                    y=alt.Y(
                        "semantic_similarity:Q", title="Semantic Similarity"
                    ),
                    tooltip=["jaccard_similarity", "semantic_similarity"],
                )
                .properties(
                    width=600,
                    height=350,
                    title="Semantic vs Jaccard Similarity (gpt-5)",
                )
            )
            corr_reg = corr_scatter.transform_regression(
                "jaccard_similarity", "semantic_similarity"
            ).mark_line(color="orange")
            display(corr_scatter + corr_reg)
        else:
            print(
                "Could not find required columns even with case-insensitive search."
            )
    else:
        corr_df = trait_corr_focus[needed].dropna()
        print(f"Rows for correlation plot: {len(corr_df)}")
        corr_scatter = (
            alt.Chart(corr_df)
            .mark_circle(size=60, opacity=0.5)
            .encode(
                x=alt.X("jaccard_similarity:Q", title="Jaccard Similarity"),
                y=alt.Y("semantic_similarity:Q", title="Semantic Similarity"),
                tooltip=["jaccard_similarity", "semantic_similarity"],
            )
            .properties(
                width=600,
                height=350,
                title="Semantic vs Jaccard Similarity (gpt-5)",
            )
        )
        corr_reg = corr_scatter.transform_regression(
            "jaccard_similarity", "semantic_similarity"
        ).mark_line(color="orange")
        display(corr_scatter + corr_reg)


Available columns in trait_corr_focus: ['model', 'n_pairs', 'corr_semantic_jaccard', 'corr_semantic_query_count', 'corr_semantic_similar_count', 'corr_jaccard_query_count', 'corr_jaccard_similar_count']
Missing required columns for semantic vs jaccard plot. Found: []
Trying case-insensitive search...
Could not find required columns even with case-insensitive search.


---
# Direction Concordance (Evidence Profiles)

### Plot 4: Distribution of Direction Concordance (gpt-5)

In [16]:
dir_cols = [
    c
    for c in evidence_sim_focus.columns
    if "direction" in c and "concordance" in c
]
print("Direction concordance columns:", dir_cols)
if not dir_cols:
    print("No direction concordance data found for gpt-5 evidence.")
else:
    dcol = dir_cols[0]
    df_dir = (
        evidence_sim_focus[[dcol]]
        .dropna()
        .rename(columns={dcol: "direction_concordance"})
    )
    dir_density = (
        alt.Chart(df_dir)
        .transform_density(
            "direction_concordance", as_=["direction_concordance", "density"]
        )
        .mark_area(opacity=0.6)
        .encode(
            x=alt.X("direction_concordance:Q", title="Direction Concordance"),
            y=alt.Y("density:Q", title="Density"),
        )
        .properties(
            width=600,
            height=300,
            title="Direction Concordance Distribution (gpt-5)",
        )
    )
    display(dir_density)


Direction concordance columns: ['mean_direction_concordance', 'median_direction_concordance', 'sd_direction_concordance', 'p25_direction_concordance', 'p75_direction_concordance', 'p95_direction_concordance']


### Plot 5: Direction Concordance vs Composite Similarity

In [17]:
# Try both raw and summary sources for composite similarity
comp_cols = [c for c in evidence_sim_focus.columns if "composite" in c]
if not comp_cols and not evidence_summary_focus.empty:
    comp_cols = [c for c in evidence_summary_focus.columns if "composite" in c]
print("Composite similarity columns candidate:", comp_cols)
if not comp_cols or not dir_cols:
    print("Cannot plot composite vs direction concordance (missing columns).")
else:
    dcol = dir_cols[0]
    ccol = comp_cols[0]
    src_df = (
        evidence_sim_focus
        if ccol in evidence_sim_focus.columns
        else evidence_summary_focus
    )
    scatter_df = (
        src_df[[dcol, ccol]]
        .dropna()
        .rename(
            columns={
                dcol: "direction_concordance",
                ccol: "composite_similarity",
            }
        )
    )
    comp_scatter = (
        alt.Chart(scatter_df)
        .mark_circle(size=60, opacity=0.5)
        .encode(
            x=alt.X("direction_concordance:Q", title="Direction Concordance"),
            y=alt.Y("composite_similarity:Q", title="Composite Similarity"),
            tooltip=["direction_concordance", "composite_similarity"],
        )
        .properties(
            width=600,
            height=350,
            title="Direction Concordance vs Composite Similarity (gpt-5)",
        )
    )
    comp_reg = comp_scatter.transform_regression(
        "direction_concordance", "composite_similarity"
    ).mark_line(color="orange")
    display(comp_scatter + comp_reg)


Composite similarity columns candidate: ['mean_composite_equal', 'median_composite_equal', 'sd_composite_equal', 'p25_composite_equal', 'p75_composite_equal', 'p95_composite_equal', 'mean_composite_direction', 'median_composite_direction', 'sd_composite_direction', 'p25_composite_direction', 'p75_composite_direction', 'p95_composite_direction']


### Plot 6: Linking Trait Semantic Similarity to Evidence Direction Concordance (Exploratory)
Requires common study identifier across trait and evidence datasets.

In [18]:
shared_keys = [
    k
    for k in ["pmid", "study_id", "paper_id"]
    if k in trait_sim_focus.columns and k in evidence_sim_focus.columns
]
print("Shared keys between trait and evidence focus frames:", shared_keys)
if not shared_keys:
    print("No shared identifier found; skipping cross-link plot.")
else:
    sem_col = (
        [
            c
            for c in trait_sim_focus.columns
            if "semantic" in c and "similarity" in c
        ][0]
        if [
            c
            for c in trait_sim_focus.columns
            if "semantic" in c and "similarity" in c
        ]
        else None
    )
    dir_col = dir_cols[0] if dir_cols else None
    if sem_col is None or dir_col is None:
        print("Missing semantic or direction column for cross-link.")
    else:
        key = shared_keys[0]
        cross_df = (
            trait_sim_focus[[key, sem_col]]
            .merge(evidence_sim_focus[[key, dir_col]], on=key, how="inner")
            .rename(
                columns={
                    sem_col: "semantic_similarity",
                    dir_col: "direction_concordance",
                }
            )
            .dropna()
        )
        print(f"Cross-linked rows: {len(cross_df)}")
        if len(cross_df) > 0:
            cross_scatter = (
                alt.Chart(cross_df)
                .mark_circle(size=60, opacity=0.5)
                .encode(
                    x=alt.X(
                        "semantic_similarity:Q",
                        title="Semantic Similarity (Trait)",
                    ),
                    y=alt.Y(
                        "direction_concordance:Q",
                        title="Direction Concordance (Evidence)",
                    ),
                    tooltip=["semantic_similarity", "direction_concordance"],
                )
                .properties(
                    width=600,
                    height=350,
                    title="Trait Semantic Similarity vs Evidence Direction Concordance (gpt-5)",
                )
            )
            cross_reg = cross_scatter.transform_regression(
                "semantic_similarity", "direction_concordance"
            ).mark_line(color="orange")
            display(cross_scatter + cross_reg)
        else:
            print("No data available for cross-link plot.")


Shared keys between trait and evidence focus frames: []
No shared identifier found; skipping cross-link plot.


---
## Export (Optional)

In [None]:
# Uncomment to export selected charts
# export_dir = PROCESSED_DIR / 'figures' / 'mr-kg-focused'
# export_dir.mkdir(parents=True, exist_ok=True)
# if 'semantic_density' in globals(): semantic_density.save(str(export_dir / 'semantic_density.json'))
# if 'scatter_sem_traits' in globals(): scatter_sem_traits.save(str(export_dir / 'semantic_vs_trait_count.json'))
# if 'corr_scatter' in globals(): (corr_scatter + corr_reg).save(str(export_dir / 'semantic_vs_jaccard.json'))
# if 'dir_density' in globals(): dir_density.save(str(export_dir / 'direction_concordance_density.json'))
# if 'comp_scatter' in globals(): (comp_scatter + comp_reg).save(str(export_dir / 'direction_vs_composite.json'))
# if 'cross_scatter' in globals(): (cross_scatter + cross_reg).save(str(export_dir / 'semantic_vs_direction_cross.json'))
# print('Exports complete:', export_dir)


## Notes & Further Ideas
- Consider time-based trends if publication year is added.
- Could stratify by trait domain if a mapping file is introduced.
- Extend evidence composite metric weighting sensitivity study.