Aggregate the processed LLM results and produce a sample,
and add the OpenAI results in as well.

In [1]:
%load_ext autoreload
%autoreload 2

import json

import pandas as pd

from yiutils.project_utils import find_project_root


In [2]:
# Resolve the project root (presumably the directory holding the justfile --
# confirm against find_project_root) and fail fast if its data/ folder is missing.
proj_root = find_project_root(anchor_file="justfile")
data_dir = proj_root.joinpath("data")
assert data_dir.exists(), f"Data directory {data_dir} does not exist."


In [24]:
# Load the processed MR PubMed records into a DataFrame.
path_to_processed_mr_pubmed_data = (
    data_dir / "intermediate" / "mr-pubmed-data" / "mr-pubmed-data.json"
)
assert path_to_processed_mr_pubmed_data.exists(), (
    f"Processed MR PubMed data file {path_to_processed_mr_pubmed_data} does not exist."
)

# Decode explicitly as UTF-8: the default for open() depends on the platform
# locale, which can mis-decode non-ASCII text on non-UTF-8 systems. This also
# matches pd.read_json, which the model-result cells use and which defaults to UTF-8.
with path_to_processed_mr_pubmed_data.open("r", encoding="utf-8") as f:
    mr_pubmed_json = json.load(f)

# Build the frame once the file is closed. PMIDs are cast to str so they can be
# compared with the string PMIDs of the model result frames.
mr_pubmed_df = pd.DataFrame(mr_pubmed_json).assign(
    pmid=lambda x: x["pmid"].astype(str)
)

mr_pubmed_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15635 entries, 0 to 15634
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   pmid          15635 non-null  object
 1   ab            15635 non-null  object
 2   pub_date      15635 non-null  object
 3   title         15635 non-null  object
 4   journal_issn  15434 non-null  object
 5   journal       15635 non-null  object
 6   author_affil  15473 non-null  object
dtypes: object(7)
memory usage: 855.2+ KB


In [25]:
# Aggregated results stored under "deepseek-r1-distilled". Rows missing either
# the metadata or the results field are dropped, and PMIDs are cast to str.
path_to_ds_r1_results = data_dir.joinpath(
    "intermediate",
    "llm-results-aggregated",
    "deepseek-r1-distilled",
    "processed_results.json",
)
assert path_to_ds_r1_results.exists()

df_ds_r1 = (
    pd.read_json(path_to_ds_r1_results, orient="records")
    .dropna(subset=["metadata", "results"])
    .astype({"pmid": str})
)
df_ds_r1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6184 entries, 0 to 6499
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   pmid               6184 non-null   object
 1   ab                 6184 non-null   object
 2   title              6184 non-null   object
 3   metadata_thinking  6184 non-null   object
 4   metadata           6184 non-null   object
 5   results_thinking   6184 non-null   object
 6   results            6184 non-null   object
dtypes: object(7)
memory usage: 386.5+ KB


In [26]:
# Aggregated results stored under "llama3". Rows missing either the metadata
# or the results field are dropped, and PMIDs are cast to str.
path_to_llama3_results = data_dir.joinpath(
    "intermediate",
    "llm-results-aggregated",
    "llama3",
    "processed_results.json",
)
assert path_to_llama3_results.exists()

df_llama3 = (
    pd.read_json(path_to_llama3_results, orient="records")
    .dropna(subset=["metadata", "results"])
    .astype({"pmid": str})
)
df_llama3.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6901 entries, 0 to 6999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pmid      6901 non-null   object
 1   ab        6901 non-null   object
 2   title     6901 non-null   object
 3   metadata  6901 non-null   object
 4   results   6901 non-null   object
dtypes: object(5)
memory usage: 323.5+ KB


In [27]:
# Aggregated results stored under "llama3-2". Rows missing either the metadata
# or the results field are dropped, and PMIDs are cast to str.
path_to_llama3_2_results = data_dir.joinpath(
    "intermediate",
    "llm-results-aggregated",
    "llama3-2",
    "processed_results.json",
)
assert path_to_llama3_2_results.exists()

df_llama3_2 = (
    pd.read_json(path_to_llama3_2_results, orient="records")
    .dropna(subset=["metadata", "results"])
    .astype({"pmid": str})
)
df_llama3_2.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6720 entries, 0 to 6999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pmid      6720 non-null   object
 1   ab        6720 non-null   object
 2   title     6720 non-null   object
 3   metadata  6720 non-null   object
 4   results   6720 non-null   object
dtypes: object(5)
memory usage: 315.0+ KB


----

# Aggregate analysis

In [28]:
# Distinct PMIDs present in each model's cleaned result frame.
ids_ds_r1, ids_llama3, ids_llama3_2 = (
    set(frame["pmid"]) for frame in (df_ds_r1, df_llama3, df_llama3_2)
)


In [29]:
# PMIDs for which all three models produced usable results.
intersection = ids_ds_r1 & ids_llama3 & ids_llama3_2
print(f"Number of intersecting IDs: {len(intersection)}")


Number of intersecting IDs: 5901


# Sample output

In [35]:
# Draw a reproducible sample of 5 PMIDs shared by all three models.
# `intersection` is a set of strings, and string hashing is randomised per
# interpreter process, so its iteration order can differ between kernel
# sessions. Sorting first fixes the order that `random_state` selects from, so
# a fresh kernel draws the same PMIDs from the same data.
sample_ids = pd.Series(sorted(intersection)).sample(n=5, random_state=42)
print(f"Sampled IDs: {sample_ids.tolist()}")


Sampled IDs: ['33261611', '28389615', '35624721', '37836511', '37876930']


In [36]:
# Restrict the PubMed frame and each model's frame to the sampled PMIDs.
pubmed_data = mr_pubmed_df.loc[mr_pubmed_df["pmid"].isin(sample_ids)]
sample_ds_r1 = df_ds_r1.loc[df_ds_r1["pmid"].isin(sample_ids)]
sample_llama3 = df_llama3.loc[df_llama3["pmid"].isin(sample_ids)]
sample_llama3_2 = df_llama3_2.loc[df_llama3_2["pmid"].isin(sample_ids)]


In [39]:
# Assemble one record per sampled PMID: the PubMed row under "pubmed_data" and
# each model's row under "result". The model rows omit "pmid", "ab" and "title",
# which the PubMed row already carries.
def _first_row_as_dict(frame, pmid, omit=()):
    """Return the first row of `frame` whose pmid equals `pmid`, as a dict without the `omit` keys."""
    row = frame.loc[frame["pmid"] == pmid].iloc[0].to_dict()
    return {key: value for key, value in row.items() if key not in omit}


output_data = []
for pmid in sample_ids:
    output_data.append(
        {
            "pubmed_data": _first_row_as_dict(pubmed_data, pmid),
            "result": {
                "ds_r1": _first_row_as_dict(
                    sample_ds_r1, pmid, omit=("pmid", "ab", "title")
                ),
                "llama3": _first_row_as_dict(
                    sample_llama3, pmid, omit=("pmid", "ab", "title")
                ),
                "llama3_2": _first_row_as_dict(
                    sample_llama3_2, pmid, omit=("pmid", "ab", "title")
                ),
            },
        }
    )


In [40]:
# Write the full list of sampled records, creating the artifacts folder on first use.
output_dir = proj_root.joinpath("data", "artifacts", "sample_visualization")
output_dir.mkdir(parents=True, exist_ok=True)

path_to_output = output_dir.joinpath("data_sample.json")
with open(path_to_output, "w") as sample_file:
    json.dump(output_data, sample_file, indent=2)


In [41]:
# Also write only the first sampled record, as a single JSON object in its own file.
path_to_output = output_dir.joinpath("data_sample_base.json")
with open(path_to_output, "w") as base_file:
    json.dump(output_data[0], base_file, indent=2)
