Process the batch processing results for
- deepseek-r1-distilled
- llama3.2

In [4]:
%load_ext autoreload
%autoreload 2

import json
from pprint import pprint

import pandas as pd

from yiutils.project_utils import find_project_root
from local_funcs import parsers


# params


In [5]:
proj_root = find_project_root("justfile")
print(proj_root)

data_dir = proj_root / "data"
assert data_dir.exists()

output_dir = data_dir / "intermediate" / "llm-results-aggregated"
output_dir.mkdir(parents=True, exist_ok=True)

llm_results_dir = data_dir / "intermediate" / "llm-results"


/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction


In [6]:
path_to_ds_r1_result_dir = llm_results_dir / "isb-ai-117256" / "results"
assert path_to_ds_r1_result_dir.exists()

path_to_llama3_2_result_dir = llm_results_dir / "isb-ai-117535" / "results"
assert path_to_llama3_2_result_dir.exists()


# Deepseek-r1-distilled

## Load raw data


In [13]:
json_files = list(path_to_ds_r1_result_dir.glob("*.json"))
print(len(json_files))

json_data = []
for json_file in json_files:
    with open(json_file, "r") as f:
        data = {"data": json.load(f), "filename": str(json_file.name)}
        json_data.append(data)

raw_results_df = pd.concat(
    [
        pd.DataFrame(data["data"]).assign(filename=data["filename"])
        for data in json_data
    ],
).reset_index(drop=True)
raw_results_df.info()
raw_results_df


65
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   pmid                 6500 non-null   object
 1   ab                   6500 non-null   object
 2   pub_date             6500 non-null   object
 3   title                6500 non-null   object
 4   journal_issn         6449 non-null   object
 5   journal              6500 non-null   object
 6   author_affil         6436 non-null   object
 7   completion_metadata  6500 non-null   object
 8   completion_results   6500 non-null   object
 9   filename             6500 non-null   object
dtypes: object(10)
memory usage: 507.9+ KB


Unnamed: 0,pmid,ab,pub_date,title,journal_issn,journal,author_affil,completion_metadata,completion_results,filename
0,37553610,Major depressive disorder (MDD) is a highly he...,2022-12-17,Causal association between major depressive di...,1755-8794,BMC medical genomics,"Department of Health Statistics, School of Pub...","Alright, so I need to figure out how to respon...","Alright, I need to help the user by extracting...",mr_extract_array_25.json
1,37553064,Lipoprotein (a) (Lp[a]) is an established risk...,2023-08-03,Association Between Lipoprotein (a) and Risk o...,1535-6280,Current problems in cardiology,"Departments of Clinical Epidemiology, Biostati...","Alright, I need to figure out how to respond t...","Alright, so I'm trying to figure out how to re...",mr_extract_array_25.json
2,37552661,Observational studies suggest that electrocard...,2022-11-12,Multiple anthropometric measures and proarrhyt...,1549-1676,PLoS medicine,"National Heart and Lung Institute, Imperial Co...","Alright, I need to figure out how to respond t...","Alright, so I need to extract the results from...",mr_extract_array_25.json
3,37550780,Previous observational studies have reported t...,2023-06-19,Causal relationships between delirium and Alzh...,2047-783X,European journal of medical research,"Department of Anesthesiology, Chongqing Emerge...","Alright, so I need to figure out how to respon...","Alright, so I need to extract specific informa...",mr_extract_array_25.json
4,37549427,"Many studies reported that lifestyle, psychoso...",2023-04-09,"Causal Roles of Lifestyle, Psychosocial Charac...",1758-535X,"The journals of gerontology. Series A, Biologi...","Department of Traumatic Surgery, Shanghai East...","Alright, so I need to figure out the exposures...","Alright, so I need to extract specific informa...",mr_extract_array_25.json
...,...,...,...,...,...,...,...,...,...,...
6495,36973792,Patients with rheumatoid arthritis (RA) have a...,2023-01-16,The genetic liability to rheumatoid arthritis ...,1478-6362,Arthritis research & therapy,"Guangzhou Medical University, Guangzhou, 51143...","Alright, so I'm trying to figure out how to re...","Alright, so I need to extract the results from...",mr_extract_array_30.json
6496,36971839,Asthma is a common respiratory disease caused ...,2022-12-31,The Association Between Insulin Use and Asthma...,1432-1750,Lung,"State Key Laboratory of Respiratory Disease, N...","Alright, I need to figure out how to respond t...","Alright, so I need to extract the results from...",mr_extract_array_30.json
6497,36970527,Though omega-3 fatty acids reduce seizures in ...,2022-12-29,Causal relationship between human blood omega-...,1664-2295,Frontiers in neurology,"Department of Neurology, China-Japan Union Hos...","Alright, I need to figure out how to respond t...","Alright, so I'm trying to help the user by ext...",mr_extract_array_30.json
6498,36969605,The protection of physical activity (PA) again...,2022-11-04,The causal effect of physical activity intensi...,1664-042X,Frontiers in physiology,"Institute of Sports Science, College of Physic...","Alright, so I need to figure out how to respon...","Alright, so I need to extract the results from...",mr_extract_array_30.json


In [10]:
output_path = output_dir / "deepseek-r1-distilled" / "raw_results.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
    raw_results_df.to_json(f, orient="records", indent=2)


## Process results


### markdown parsing


In [None]:
cell = raw_results_df["completion_metadata"].iloc[0]
print(cell)


Alright, so I need to figure out how to respond to the user's query. They provided an abstract from a Mendelian randomization study on Major depressive disorder (MDD) and Coronary Heart Disease (CHD). The user wants me to extract specific information and present it in a JSON format with certain categories and methods.

First, I'll start by identifying the exposures and outcomes mentioned in the abstract. The abstract talks about MDD and CHD. So, MDD is definitely an exposure, and CHD is the outcome. I don't see any other exposures or outcomes mentioned, so I'll list them as single items.

Next, I need to categorize these into the provided groups. MDD falls under "mental disorder," and CHD is part of the "circulatory system." So, the categories should be straightforward.

Now, looking at the analytical methods used. The abstract mentions a "two-sample Bidirectional Mendelian Randomization Study." That matches exactly with one of the method names in the list. There's also mention of meth

In [None]:
json_data = parsers.extract_json_from_markdown(cell)
pprint(json_data)


{'metadata': {'exposures': [{'category': 'mental disorder',
                             'id': '1',
                             'trait': 'Major depressive disorder (MDD)'}],
              'metainformation': {'error': 'No information on population is '
                                           'provided in the abstract',
                                  'explanation': 'The population is not '
                                                 'explicitly mentioned in the '
                                                 'abstract.'},
              'methods': [{'description': 'The study used a two-sample '
                                          'bidirectional Mendelian '
                                          'Randomization approach to assess '
                                          'the causal relationship between MDD '
                                          'and CHD.',
                           'id': '1',
                           'name': 'two-sample mendelian randomiza

In [None]:
thinking_text = parsers.extract_thinking(cell)
print(thinking_text)


Alright, so I need to figure out how to respond to the user's query. They provided an abstract from a Mendelian randomization study on Major depressive disorder (MDD) and Coronary Heart Disease (CHD). The user wants me to extract specific information and present it in a JSON format with certain categories and methods.

First, I'll start by identifying the exposures and outcomes mentioned in the abstract. The abstract talks about MDD and CHD. So, MDD is definitely an exposure, and CHD is the outcome. I don't see any other exposures or outcomes mentioned, so I'll list them as single items.

Next, I need to categorize these into the provided groups. MDD falls under "mental disorder," and CHD is part of the "circulatory system." So, the categories should be straightforward.

Now, looking at the analytical methods used. The abstract mentions a "two-sample Bidirectional Mendelian Randomization Study." That matches exactly with one of the method names in the list. There's also mention of meth

### batch process

In [14]:
results_df = raw_results_df.assign(
    metadata_thinking=lambda df: df["completion_metadata"].apply(
        parsers.extract_thinking
    ),
    metadata=lambda df: df["completion_metadata"].apply(
        parsers.extract_json_from_markdown
    ),
    results_thinking=lambda df: df["completion_results"].apply(
        parsers.extract_thinking
    ),
    results=lambda df: df["completion_results"].apply(
        parsers.extract_json_from_markdown
    ),
)[
    [
        "pmid",
        "ab",
        "title",
        "metadata_thinking",
        "metadata",
        "results_thinking",
        "results",
    ]
]

results_df.info()
results_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   pmid               6500 non-null   object
 1   ab                 6500 non-null   object
 2   title              6500 non-null   object
 3   metadata_thinking  6500 non-null   object
 4   metadata           6469 non-null   object
 5   results_thinking   6500 non-null   object
 6   results            6214 non-null   object
dtypes: object(7)
memory usage: 355.6+ KB


Unnamed: 0,pmid,ab,title,metadata_thinking,metadata,results_thinking,results
0,37553610,Major depressive disorder (MDD) is a highly he...,Causal association between major depressive di...,"Alright, so I need to figure out how to respon...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, I need to help the user by extracting...",{'results': [{'exposure': 'Major depressive di...
1,37553064,Lipoprotein (a) (Lp[a]) is an established risk...,Association Between Lipoprotein (a) and Risk o...,"Alright, I need to figure out how to respond t...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I'm trying to figure out how to re...",{'results': [{'exposure': 'Lipoprotein (a) (Lp...
2,37552661,Observational studies suggest that electrocard...,Multiple anthropometric measures and proarrhyt...,"Alright, I need to figure out how to respond t...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I need to extract the results from...",
3,37550780,Previous observational studies have reported t...,Causal relationships between delirium and Alzh...,"Alright, so I need to figure out how to respon...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I need to extract specific informa...","{'results': [{'exposure': 'Delirium', 'outcome..."
4,37549427,"Many studies reported that lifestyle, psychoso...","Causal Roles of Lifestyle, Psychosocial Charac...","Alright, so I need to figure out the exposures...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I need to extract specific informa...","{'results': [{'exposure': 'Coffee intake', 'ou..."
...,...,...,...,...,...,...,...
6495,36973792,Patients with rheumatoid arthritis (RA) have a...,The genetic liability to rheumatoid arthritis ...,"Alright, so I'm trying to figure out how to re...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I need to extract the results from...",{'results': [{'exposure': 'Rheumatoid Arthriti...
6496,36971839,Asthma is a common respiratory disease caused ...,The Association Between Insulin Use and Asthma...,"Alright, I need to figure out how to respond t...",,"Alright, so I need to extract the results from...","{'results': [{'exposure': 'insulin use', 'outc..."
6497,36970527,Though omega-3 fatty acids reduce seizures in ...,Causal relationship between human blood omega-...,"Alright, I need to figure out how to respond t...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I'm trying to help the user by ext...",{'results': [{'exposure': 'Blood omega-3 fatty...
6498,36969605,The protection of physical activity (PA) again...,The causal effect of physical activity intensi...,"Alright, so I need to figure out how to respon...","{'metadata': {'exposures': [{'id': '1', 'trait...","Alright, so I need to extract the results from...",{'results': [{'exposure': 'Light physical acti...


In [None]:
output_path = output_dir / "deepseek-r1-distilled" / "processed_results.json"

with open(output_path, "w") as f:
    results_df.to_json(f, orient="records", indent=2)


In [15]:
output_path = output_dir / "deepseek-r1-distilled" / "processed_results_sample.json"

results_df[:10].to_json(output_path, orient="records", indent=2)


# llama 3.2

## Load raw data

In [7]:
json_files = list(path_to_llama3_2_result_dir.glob("*.json"))
print(len(json_files))

json_data = []
for json_file in json_files:
    with open(json_file, "r") as f:
        data = {"data": json.load(f), "filename": str(json_file.name)}
        json_data.append(data)

raw_results_df = pd.concat(
    [
        pd.DataFrame(data["data"]).assign(filename=data["filename"])
        for data in json_data
    ],
).reset_index(drop=True)
raw_results_df.info()
raw_results_df


70
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   pmid                 7000 non-null   object
 1   ab                   7000 non-null   object
 2   pub_date             7000 non-null   object
 3   title                7000 non-null   object
 4   journal_issn         6942 non-null   object
 5   journal              7000 non-null   object
 6   author_affil         6932 non-null   object
 7   completion_metadata  7000 non-null   object
 8   completion_results   7000 non-null   object
 9   filename             7000 non-null   object
dtypes: object(10)
memory usage: 547.0+ KB


Unnamed: 0,pmid,ab,pub_date,title,journal_issn,journal,author_affil,completion_metadata,completion_results,filename
0,37553610,Major depressive disorder (MDD) is a highly he...,2022-12-17,Causal association between major depressive di...,1755-8794,BMC medical genomics,"Department of Health Statistics, School of Pub...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_25.json
1,37553064,Lipoprotein (a) (Lp[a]) is an established risk...,2023-08-03,Association Between Lipoprotein (a) and Risk o...,1535-6280,Current problems in cardiology,"Departments of Clinical Epidemiology, Biostati...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_25.json
2,37552661,Observational studies suggest that electrocard...,2022-11-12,Multiple anthropometric measures and proarrhyt...,1549-1676,PLoS medicine,"National Heart and Lung Institute, Imperial Co...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_25.json
3,37550780,Previous observational studies have reported t...,2023-06-19,Causal relationships between delirium and Alzh...,2047-783X,European journal of medical research,"Department of Anesthesiology, Chongqing Emerge...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_25.json
4,37549427,"Many studies reported that lifestyle, psychoso...",2023-04-09,"Causal Roles of Lifestyle, Psychosocial Charac...",1758-535X,"The journals of gerontology. Series A, Biologi...","Department of Traumatic Surgery, Shanghai East...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_25.json
...,...,...,...,...,...,...,...,...,...,...
6995,36973792,Patients with rheumatoid arthritis (RA) have a...,2023-01-16,The genetic liability to rheumatoid arthritis ...,1478-6362,Arthritis research & therapy,"Guangzhou Medical University, Guangzhou, 51143...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_30.json
6996,36971839,Asthma is a common respiratory disease caused ...,2022-12-31,The Association Between Insulin Use and Asthma...,1432-1750,Lung,"State Key Laboratory of Respiratory Disease, N...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_30.json
6997,36970527,Though omega-3 fatty acids reduce seizures in ...,2022-12-29,Causal relationship between human blood omega-...,1664-2295,Frontiers in neurology,"Department of Neurology, China-Japan Union Hos...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_30.json
6998,36969605,The protection of physical activity (PA) again...,2022-11-04,The causal effect of physical activity intensi...,1664-042X,Frontiers in physiology,"Institute of Sports Science, College of Physic...","{\n ""exposures"": [\n {\n ""id"": ""1"",\n...","{\n ""results"": [\n {\n ""e...",mr_extract_array_30.json


In [8]:
output_path = output_dir / "llama3-2" / "raw_results.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
    raw_results_df.to_json(f, orient="records", indent=2)


## Process results

In [9]:
results_df = raw_results_df.assign(
    metadata=lambda df: df["completion_metadata"].apply(parsers.parse_json),
    results=lambda df: df["completion_results"].apply(parsers.parse_json),
)[
    [
        "pmid",
        "ab",
        "title",
        "metadata",
        "results",
    ]
]

results_df.info()
results_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pmid      7000 non-null   object
 1   ab        7000 non-null   object
 2   title     7000 non-null   object
 3   metadata  6978 non-null   object
 4   results   6741 non-null   object
dtypes: object(5)
memory usage: 273.6+ KB


Unnamed: 0,pmid,ab,title,metadata,results
0,37553610,Major depressive disorder (MDD) is a highly he...,Causal association between major depressive di...,"{'exposures': [{'id': '1', 'trait': 'Major dep...",{'results': [{'exposure': 'Major depressive di...
1,37553064,Lipoprotein (a) (Lp[a]) is an established risk...,Association Between Lipoprotein (a) and Risk o...,"{'exposures': [{'id': '1', 'trait': 'Lipoprote...","{'results': [{'exposure': 'Lipid (a)', 'outcom..."
2,37552661,Observational studies suggest that electrocard...,Multiple anthropometric measures and proarrhyt...,"{'exposures': [{'id': '1', 'trait': 'Body mass...","{'results': [{'exposure': 'BMI', 'outcome': 'P..."
3,37550780,Previous observational studies have reported t...,Causal relationships between delirium and Alzh...,"{'exposures': [{'id': '1', 'trait': 'Delirium'...","{'results': [{'exposure': 'delirium', 'outcome..."
4,37549427,"Many studies reported that lifestyle, psychoso...","Causal Roles of Lifestyle, Psychosocial Charac...","{'exposures': [{'id': '1', 'trait': 'coffee in...","{'results': [{'exposure': 'coffee intake', 'ou..."
...,...,...,...,...,...
6995,36973792,Patients with rheumatoid arthritis (RA) have a...,The genetic liability to rheumatoid arthritis ...,"{'exposures': [{'id': '1', 'trait': 'Rheumatoi...",{'results': [{'exposure': 'Rheumatoid Arthriti...
6996,36971839,Asthma is a common respiratory disease caused ...,The Association Between Insulin Use and Asthma...,"{'exposures': [{'id': '1', 'trait': 'Insulin u...","{'results': [{'exposure': 'Insulin use', 'outc..."
6997,36970527,Though omega-3 fatty acids reduce seizures in ...,Causal relationship between human blood omega-...,"{'exposures': [{'id': '1', 'trait': 'omega-3 f...",{'results': [{'exposure': 'genetically determi...
6998,36969605,The protection of physical activity (PA) again...,The causal effect of physical activity intensi...,"{'exposures': [{'id': '1', 'trait': 'Physical ...",{'results': [{'exposure': 'Light physical acti...


In [10]:
output_path = output_dir / "llama3-2" / "processed_results.json"

with open(output_path, "w") as f:
    results_df.to_json(f, orient="records", indent=2)


In [None]:
output_path = output_dir / "llama3-2" / "processed_results_sample.json"

results_df[:10].to_json(output_path, orient="records", indent=2)
