# Single case extraction by OpenAI models
Converted from marimo app to Jupyter notebook.

In [39]:
%load_ext autoreload
%autoreload 2

# Module init: locate script and import dynamically
import contextlib
import importlib.util
import io
import json
from pprint import pprint

from tqdm import tqdm

from local_funcs import parsers
from yiutils.project_utils import find_project_root
# Root directory that every path in this notebook is built from. "justfile" is the
# marker handed to find_project_root (presumably the file whose presence identifies
# the repository root -- confirm against yiutils).
PROJECT_ROOT = find_project_root("justfile")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# The extraction script has a hyphenated file name, so it cannot be pulled in with a
# plain `import` statement; load it from its file path instead.
_path_to_script = PROJECT_ROOT.joinpath("scripts", "python", "extract-data-openai.py")

spec = importlib.util.spec_from_file_location("extract_data", str(_path_to_script))
extract_data = importlib.util.module_from_spec(spec)  # type: ignore
spec.loader.exec_module(extract_data)  # type: ignore

print(f"Loaded extract_data from {_path_to_script}")

# Bind the script's entry points to short notebook-level names
get_config, load_schema_data, setup_openai_client, process_abstract = (
    extract_data.get_config,
    extract_data.load_schema_data,
    extract_data.setup_openai_client,
    extract_data.process_abstract,
)


Loaded extract_data from /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/scripts/python/extract-data-openai.py


In [33]:
# Load the aggregated-results post-processing script from its file path (the
# hyphenated file name rules out a normal `import`).
# The module gets its own name and the variables are distinct from the ones used to
# load `extract_data`, so this cell neither mislabels the module nor rebinds that
# cell's `spec` / `_path_to_script`.
_path_to_agg_script = PROJECT_ROOT / "scripts" / "python" / "process-llm-aggregated-results.py"

agg_spec = importlib.util.spec_from_file_location(
    "process_llm_aggregated_results", str(_path_to_agg_script)
)
# spec_from_file_location may return None (and a spec may lack a loader); fail with a
# clear message instead of an AttributeError on None.
if agg_spec is None or agg_spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {_path_to_agg_script}")
agg_results = importlib.util.module_from_spec(agg_spec)
agg_spec.loader.exec_module(agg_results)

print(f"Loaded agg_results from {_path_to_agg_script}")

# Bind the post-processing helpers to short notebook-level names
validate_item_with_schema = agg_results.validate_item_with_schema
process_metadata = agg_results.process_metadata
process_results = agg_results.process_results


Loaded extract_data from /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/scripts/python/process-llm-aggregated-results.py


# Init

In [None]:
# Mock CLI args (pilot mode)
class MockArgs:
    """Stand-in for the parsed CLI arguments that are passed to ``get_config``.

    Args:
        model: Model name stored on ``self.model``. Defaults to "gpt-5"; other
            values previously tried in this notebook were "o4-mini" and "gpt-4o".
        pilot: Value stored on ``self.pilot``. Defaults to True (pilot mode for
            testing).

    All other attributes are fixed: outputs go under ``PROJECT_ROOT / "output"``,
    the input data path is ``extract_data.PATH_DATA``, and the array settings are
    ``array_id=0`` / ``array_length=30`` with ``dry_run=False``.
    """

    def __init__(self, model="gpt-5", pilot=True):
        self.output_dir = PROJECT_ROOT / "output"
        self.path_data = extract_data.PATH_DATA
        self.array_id = 0
        self.array_length = 30
        self.pilot = pilot
        self.model = model
        self.dry_run = False


# Defaults reproduce the original hard-coded configuration (gpt-5, pilot mode)
mock_args = MockArgs()
print(f"Mock args: {vars(mock_args)}")


Mock args: {'output_dir': PosixPath('/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/output'), 'path_data': PosixPath('/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/data/intermediate/mr-pubmed-data/mr-pubmed-data-sample.json'), 'array_id': 0, 'array_length': 30, 'pilot': True, 'model': 'gpt-5', 'dry_run': False}


In [11]:
# Load config and abstracts
# get_config prints the full config, which includes the OpenAI API key, and that text
# ends up stored in the notebook output. Capture what it writes to stdout and redact
# the key before echoing it.
# NOTE(review): this only covers output written via sys.stdout; if get_config raises,
# the captured text is not shown. The real fix is to stop printing the key in the
# script itself.
_config_stdout = io.StringIO()
with contextlib.redirect_stdout(_config_stdout):
    config, pubmed_data = get_config(args=mock_args)

_config_log = _config_stdout.getvalue()
_api_key = config["openai_api_key"]
if _api_key:
    _config_log = _config_log.replace(str(_api_key), "<redacted>")
print(_config_log, end="")

print(f"Loaded {len(pubmed_data)} abstracts")
print(f"Config keys: {list(config.keys())}")


Loading data from /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/data/intermediate/mr-pubmed-data/mr-pubmed-data-sample.json
[chunking] pilot mode: startpoint=0, endpoint=min(5, 7000) = 5
Config: {'array_task_id': 0, 'openai_api_key': '<REDACTED>', 'num_docs': 5, 'path_to_pubmed': PosixPath('/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/data/intermediate/mr-pubmed-data/mr-pubmed-data-sample.json'), 'output_dir': PosixPath('/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/output/gpt-5'), 'out_file': PosixPath('/Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/output/gpt-5/mr_extract_openai_array_0_pilot.json'), 'model_config_name': 'gpt-5', 'model_config': {'model_id': 'gpt-5', 'chat_func': <function g

In [16]:
# Read the JSON schemas that the processed metadata / results are validated against
data_dir = PROJECT_ROOT / "data"
_processed_schema_dir = data_dir / "assets" / "data-schema" / "processed_results"

metadata_schema_file = _processed_schema_dir / "metadata.schema.json"
results_schema_file = _processed_schema_dir / "results.schema.json"

metadata_schema = json.loads(metadata_schema_file.read_text())
results_schema = json.loads(results_schema_file.read_text())


In [12]:
# Setup OpenAI client
# The API key comes from the config returned by get_config; it is never written
# literally in this notebook and should not be printed.
client = setup_openai_client(api_key=config["openai_api_key"])
print("OpenAI client initialized")


Loaded OpenAI client
OpenAI client initialized


In [13]:
# Load the prompt-side schema bundle (an example and a schema per section)
schema_data = load_schema_data()
print("Schema data loaded")
_schema_sections = list(schema_data.keys())
print(f"Schema sections: {_schema_sections}")


All schema files found.
Schema data loaded
Schema sections: ['metadata', 'results']


In [None]:
# Setup for processing a single abstract
# `model_config` is the per-model settings entry of the loaded config; its
# "chat_func" entry is the callable invoked later as chat_func(client, messages).
model_config = config["model_config"]
chat_func = model_config["chat_func"]


['pmid', 'ab', 'pub_date', 'title', 'journal_issn', 'journal', 'author_affil']


---
# Extraction of a single item

In [None]:
# Take the first loaded abstract as the single test case and list its fields
article_data = pubmed_data[0]
_article_fields = list(article_data.keys())
print(_article_fields)


## Prompts

In [15]:
# Assemble the chat messages for the metadata extraction
from local_funcs import prompt_funcs

_metadata_section = schema_data["metadata"]
input_prompt_metadata = prompt_funcs.make_message_metadata_new(
    abstract=article_data["ab"],
    json_example=_metadata_section["example"],
    json_schema=_metadata_section["schema"],
)
pprint(input_prompt_metadata)


[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcohol us

In [16]:
# Assemble the chat messages for the results extraction
_results_section = schema_data["results"]
input_prompt_results = prompt_funcs.make_message_results_new(
    abstract=article_data["ab"],
    json_example=_results_section["example"],
    json_schema=_results_section["schema"],
)
pprint(input_prompt_results)


[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcohol us

## Chat completion results

In [17]:
# Send both prompts to the model, then merge the two completions into a copy of the
# article record (the article's own fields come first).
completion_metadata = chat_func(client, input_prompt_metadata)
completion_results = chat_func(client, input_prompt_results)
result = dict(
    completion_metadata=completion_metadata,
    completion_results=completion_results,
)
output = {**article_data, **result}


In [None]:
# Show the merged record: the article fields plus the two raw completions
pprint(output)


{'ab': 'Alcohol consumption significantly impacts disease burden and has been '
       'linked to various diseases in observational studies. However, '
       'comprehensive meta-analyses using Mendelian randomization (MR) to '
       'examine drinking patterns are limited. We aimed to evaluate the health '
       'risks of alcohol use by integrating findings from MR studies. A '
       'thorough search was conducted for MR studies focused on alcohol '
       'exposure. We utilized two sets of instrumental variables-alcohol '
       'consumption and problematic alcohol use-and summary statistics from '
       'the FinnGen consortium R9 release to perform de novo MR analyses. Our '
       'meta-analysis encompassed 64 published and 151 de novo MR analyses '
       'across 76 distinct primary outcomes. Results show that a genetic '
       'predisposition to alcohol consumption, independent of smoking, '
       "significantly correlates with a decreased risk of Parkinson's disease, "
    

In [None]:
# Persist the single-case record as indented JSON, creating the folder if needed
output_path = PROJECT_ROOT.joinpath("output", "openai_output.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(output, indent=2))
print(f"Wrote {output_path}")


Wrote /Users/ik18445/local-projects/+dmer/+mr-paper-data-extraction/llm-data-extraction/output/openai_output.json


## Schema validation

In [19]:
# Read the saved single-case record back from disk so that validation runs on the
# persisted JSON rather than on the in-memory object
input_path = PROJECT_ROOT.joinpath("output", "openai_output.json")
output = json.loads(input_path.read_text())

print(output.keys())


dict_keys(['pmid', 'ab', 'pub_date', 'title', 'journal_issn', 'journal', 'author_affil', 'completion_metadata', 'completion_results'])


In [35]:
# Parse the raw metadata completion into Python data
# (parse_json presumably returns None when the text cannot be parsed -- confirm)
parsed_output = parsers.parse_json(output["completion_metadata"])
assert parsed_output is not None
# Post-process the parsed metadata before validating it
metadata_extraction = process_metadata(parsed_output)
pprint(metadata_extraction)

# Last expression of the cell: its return value is what the notebook displays
validate_item_with_schema(metadata_extraction, metadata_schema)


{'exposures': [{'category': 'behavioural',
                'id': '1',
                'trait': 'Alcohol consumption'},
               {'category': 'behavioural',
                'id': '2',
                'trait': 'Problematic alcohol use'}],
 'methods': ['two-sample mendelian randomization',
             'multivariable mendelian randomization',
             'Other: systematic review and meta-analysis'],
 'outcomes': [{'category': 'disease of the nervous system',
               'id': '1',
               'trait': "Parkinson's disease"},
              {'category': 'disease of the genitourinary system',
               'id': '2',
               'trait': 'Prostate hyperplasia (benign prostatic hyperplasia)'},
              {'category': 'disease of the musculoskeletal system and '
                           'connective tissue',
               'id': '3',
               'trait': 'Rheumatoid arthritis'},
              {'category': 'disease of the digestive system',
               'id': '4',
   

True

In [36]:
# Parse the raw results completion into Python data
# (parse_json presumably returns None when the text cannot be parsed -- confirm)
parsed_output = parsers.parse_json(output["completion_results"])
assert parsed_output is not None
# Only the "results" entry of the parsed completion is post-processed
results_extraction = process_results(parsed_output["results"])
pprint(results_extraction)

# Last expression of the cell: its return value is what the notebook displays
validate_item_with_schema(results_extraction, results_schema)


[{'95% CI': [None, None],
  'P-value': None,
  'SE': None,
  'beta': None,
  'direction': 'decreases',
  'exposure': 'Alcohol consumption',
  'hazard ratio': None,
  'odds ratio': None,
  'outcome': "Parkinson's disease",
  'units': None},
 {'95% CI': [None, None],
  'P-value': None,
  'SE': None,
  'beta': None,
  'direction': 'decreases',
  'exposure': 'Alcohol consumption',
  'hazard ratio': None,
  'odds ratio': None,
  'outcome': 'prostate hyperplasia',
  'units': None},
 {'95% CI': [None, None],
  'P-value': None,
  'SE': None,
  'beta': None,
  'direction': 'decreases',
  'exposure': 'Alcohol consumption',
  'hazard ratio': None,
  'odds ratio': None,
  'outcome': 'rheumatoid arthritis',
  'units': None},
 {'95% CI': [None, None],
  'P-value': None,
  'SE': None,
  'beta': None,
  'direction': 'increases',
  'exposure': 'Alcohol consumption',
  'hazard ratio': None,
  'odds ratio': None,
  'outcome': 'chronic pancreatitis',
  'units': None},
 {'95% CI': [None, None],
  'P-value'

True

# Extraction of a small batch

In [None]:
# Number of abstracts in the batch. The abstracts live in `pubmed_data` (returned by
# get_config); the previous name `pubmed` is never defined in this notebook and only
# worked through leftover kernel state.
print(len(pubmed_data))


5


In [40]:
# Run the full extraction for every loaded abstract and save the batch as JSON.
# NOTE: slow -- the recorded run took about 9 minutes for 5 abstracts.
# The loop uses its own variable names so it does not rebind `article_data` / `output`
# from the single-case section, and results are appended one by one so that the
# finished items are still available in `fulldata` if a later call fails.
fulldata = []
for batch_article in tqdm(pubmed_data):
    batch_result = process_abstract(
        article_data=batch_article,
        schema_data=schema_data,
        client=client,
        model_config=config["model_config"],
    )
    fulldata.append(batch_result)

path_output = PROJECT_ROOT / "output" / "openai_batch.json"
# Make the cell runnable on its own: the output folder may not exist yet
path_output.parent.mkdir(parents=True, exist_ok=True)
with path_output.open("w") as f:
    json.dump(fulldata, f, indent=2)


100%|██████████| 5/5 [09:13<00:00, 110.79s/it]


## Schema validation

In [41]:
# Read the saved batch back from disk so that validation runs on the persisted JSON
input_path = PROJECT_ROOT.joinpath("output", "openai_batch.json")
output = json.loads(input_path.read_text())


In [43]:
# Validate every batch item against both schemas.
# validate_item_with_schema returns a boolean (the single-case cells display True),
# so its result has to be checked: previously it was discarded and a failing item
# went unnoticed. Failures are collected and reported together at the end.
validation_failures = []
for item_index, batch_item in enumerate(tqdm(output)):
    parsed_output = parsers.parse_json(batch_item["completion_metadata"])
    assert parsed_output is not None
    metadata_extraction = process_metadata(parsed_output)
    if not validate_item_with_schema(metadata_extraction, metadata_schema):
        validation_failures.append((item_index, "metadata"))

    parsed_output = parsers.parse_json(batch_item["completion_results"])
    assert parsed_output is not None
    results_extraction = process_results(parsed_output["results"])
    if not validate_item_with_schema(results_extraction, results_schema):
        validation_failures.append((item_index, "results"))

assert not validation_failures, (
    f"Schema validation failed for (item index, section): {validation_failures}"
)


100%|██████████| 5/5 [00:00<00:00, 239.52it/s]
