In [2]:
%load_ext autoreload
%autoreload 2
import json
from pprint import pprint

from environs import env
from openai import OpenAI
from tqdm import tqdm

from local_funcs import prompt_templates, openai_funcs
from yiutils.project_utils import find_project_root


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Resolve the project root, using the justfile as the anchor file.
proj_root = find_project_root(anchor_file="justfile")

# Secrets live in a .env file at the project root; fail early if it is missing.
path_to_env = proj_root / ".env"
assert path_to_env.exists(), f"File not found: {path_to_env}"

env.read_env(path_to_env)
openai_api_key = env("OPENAI_API_KEY")

# Input data: the MR PubMed abstracts stored as a single JSON file.
path_to_data_dir = proj_root / "data"
assert path_to_data_dir.exists(), f"Data directory not found: {path_to_data_dir}"
path_to_data = (
    path_to_data_dir / "intermediate" / "mr-pubmed-data" / "mr-pubmed-data.json"
)
assert path_to_data.exists(), f"File not found: {path_to_data}"

# Results are written here at the end of the notebook; the directory must already exist.
output_dir = proj_root / "output"
assert output_dir.exists(), f"Output directory not found: {output_dir}"


In [4]:
# Read the whole JSON file as UTF-8 text and parse it in one step.
data = json.loads(path_to_data.read_text(encoding="utf-8"))

# Peek at the first record as a quick sanity check.
print(data[0])


{'pmid': '38794754', 'ab': "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo MR analyses across 76 distinct primary outcomes. Results show that a genetic predisposition to alcohol consumption, independent of smoking, significantly correlates with a decreased risk of Parkinson's disease, prostate hyperplasia, and rheumatoid arthritis. It was also associated with an increased risk of chronic p

In [5]:
# OpenAI API client, authenticated with the key read from the project's .env file.
client = OpenAI(api_key=openai_api_key)


# Model snippet


In [10]:
# Toy prompt (a generic coding task) used to try out the API; it does not use the abstracts.
prompt = """
Write a bash script that takes a matrix represented as a string with 
format '[1,2],[3,4],[5,6]' and prints the transpose in the same format.
"""

# Single-turn Chat Completions request to o3-mini with low reasoning effort.
response = client.chat.completions.create(
    model="o3-mini",
    reasoning_effort="low",
    messages=[{"role": "user", "content": prompt}],
)

# Print the text content of the first returned choice.
print(response.choices[0].message.content)


Below is one solution in bash. Save the script (for example as transpose.sh), make it executable (chmod +x transpose.sh), and pass the matrix string as the first argument.

Note that this script assumes that the input matrix is rectangular and that rows are formatted as in the example: each row is enclosed in square brackets and rows are comma‐separated, e.g.,

  [1,2],[3,4],[5,6]

Here is the script:

------------------------------------------------
#!/bin/bash
# Usage: ./transpose.sh "[1,2],[3,4],[5,6]"
#
# This script takes a matrix string in the format "[1,2],[3,4],[5,6]"
# and prints its transpose in the same format.

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 \"[1,2],[3,4],[5,6]\""
    exit 1
fi

# Get the matrix string from the first argument.
matrix="$1"

# Remove any whitespace
matrix=$(echo "$matrix" | tr -d '[:space:]')

# The input is a list of rows like: [1,2],[3,4],[5,6]
# We can split on "],"
# First, remove any leading '[' and trailing ']' so that split is easier.
matr

In [11]:
# Display the full ChatCompletion object to see its structure.
response


ChatCompletion(id='chatcmpl-BhIOaceqpHrmVbN8cRPhi1VdrUY4b', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Below is one solution in bash. Save the script (for example as transpose.sh), make it executable (chmod +x transpose.sh), and pass the matrix string as the first argument.\n\nNote that this script assumes that the input matrix is rectangular and that rows are formatted as in the example: each row is enclosed in square brackets and rows are comma‐separated, e.g.,\n\n\u2003\u2003[1,2],[3,4],[5,6]\n\nHere is the script:\n\n------------------------------------------------\n#!/bin/bash\n# Usage: ./transpose.sh "[1,2],[3,4],[5,6]"\n#\n# This script takes a matrix string in the format "[1,2],[3,4],[5,6]"\n# and prints its transpose in the same format.\n\nif [ "$#" -ne 1 ]; then\n    echo "Usage: $0 \\"[1,2],[3,4],[5,6]\\""\n    exit 1\nfi\n\n# Get the matrix string from the first argument.\nmatrix="$1"\n\n# Remove any whitespace\nmatr

In [12]:
# Number of choices contained in the response.
len(response.choices)


1

In [13]:
# Message object of the first choice; its .content attribute holds the generated text.
response.choices[0].message


ChatCompletionMessage(content='Below is one solution in bash. Save the script (for example as transpose.sh), make it executable (chmod +x transpose.sh), and pass the matrix string as the first argument.\n\nNote that this script assumes that the input matrix is rectangular and that rows are formatted as in the example: each row is enclosed in square brackets and rows are comma‐separated, e.g.,\n\n\u2003\u2003[1,2],[3,4],[5,6]\n\nHere is the script:\n\n------------------------------------------------\n#!/bin/bash\n# Usage: ./transpose.sh "[1,2],[3,4],[5,6]"\n#\n# This script takes a matrix string in the format "[1,2],[3,4],[5,6]"\n# and prints its transpose in the same format.\n\nif [ "$#" -ne 1 ]; then\n    echo "Usage: $0 \\"[1,2],[3,4],[5,6]\\""\n    exit 1\nfi\n\n# Get the matrix string from the first argument.\nmatrix="$1"\n\n# Remove any whitespace\nmatrix=$(echo "$matrix" | tr -d \'[:space:]\')\n\n# The input is a list of rows like: [1,2],[3,4],[5,6]\n# We can split on "],"\n# Fir

# Setting up

In [None]:
def generate_message(abstract):
    """Assemble the chat messages used to query a model about one abstract.

    Args:
        abstract: Record whose ``"ab"`` entry holds the abstract text.

    Returns:
        A list of four messages, in order: the system instruction, a user
        message quoting the abstract, ``prompt_templates.metadataexample``
        and ``prompt_templates.metadataprompt``.
    """
    system_message = {
        "role": "system",
        "content": "You are a data scientist responsible for extracting accurate information from research papers. You answer each question with a single JSON string.",
    }

    # The leading newline, the indentation and the trailing spaces are part of
    # the prompt text itself and are kept exactly as they are sent to the model.
    abstract_content = (
        "\n"
        "                This is an abstract from a Mendelian randomization study.\n"
        f'                    "{abstract["ab"]}"   '
    )
    user_message = {"role": "user", "content": abstract_content}

    return [
        system_message,
        user_message,
        prompt_templates.metadataexample,
        prompt_templates.metadataprompt,
    ]


# Preview the messages built for the first record.
pprint(generate_message(data[0]))


[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

# Single case

## o4-mini


In [33]:
# Query o4-mini through the Responses API with medium reasoning effort.
# (An earlier draft used client.chat.completions.create with
# reasoning_effort="medium" and messages=generate_message(data[0]) instead.)
response = client.responses.create(
    model="o4-mini",
    input=generate_message(data[0]),
    reasoning={"effort": "medium"},
)


### Response check

In [34]:
# Exploratory: list the attributes available on the response object.
dir(response)


['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_computed_fields__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_setattr_handl

In [35]:
# Print the text output of the o4-mini response.
print(response.output_text)


{"metadata":{"exposures":[{"id":"1","trait":"Alcohol consumption","category":"behavioural"},{"id":"2","trait":"Problematic alcohol use","category":"behavioural"}],"outcomes":[{"id":"1","trait":"Parkinson's disease","category":"disease of the nervous system"},{"id":"2","trait":"Prostate hyperplasia","category":"disease of the genitourinary system"},{"id":"3","trait":"Rheumatoid arthritis","category":"disease of the musculoskeletal system and connective tissue"},{"id":"4","trait":"Chronic pancreatitis","category":"disease of the digestive system"},{"id":"5","trait":"Colorectal cancer","category":"neoplasm"},{"id":"6","trait":"Head and neck cancers","category":"neoplasm"},{"id":"7","trait":"Alcoholic liver disease","category":"disease of the digestive system"},{"id":"8","trait":"Cirrhosis","category":"disease of the digestive system"},{"id":"9","trait":"Acute pancreatitis","category":"disease of the digestive system"},{"id":"10","trait":"Pneumonia","category":"infectious disease"}],"metho

In [36]:
# Show the reasoning settings reported back on the response.
print(response.reasoning)


Reasoning(effort='medium', generate_summary=None, summary=None)


## gpt-4o

In [None]:
# Send the same messages to gpt-4o through the Responses API (no reasoning option is passed).
response = client.responses.create(
    model="gpt-4o",
    input=generate_message(data[0]),
)


In [38]:
# Print the text output of the gpt-4o response.
print(response.output_text)


{
  "metadata": {
    "exposures": [
      {
        "id": "1",
        "trait": "Alcohol consumption",
        "category": "behavioural"
      },
      {
        "id": "2",
        "trait": "Problematic alcohol use",
        "category": "behavioural"
      }
    ],
    "outcomes": [
      {
        "id": "1",
        "trait": "Parkinson's disease",
        "category": "disease of the nervous system"
      },
      {
        "id": "2",
        "trait": "Prostate hyperplasia",
        "category": "disease of the genitourinary system"
      },
      {
        "id": "3",
        "trait": "Rheumatoid arthritis",
        "category": "disease of the musculoskeletal system and connective tissue"
      },
      {
        "id": "4",
        "trait": "Chronic pancreatitis",
        "category": "disease of the digestive system"
      },
      {
        "id": "5",
        "trait": "Colorectal cancer",
        "category": "neoplasm"
      },
      {
        "id": "6",
        "trait": "Head and nec

# 10-doc batch

In [6]:
# Number of records taken from the start of the dataset for the trial batch.
BATCH_SIZE = 10

# Total number of records available.
print(len(data))

# Keep only the first BATCH_SIZE records so the trial run stays small.
data_batch = data[:BATCH_SIZE]


15635


In [7]:
# Call openai_funcs.get_o4_mini_result once per record in data_batch, sequentially,
# showing a tqdm progress bar. The recorded run took about 2m40s for 10 records.
result_batch_o4_mini = [
    openai_funcs.get_o4_mini_result(
        client=client,
        abstract=abstract,
    )
    for abstract in tqdm(data_batch, desc="Processing batch")
]


Processing batch: 100%|██████████| 10/10 [02:40<00:00, 16.09s/it]


In [8]:
# Call openai_funcs.get_gpt_4o_result once per record in data_batch, sequentially,
# showing a tqdm progress bar. The recorded run took about 43s for 10 records.
result_gpt_4o = [
    openai_funcs.get_gpt_4o_result(
        client=client,
        abstract=abstract,
    )
    for abstract in tqdm(data_batch, desc="Processing batch")
]


Processing batch: 100%|██████████| 10/10 [00:43<00:00,  4.33s/it]


In [9]:
# Inspect the first o4-mini result of the batch.
print(result_batch_o4_mini[0])


{
  "metadata": {
    "exposures": [
      {
        "id": "1",
        "trait": "Alcohol consumption",
        "category": "behavioural"
      },
      {
        "id": "2",
        "trait": "Problematic alcohol use",
        "category": "behavioural"
      }
    ],
    "outcomes": [
      {
        "id": "1",
        "trait": "Parkinson's disease",
        "category": "disease of the nervous system"
      },
      {
        "id": "2",
        "trait": "Prostate hyperplasia",
        "category": "disease of the genitourinary system"
      },
      {
        "id": "3",
        "trait": "Rheumatoid arthritis",
        "category": "disease of the musculoskeletal system and connective tissue"
      },
      {
        "id": "4",
        "trait": "Chronic pancreatitis",
        "category": "disease of the digestive system"
      },
      {
        "id": "5",
        "trait": "Colorectal cancer",
        "category": "neoplasm"
      },
      {
        "id": "6",
        "trait": "Head and nec

In [None]:
# Write each model's batch results to its own JSON file in the output directory.
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them.
for file_name, results in (
    ("sample_results_o4_mini.json", result_batch_o4_mini),
    ("sample_results_gpt_4o.json", result_gpt_4o),
):
    output_path = output_dir / file_name
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
