In [1]:
import json
from pprint import pprint

from environs import env
from local_funcs import prompts
from yiutils.project_utils import find_project_root

In [3]:
# Anchor every path at the project root returned by find_project_root().
proj_root = find_project_root()

# Fail fast with a readable message if the .env file is absent.
path_to_env = proj_root / ".env"
assert path_to_env.exists(), f"File not found: {path_to_env}"

# Load the variables declared in .env, then read the OpenAI key through
# environs so the key itself never appears in the notebook source.
env.read_env(path_to_env)
openai_api_key = env("OPENAI_API_KEY")

# Locate the PubMed abstracts JSON under data/. The directory and the file are
# checked separately so the assertion message says which one is missing.
path_to_data_dir = proj_root / "data"
assert path_to_data_dir.exists(), f"Directory not found: {path_to_data_dir}"
path_to_data = path_to_data_dir / "mr-pubmed-abstracts" / "data" / "pubmed.json"
assert path_to_data.exists(), f"File not found: {path_to_data}"

In [4]:
# NOTE(review): this import sits outside the notebook's first import cell;
# consider moving it there so all dependencies are visible in one place.
from openai import OpenAI

# One client object, authenticated with the key read from .env, shared by the
# request cells in this notebook.
client = OpenAI(api_key=openai_api_key)

In [5]:
# Parse the whole PubMed JSON file in one go. Indexing with [0] below relies on
# the top-level JSON value being a sequence of records.
data = json.loads(path_to_data.read_text(encoding="utf-8"))

# Show the first record as a quick sanity check of the record layout.
print(data[0])

{'pmid': '38794754', 'ab': "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo MR analyses across 76 distinct primary outcomes. Results show that a genetic predisposition to alcohol consumption, independent of smoking, significantly correlates with a decreased risk of Parkinson's disease, prostate hyperplasia, and rheumatoid arthritis. It was also associated with an increased risk of chronic p

# reasoning model snippet

In [16]:
# Toy coding task used to exercise the reasoning model.
prompt = """
Write a bash script that takes a matrix represented as a string with 
format '[1,2],[3,4],[5,6]' and prints the transpose in the same format.
"""

# Single-turn request: one user message, low reasoning effort.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="o3-mini",
    reasoning_effort="low",
)

# Print only the text of the first returned choice.
print(response.choices[0].message.content)

Below is one solution in Bash. Save it as, for example, transpose.sh, make it executable (chmod +x transpose.sh), and then run it with the matrix string as an argument.

──────────────────────────────
#!/bin/bash
# This script expects a single argument containing the matrix,
# e.g.: '[1,2],[3,4],[5,6]'
#
# It prints the transpose of the matrix in the same format.

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 \"[1,2],[3,4],[5,6]\""
  exit 1
fi

input="$1"

# Replace "],[" with a newline, and remove leading/trailing brackets.
# This produces one row per line.
rows=$(echo "$input" | sed 's/],[/\n/g; s/^\[//; s/\]$//')

# Read rows into an array.
mapfile -t rowLines <<< "$rows"

# Parse the rows into a 2D array (stored in an associative array with keys "row,col").
declare -A matrixArr
numRows=${#rowLines[@]}
numCols=0

for i in "${!rowLines[@]}"; do
  # Split the row by commas.
  IFS=',' read -ra nums <<< "${rowLines[$i]}"
  # Save the number of columns from the first row.
  if [ "$i" -eq 0 ]

In [12]:
# Display the full response object to see everything the API returned.
response

ChatCompletion(id='chatcmpl-BKAEVIezj1AFaU01redu09uPuXl8a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='#!/bin/bash\n# This script accepts a matrix represented as a string in the format:\n#   [1,2],[3,4],[5,6]\n# and prints its transpose in the same format.\n#\n# Example:\n#   Input:  [1,2],[3,4],[5,6]\n#   Output: [1,3,5],[2,4,6]\n#\n# Usage:\n#   ./transpose.sh "[1,2],[3,4],[5,6]"\n\n# Check if an argument was provided\nif [ "$#" -ne 1 ]; then\n  echo "Usage: $0 \\"[1,2],[3,4],[5,6]\\""\n  exit 1\nfi\n\ninput="$1"\n\n# Preprocess input:\n#   - Remove the starting \'[\' and trailing \']\' from the whole string.\n#   - Replace "],[" with a newline so that each row becomes a separate line.\nrows=$(echo "$input" | sed \'s/^\\[//; s/\\]$//; s/\\],\\[/\\n/g\')\n\n# We\'ll store the matrix values in an associative array where the key is "row,column"\ndeclare -A matrix\nnrows=0\nncols=0\n\n# Process each row.\nwhile IFS= read -r line; 

In [14]:
# Count how many choices the response carries.
len(response.choices)

1

In [15]:
# Look at the message object of the first choice.
response.choices[0].message

ChatCompletionMessage(content='#!/bin/bash\n# This script accepts a matrix represented as a string in the format:\n#   [1,2],[3,4],[5,6]\n# and prints its transpose in the same format.\n#\n# Example:\n#   Input:  [1,2],[3,4],[5,6]\n#   Output: [1,3,5],[2,4,6]\n#\n# Usage:\n#   ./transpose.sh "[1,2],[3,4],[5,6]"\n\n# Check if an argument was provided\nif [ "$#" -ne 1 ]; then\n  echo "Usage: $0 \\"[1,2],[3,4],[5,6]\\""\n  exit 1\nfi\n\ninput="$1"\n\n# Preprocess input:\n#   - Remove the starting \'[\' and trailing \']\' from the whole string.\n#   - Replace "],[" with a newline so that each row becomes a separate line.\nrows=$(echo "$input" | sed \'s/^\\[//; s/\\]$//; s/\\],\\[/\\n/g\')\n\n# We\'ll store the matrix values in an associative array where the key is "row,column"\ndeclare -A matrix\nnrows=0\nncols=0\n\n# Process each row.\nwhile IFS= read -r line; do\n  # Split the row elements by comma.\n  IFS=\',\' read -ra values <<< "$line"\n  # Set ncols based on the first row read.\n  i

# Reasoning model trial: metadata extraction from an MR abstract

In [13]:
def generate_message(abstract):
    """Build the chat messages used to query the model about one abstract.

    Args:
        abstract: Record whose ``"ab"`` entry holds the abstract text; it is
            read as ``abstract["ab"]``.

    Returns:
        A four-item list: a system message, a user message quoting the
        abstract, then ``prompts.metadataexample`` and
        ``prompts.metadataprompt`` unchanged (presumably message dicts as
        well -- confirm against the ``prompts`` module).
    """
    system_message = {
        "role": "system",
        "content": "You are a data scientist responsible for extracting accurate information from research papers. You answer each question with a single JSON string.",
    }
    # The whitespace inside this literal is part of the text sent to the model,
    # so it is kept exactly as written.
    user_message = {
        "role": "user",
        "content": f"""
                This is an abstract from a Mendelian randomization study.
                    "{abstract["ab"]}"   """,
    }
    return [
        system_message,
        user_message,
        prompts.metadataexample,
        prompts.metadataprompt,
    ]

# Preview the messages built for the first record.
pprint(generate_message(data[0]))

[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

In [7]:
# Send the messages built for the first record to o3-mini with low reasoning
# effort. NOTE: this reuses the name `response`, which the bash-script request
# cell also assigns, so whichever cell ran last determines its value.
response = client.chat.completions.create(
    model="o3-mini",
    reasoning_effort="low",
    messages=generate_message(data[0]),
)

In [10]:
# List the attribute names available on the first choice's message object.
dir(response.choices[0].message)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_computed_fields__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_setattr_handl

In [8]:
# Print the reply text of the first choice.
print(response.choices[0].message.content)

{
  "metadata": {
    "exposures": [
      {
        "id": "1",
        "trait": "Alcohol consumption",
        "category": "behavioural"
      },
      {
        "id": "2",
        "trait": "Problematic alcohol use",
        "category": "behavioural"
      }
    ],
    "outcomes": [
      {
        "id": "1",
        "trait": "Parkinson's disease",
        "category": "disease of the nervous system"
      },
      {
        "id": "2",
        "trait": "Prostate hyperplasia",
        "category": "disease of the genitourinary system"
      },
      {
        "id": "3",
        "trait": "Rheumatoid arthritis",
        "category": "disease of the musculoskeletal system and connective tissue"
      },
      {
        "id": "4",
        "trait": "Chronic pancreatitis",
        "category": "disease of the digestive system"
      },
      {
        "id": "5",
        "trait": "Colorectal cancer",
        "category": "neoplasm"
      },
      {
        "id": "6",
        "trait": "Head and nec