In [1]:
# Shell escape: report the GPU, driver and CUDA versions visible on this host.
!nvidia-smi

Sat May  3 16:42:43 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GH200 120GB             On  | 00000009:01:00.0 Off |                    0 |
| N/A   25C    P0              92W / 900W |     23MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
import json
from pprint import pprint

import torch
from environs import env
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

from local_funcs import chat_funcs, prompt_funcs
from yiutils.project_utils import find_project_root

In [6]:
# Anchor every path on the repository root, located via its "justfile" marker.
proj_root = find_project_root("justfile")
data_dir = proj_root.joinpath("data")

# Load settings from the project's .env file; the call's return value is the
# cell output.
env.read_env(proj_root.joinpath(".env"))

True

In [20]:
# Sanity check: CUDA availability first, then the CUDA version reported by
# torch, one value per line.
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")

True
12.6


# data load

In [None]:
# Location of the MR PubMed JSON file under the project data directory.
path_to_mr_pubmed_data = (
    data_dir / "intermediate" / "mr-pubmed-data" / "mr-pubmed-data.json"
)
# Fail early with a readable message if the input file is missing.
assert path_to_mr_pubmed_data.exists(), (
    f"Data file {path_to_mr_pubmed_data} does not exist."
)

# Pin the encoding: without it `open` falls back to the platform locale, which
# can mis-decode non-ASCII characters in the JSON on some systems.
with open(path_to_mr_pubmed_data, "r", encoding="utf-8") as f:
    mr_pubmed_data = json.load(f)

In [18]:
# Use the first record as the working example for the rest of the notebook and
# display it to inspect its fields (e.g. `pmid`, `ab`).
article_data = mr_pubmed_data[0]
article_data

{'pmid': '38794754',
 'ab': "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo MR analyses across 76 distinct primary outcomes. Results show that a genetic predisposition to alcohol consumption, independent of smoking, significantly correlates with a decreased risk of Parkinson's disease, prostate hyperplasia, and rheumatoid arthritis. It was also associated with an increased risk of chronic 

# model

In [11]:
# Model identity, target device and compute dtype.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
device = "cuda"
dtype = torch.bfloat16

# Hugging Face access token, read through `env` rather than hard-coded here.
access_token = env("HUGGINGFACE_TOKEN")

# Quantize the model weights to int4.
quantization_config = QuantoConfig(weights="int4")

# Tokenizer and model are fetched from the same checkpoint with the same token.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    device_map=device,
    torch_dtype=dtype,
    quantization_config=quantization_config,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


# Inference

In [None]:
# Build the chat messages for the metadata prompt from the abstract text (`ab`)
# of the selected article, then pretty-print them for inspection.
# The printed output shows a list of dicts with `content` and `role` keys.
message_metadata = prompt_funcs.make_message_metadata(article_data["ab"])
pprint(message_metadata)

[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

In [None]:
# Build the chat messages for the results prompt from the same abstract text
# (`ab`), then pretty-print them for inspection.
# The printed output shows a list of dicts with `content` and `role` keys.
message_results = prompt_funcs.make_message_results(article_data["ab"])
pprint(message_results)

[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

In [19]:
# Run the model on the metadata messages and keep the raw completion.
# NOTE(review): the warnings emitted by this cell say no attention mask / pad
# token id reached generation -- presumably `chat_funcs.extract` should pass
# them; confirm in that module.
completion_metadata = chat_funcs.extract(message_metadata, tokenizer, model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [21]:
# Inspect the raw completion; the displayed value is a JSON-formatted string.
completion_metadata

'{\n    "exposures": [\n        {\n            "id": "1",\n            "trait": "Alcohol consumption",\n            "category": "behavioural"\n        },\n        {\n            "id": "2",\n            "trait": "Problematic alcohol use",\n            "category": "behavioural"\n        }\n    ],\n    "outcomes": [\n        {\n            "id": "1",\n            "trait": "Parkinson\'s disease",\n            "category": "disease of the nervous system"\n        },\n        {\n            "id": "2",\n            "trait": "Prostate hyperplasia",\n            "category": "disease of the genitourinary system"\n        },\n        {\n            "id": "3",\n            "trait": "Rheumatoid arthritis",\n            "category": "disease of the musculoskeletal system and connective tissue"\n        },\n        {\n            "id": "4",\n            "trait": "Chronic pancreatitis",\n            "category": "disease of the digestive system"\n        },\n        {\n            "id": "5",\n           

In [22]:
# Run the model on the results messages and keep the raw completion.
# NOTE(review): the same attention-mask / pad-token warning is emitted here --
# presumably to be addressed inside `chat_funcs.extract`; confirm.
completion_results = chat_funcs.extract(message_results, tokenizer, model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [23]:
# Turn each raw completion into a Python object; the cells below display the
# results as dicts.
result_metadata = chat_funcs.clean_result(completion_metadata)
result_results = chat_funcs.clean_result(completion_results)

In [25]:
# Inspect the cleaned metadata: a dict with `exposures` and `outcomes` lists.
result_metadata

{'exposures': [{'id': '1',
   'trait': 'Alcohol consumption',
   'category': 'behavioural'},
  {'id': '2', 'trait': 'Problematic alcohol use', 'category': 'behavioural'}],
 'outcomes': [{'id': '1',
   'trait': "Parkinson's disease",
   'category': 'disease of the nervous system'},
  {'id': '2',
   'trait': 'Prostate hyperplasia',
   'category': 'disease of the genitourinary system'},
  {'id': '3',
   'trait': 'Rheumatoid arthritis',
   'category': 'disease of the musculoskeletal system and connective tissue'},
  {'id': '4',
   'trait': 'Chronic pancreatitis',
   'category': 'disease of the digestive system'},
  {'id': '5', 'trait': 'Colorectal cancer', 'category': 'neoplasm'},
  {'id': '6', 'trait': 'Head and neck cancers', 'category': 'neoplasm'},
  {'id': '7',
   'trait': 'Alcoholic liver disease',
   'category': 'disease of the liver'},
  {'id': '8', 'trait': 'Cirrhosis', 'category': 'disease of the liver'},
  {'id': '9',
   'trait': 'Acute pancreatitis',
   'category': 'disease of 

In [26]:
# Inspect the cleaned results: a dict whose `results` key lists one entry per
# exposure/outcome pair.
result_results

{'results': [{'exposure': 'alcohol consumption',
   'outcome': "Parkinson's disease",
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'prostate hyperplasia',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'rheumatoid arthritis',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'chronic pancreatitis',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'increases'},
  {'exposure': 'al