In [1]:
!nvidia-smi

Sun May  4 14:57:18 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GH200 120GB             On  | 00000029:01:00.0 Off |                    0 |
| N/A   27C    P0              90W / 900W |     35MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import json
from pprint import pprint

import torch
from environs import env
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

from local_funcs import chat_funcs, prompt_funcs
from yiutils.project_utils import find_project_root

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Anchor all paths on the project root (presumably the directory holding the
# `justfile` marker — confirm against yiutils.find_project_root).
proj_root = find_project_root("justfile")
data_dir = proj_root / "data"

# Load settings from the project-level .env file. The call stays the last
# expression of the cell, so its return value is what the cell displays.
dotenv_path = proj_root / ".env"
env.read_env(dotenv_path)

True

In [4]:
# Sanity check before loading the model: is a CUDA device visible, and which
# CUDA version does this torch build report?
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")

True
12.6


# data load

In [5]:
# Location of the intermediate MR PubMed dataset (a JSON file under data_dir).
path_to_mr_pubmed_data = (
    data_dir / "intermediate" / "mr-pubmed-data" / "mr-pubmed-data.json"
)
# Fail early with a readable message if the file has not been produced yet.
assert path_to_mr_pubmed_data.exists(), (
    f"Data file {path_to_mr_pubmed_data} does not exist."
)

# Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII characters in abstracts.
with open(path_to_mr_pubmed_data, "r", encoding="utf-8") as f:
    mr_pubmed_data = json.load(f)

In [6]:
# Use the first record of the loaded dataset as the worked example; its `ab`
# (abstract) field feeds the prompt-building cells below.
article_data = mr_pubmed_data[0]
article_data

{'pmid': '38794754',
 'ab': "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo MR analyses across 76 distinct primary outcomes. Results show that a genetic predisposition to alcohol consumption, independent of smoking, significantly correlates with a decreased risk of Parkinson's disease, prostate hyperplasia, and rheumatoid arthritis. It was also associated with an increased risk of chronic 

# model

In [7]:
# Checkpoint to load and the runtime settings used for it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
access_token = env("HUGGINGFACE_TOKEN")  # taken from the environment, not hard-coded
device = "cuda"
dtype = torch.bfloat16

# Quantise the model weights to int4 through the quanto backend.
quantization_config = QuantoConfig(weights="int4")

# Tokenizer and model come from the same checkpoint and share the access token.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    quantization_config=quantization_config,
    torch_dtype=dtype,
    device_map=device,
)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.12s/it]


# Inference abstraction

## messages

In [8]:
# Build the chat messages for the metadata-extraction prompt from the abstract
# text, then pretty-print them for inspection.
message_metadata = prompt_funcs.make_message_metadata(article_data["ab"])
pprint(message_metadata)

[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

In [9]:
# Build the chat messages for the results-extraction prompt from the same
# abstract text, then pretty-print them for inspection.
message_results = prompt_funcs.make_message_results(article_data["ab"])
pprint(message_results)

[{'content': 'You are a data scientist responsible for extracting accurate '
             'information from research papers. You answer each question with '
             'a single JSON string.',
  'role': 'system'},
 {'content': '\n'
             '                This is an abstract from a Mendelian '
             'randomization study.\n'
             '                    "Alcohol consumption significantly impacts '
             'disease burden and has been linked to various diseases in '
             'observational studies. However, comprehensive meta-analyses '
             'using Mendelian randomization (MR) to examine drinking patterns '
             'are limited. We aimed to evaluate the health risks of alcohol '
             'use by integrating findings from MR studies. A thorough search '
             'was conducted for MR studies focused on alcohol exposure. We '
             'utilized two sets of instrumental variables-alcohol consumption '
             'and problematic alcoho

## completions

In [10]:
# Run the metadata prompt through the model and keep the raw completion.
# NOTE(review): the recorded stderr warns that no attention_mask / pad_token_id
# was set; presumably chat_funcs.extract calls generate() without them — verify there.
completion_metadata = chat_funcs.extract(message_metadata, tokenizer, model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [11]:
completion_metadata

'{\n    "exposures": [\n        {\n            "id": "1",\n            "trait": "Alcohol consumption",\n            "category": "behavioural"\n        },\n        {\n            "id": "2",\n            "trait": "Problematic alcohol use",\n            "category": "behavioural"\n        }\n    ],\n    "outcomes": [\n        {\n            "id": "1",\n            "trait": "Parkinson\'s disease",\n            "category": "disease of the nervous system"\n        },\n        {\n            "id": "2",\n            "trait": "Prostate hyperplasia",\n            "category": "disease of the genitourinary system"\n        },\n        {\n            "id": "3",\n            "trait": "Rheumatoid arthritis",\n            "category": "disease of the musculoskeletal system and connective tissue"\n        },\n        {\n            "id": "4",\n            "trait": "Chronic pancreatitis",\n            "category": "disease of the digestive system"\n        },\n        {\n            "id": "5",\n           

In [12]:
# Run the results prompt through the model and keep the raw completion.
# NOTE(review): the recorded stderr warns that no attention_mask / pad_token_id
# was set; presumably chat_funcs.extract calls generate() without them — verify there.
completion_results = chat_funcs.extract(message_results, tokenizer, model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


## cleaned results

In [13]:
# Convert each raw completion into a Python object (the recorded outputs show dicts).
# NOTE(review): presumably this parses the JSON emitted by the model — confirm
# in chat_funcs.clean_result, including how malformed output is handled.
result_metadata = chat_funcs.clean_result(completion_metadata)
result_results = chat_funcs.clean_result(completion_results)

In [14]:
result_metadata

{'exposures': [{'id': '1',
   'trait': 'Alcohol consumption',
   'category': 'behavioural'},
  {'id': '2', 'trait': 'Problematic alcohol use', 'category': 'behavioural'}],
 'outcomes': [{'id': '1',
   'trait': "Parkinson's disease",
   'category': 'disease of the nervous system'},
  {'id': '2',
   'trait': 'Prostate hyperplasia',
   'category': 'disease of the genitourinary system'},
  {'id': '3',
   'trait': 'Rheumatoid arthritis',
   'category': 'disease of the musculoskeletal system and connective tissue'},
  {'id': '4',
   'trait': 'Chronic pancreatitis',
   'category': 'disease of the digestive system'},
  {'id': '5', 'trait': 'Colorectal cancer', 'category': 'neoplasm'},
  {'id': '6', 'trait': 'Head and neck cancers', 'category': 'neoplasm'},
  {'id': '7',
   'trait': 'Alcoholic liver disease',
   'category': 'disease of the digestive system'},
  {'id': '8',
   'trait': 'Cirrhosis',
   'category': 'disease of the digestive system'},
  {'id': '9',
   'trait': 'Acute pancreatitis',

In [15]:
result_results

{'results': [{'exposure': 'alcohol consumption',
   'outcome': "Parkinson's disease",
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'prostate hyperplasia',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'rheumatoid arthritis',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'decreases'},
  {'exposure': 'alcohol consumption',
   'outcome': 'chronic pancreatitis',
   'beta': None,
   'units': None,
   'odds ratio': None,
   'hazard ratio': None,
   '95% CI': None,
   'SE': None,
   'P-value': None,
   'direction': 'increases'},
  {'exposure': 'al

# Insights

## model architecture

In [16]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (v_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (o_proj): QLinear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (up_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (down_proj): QLinear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (

In [17]:
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3-8B-Instruct', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<

## model output

In [19]:
# Render the metadata-extraction conversation with the tokenizer's chat
# template. `add_generation_prompt=True` asks the template to append the
# prompt for the assistant's turn, so generation starts a fresh reply.
messages = message_metadata
prompt_tensor = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
# Move the token ids onto the same device as the model before generating.
input_ids = prompt_tensor.to(model.device)
print(input_ids.shape)
input_ids

torch.Size([1, 1056])


tensor([[128000, 128006,   9125,  ...,  78191, 128007,    271]],
       device='cuda:0')

In [20]:
# Token ids that end generation. For this tokenizer `eos_token` is already
# `<|eot_id|>`, so both entries resolve to the same id; the explicit lookup
# keeps the cell correct if the tokenizer's EOS is a different token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
# NOTE: sampling is stochastic (do_sample=True); seed torch beforehand if the
# exact completion needs to be reproducible.
outputs = model.generate(
    input_ids,
    # Single, un-padded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly removes the "attention mask ... not set"
    # warning instead of relying on generate() to infer it.
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    eos_token_id=terminators,
    # Same value generate() previously fell back to; stating it avoids the
    # "Setting `pad_token_id` to `eos_token_id`" notice.
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.1,
    # top_p=0.15,
)
print(outputs.shape)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


torch.Size([1, 1525])


tensor([[128000, 128006,   9125,  ...,   4260,     92, 128009]],
       device='cuda:0')

In [24]:
# generate() returns prompt and completion as one sequence; slice off the
# prompt tokens so only the newly generated ids remain.
prompt_len = input_ids.shape[-1]
full_sequence = outputs[0]
response = full_sequence[prompt_len:]
print(full_sequence.shape)
print(prompt_len)
print(response.shape)
print(response)

torch.Size([1525])
1056
torch.Size([469])
tensor([   517,    262,    330,    327,    981,   1439,    794,   2330,    286,
           341,    310,    330,    307,    794,    330,     16,    761,    310,
           330,  30532,    794,    330,   2149,  11353,  15652,    761,    310,
           330,   5588,    794,    330,  30998,   9073,    278,    702,    286,
          1173,    286,    341,    310,    330,    307,    794,    330,     17,
           761,    310,    330,  30532,    794,    330,  32298,    780,  13200,
          1005,    761,    310,    330,   5588,    794,    330,  30998,   9073,
           278,    702,    286,    457,    262,   3291,    262,    330,    412,
          6716,    794,   2330,    286,    341,    310,    330,    307,    794,
           330,     16,    761,    310,    330,  30532,    794,    330,  64706,
         29973,    596,   8624,    761,    310,    330,   5588,    794,    330,
            67,  56407,    315,    279,  23418,   1887,    702,    286,   1173

In [25]:
# Turn the generated token ids back into text, dropping special tokens
# (e.g. the trailing end-of-turn marker) so only the JSON answer is printed.
res = tokenizer.decode(response, skip_special_tokens=True)
print(res)

{
    "exposures": [
        {
            "id": "1",
            "trait": "Alcohol consumption",
            "category": "behavioural"
        },
        {
            "id": "2",
            "trait": "Problematic alcohol use",
            "category": "behavioural"
        }
    ],
    "outcomes": [
        {
            "id": "1",
            "trait": "Parkinson's disease",
            "category": "disease of the nervous system"
        },
        {
            "id": "2",
            "trait": "Prostate hyperplasia",
            "category": "disease of the genitourinary system"
        },
        {
            "id": "3",
            "trait": "Rheumatoid arthritis",
            "category": "disease of the musculoskeletal system and connective tissue"
        },
        {
            "id": "4",
            "trait": "Chronic pancreatitis",
            "category": "disease of the digestive system"
        },
        {
            "id": "5",
            "trait": "Colorectal cancer",
       

In [27]:
# Decode the whole sequence (prompt + completion) with special tokens kept,
# which exposes the chat-template markers the model actually saw.
tokenizer.decode(outputs[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a data scientist responsible for extracting accurate information from research papers. You answer each question with a single JSON string.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThis is an abstract from a Mendelian randomization study.\n                    "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo M