In [None]:
import json
from pprint import pprint

import torch
import transformers
from environs import env
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig, AutoProcessor, Llama4ForConditionalGeneration

from local_funcs import chat_funcs, prompt_funcs
from yiutils.project_utils import find_project_root

proj_root = find_project_root("justfile")
data_dir = proj_root / "data"

print(transformers.__version__)
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

env.read_env(proj_root / ".env")
access_token = env("HUGGINGFACE_TOKEN")

path_to_mr_pubmed_data = (
    data_dir / "intermediate" / "mr-pubmed-data" / "mr-pubmed-data.json"
)
assert path_to_mr_pubmed_data.exists(), (
    f"Data file {path_to_mr_pubmed_data} does not exist."
)

with open(path_to_mr_pubmed_data, "r") as f:
    mr_pubmed_data = json.load(f)

article_data = mr_pubmed_data[0]


  from .autonotebook import tqdm as notebook_tqdm


4.51.3
2.6.0
True
12.6


In [2]:
MODEL_ID = "deepseek-ai/DeepSeek-Prover-V2-7B"

device = "cuda"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map=device,
    token=access_token,
)

`rope_scaling`'s factor field must be a float >= 1, got 16
`rope_scaling`'s beta_fast field must be a float, got 32
`rope_scaling`'s beta_slow field must be a float, got 1
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.04s/it]


In [33]:
message_metadata = prompt_funcs.make_message_metadata(article_data["ab"])
message_results = prompt_funcs.make_message_results(article_data["ab"])

In [34]:
# messages = message_metadata
messages = message_results
input_ids = tokenizer.apply_chat_template(
    conversation=messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
print(input_ids.shape)
input_ids

torch.Size([1, 876])


tensor([[100000,   2054,    418,    245,   1191,  38200,   8872,    327,  46277,
          10356,   1757,    473,   3977,  10142,     13,   1257,   3510,   1319,
           2512,    366,    245,   2816,  11573,   2662,     13, 100006,    185,
            903,   1002,    317,    274,  12940,    473,    245,  36520,  27313,
           5236,   2035,   3437,     13,    185,   1457,    440,   2029,  12297,
          14934,   9812,  26468,   8666,  18953,    285,    643,    803,  12115,
            276,   3947,  18969,    279,  38795,   7535,     13,   3159,     11,
          13862,  17147,     12,  36372,  84810,   1244,  36520,  27313,   5236,
           2035,    334,  15285,      8,    276,  16095,  16306,  11817,    418,
           6415,     13,   1003,  21807,    276,  13782,    254,   2802,  16418,
            280,  14667,    938,    457,  33104,  18107,    473,  26455,   7535,
             13,    338,  11323,   3365,    438,  13185,    327,  26455,   7535,
          11554,    331,  14

In [35]:
terminators = [
    tokenizer.eos_token_id,
    # tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = model.generate(
    input_ids,
    max_new_tokens=2048,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.1,
    # top_p=0.15,
)
print(outputs.shape)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


torch.Size([1, 2374])


tensor([[100000,   2054,    418,  ...,    185,  10897, 100001]],
       device='cuda:0')

In [36]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'You are a data scientist responsible for extracting accurate information from research papers. You answer each question with a single JSON string.<｜User｜>\n                This is an abstract from a Mendelian randomization study.\n                    "Alcohol consumption significantly impacts disease burden and has been linked to various diseases in observational studies. However, comprehensive meta-analyses using Mendelian randomization (MR) to examine drinking patterns are limited. We aimed to evaluate the health risks of alcohol use by integrating findings from MR studies. A thorough search was conducted for MR studies focused on alcohol exposure. We utilized two sets of instrumental variables-alcohol consumption and problematic alcohol use-and summary statistics from the FinnGen consortium R9 release to perform de novo MR analyses. Our meta-analysis encompassed 64 published and 151 de novo MR analyses across 76 distinct primary outcomes. Results show that a genetic predisposition 

In [37]:
response = outputs[0][input_ids.shape[-1] :]
result = tokenizer.decode(response, skip_special_tokens=True)
print(result)

### Step 1: Extract the Results from the Abstract

First, we need to extract all the relevant information from the abstract. The abstract provides a detailed description of a Mendelian randomization study, including the results from a meta-analysis of MR studies. We are tasked with extracting the results in a specific format.

### Step 2: Identify the Results

The abstract mentions several results, such as:
- Alcohol consumption significantly impacts disease burden.
- A genetic predisposition to alcohol consumption is associated with decreased risks of Parkinson's disease, prostate hyperplasia, and rheumatoid arthritis, and increased risks of chronic pancreatitis, colorectal cancer, and head and neck cancers.
- A genetic predisposition to problematic alcohol use is associated with increased risks of alcoholic liver disease, cirrhosis, chronic pancreatitis, and pneumonia.

### Step 3: Extract the Key Information

For each result, we need to extract the following information:
1. Exposure