In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline



import glob
import torch
import json
import os
from tqdm import tqdm
from datasets import load_dataset
import transformers
from vllm import LLM, SamplingParams
import argparse

In [None]:
"""

export CUDA_VISIBLE_DEVICES=7
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline
model_id="meta-llama/Meta-Llama-3.1-70B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="auto")

"""

In [2]:

model = None


def gen_problem(record):
    comp_name = record["CompName"]
    smiles = record["SMILES"]
    unit = record["unit"]
    property_name = record["Property"]
    q = f"""Predict the {property_name} {unit} of the following compound. 
    #Restriction: Output can contain "#Reason" section which explains the reasoning behind the prediction. Output must contain "#Prediction" section which contains only the predicted value (number only).
    #Name: {comp_name}
    #SMILES: {smiles}"""
    actual_value = record["Value"]

    # chat = [
    #    {"role": "user", "content": q},
    # ]

    # prompt = tokenizer.apply_chat_template(chat, tokenize=False,)
    # assist_prefix = "assistant\n\n"
    # prompt += assist_prefix

    return q, actual_value

# %%


def extract_answer_from_text(text):
    target_line = text.split("#Prediction")[-1].strip()
    if target_line.find("\n"):
        target_line = target_line.split("\n")[0].strip()
    noise = [" ", "\t", "#"]
    for n in noise:
        target_line = target_line.replace(n, "")
    return target_line


def llm_gen(pipe, q_list):
    a_list = []
    for q in tqdm(q_list):
        messages = [
            {"role": "user", "content": q},
        ]
        prompt = pipe.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)

        outputs = pipe(prompt, max_new_tokens=1024,
                       temperature=0.0, do_sample=False)
        out_text = outputs[0]["generated_text"].lstrip(prompt)
        a_list.append(out_text)
    return a_list


# get model dir
#model_dir_list = glob.glob(f"{args.checkpoint_dir}/checkpoint-*")
checkpoint_dir="/data/hatakeyama/self-loop/0924split_train_test/sftlab/experiments/lora_llama_all/1/output3/sftlab-experiments/lora_llama_all/1-llama3_1_8b_4_lora_full-zero1"
model_dir_list = glob.glob(f"{checkpoint_dir}/checkpoint-*")


##if len(model_dir_list) == 0 and model_id != "":
#    model_dir_list = [model_id]
mode="test"

print("model_dir_list", model_dir_list)


for model_id in model_dir_list:
    del model
    torch.cuda.empty_cache()
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True, device_map="auto")
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
    pipe = transformers.pipeline(
        'text-generation', model=model, tokenizer=tokenizer)

    # %%

    ds = load_dataset("kanhatakeyama/material-properties", split="train")
    ds = ds.shuffle(seed=1)

    # %%

    # %%
    predictions = []
    batch_size = 10
    train_ds = ds.select(range(7000))
    test_ds = ds.select(range(7000, 7200))

    if mode == "test":
        target_ds = test_ds
    else:
        target_ds = train_ds

    for i in tqdm(range(0, len(target_ds), batch_size)):
        max_i = min(i+batch_size, len(target_ds))
        batch = target_ds.select(range(i, max_i))
        prompts = []
        for record in batch:
            prompt, actual_value = gen_problem(record)
            prompts.append(prompt)

        outputs = llm_gen(pipe, prompts)
        for record, output in zip(batch, outputs):
            predicted_value = extract_answer_from_text(output)
            record["predicted_value"] = predicted_value
            record["predicted_text"] = output
            predictions.append(record)


        break

    # %%
    os.makedirs(out_path, exist_ok=True)
    save_path = f"{out_path}/{model_id.replace('/','_')}_{mode}.json"
    with open(save_path, "w") as f:
        json.dump(predictions, f, indent=2)


model_dir_list ['/data/hatakeyama/self-loop/0924split_train_test/sftlab/experiments/lora_llama_all/1/output3/sftlab-experiments/lora_llama_all/1-llama3_1_8b_4_lora_full-zero1/checkpoint-97']


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:47<00:00,  4.78s/it]
  0%|          | 0/20 [00:47<?, ?it/s]


NameError: name 'out_path' is not defined

In [3]:
predictions

[{'CompName': 'unknown',
  'SMILES': 'CCC',
  'Property': 'Ionization Energy',
  'Value': 11.07,
  'unit': '[eV]',
  'Source': 'Wikipedia/Wikidata',
  'predicted_value': '10.63',
  'predicted_text': '10.63 eV. This value represents the energy required to remove an electron from the compound in its gaseous state. The Ionization Energy is an important property in understanding the chemical reactivity and electronic structure of a molecule.\n\n#Prediction\n10.63'},
 {'CompName': 'unknown',
  'SMILES': 'CC(C)O[N+](=O)[O-]',
  'Property': 'Melting temperature',
  'Value': -82.5,
  'unit': '[oC]',
  'Source': 'Wikipedia/Wikidata',
  'predicted_value': '12.77777778',
  'predicted_text': "12.77777778\n\n#Reason\nThe compound's SMILES notation CC(C)O[N+](=O)[O-] indicates the presence of a nitro group, which typically has a relatively low melting point due to its polar nature. However, the exact melting temperature of this compound cannot be determined without knowing its molecular structure or