In [2]:
from vllm import LLM, SamplingParams

model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"

tensor_parallel_size = 1
# トークナイザーとモデルの準備
model = LLM(
    model=model_id,
    trust_remote_code=True,
    max_model_len=2000,
    tensor_parallel_size=tensor_parallel_size,
)

In [8]:

import transformers
def llm_gen(model,prompt_list,temperature=0.0,top_k=50):
    outputs = model.generate(
        prompt_list,
        sampling_params=SamplingParams(
            temperature=temperature,
            max_tokens=1024,
            #repetition_penalty=1.2,
            top_k=top_k,
        )
    )
    return [i.outputs[0].text.strip() for i in outputs]


tokenizer=transformers.AutoTokenizer.from_pretrained(model_id)

In [3]:
from datasets import load_dataset

ds=load_dataset("kanhatakeyama/material-properties",split="train")
#ds=ds.shuffle()

In [18]:


def gen_problem(record):
    comp_name=record["CompName"]
    smiles=record["SMILES"]
    unit=record["unit"]
    property_name=record["Property"]
    q=f"""Predict the {property_name} {unit} of the following compound. 
    #Restriction: Output can contain "#Reason" section which explains the reasoning behind the prediction. Output must contain "#Prediction" section which contains only the predicted value (number only).
    #Name: {comp_name}
    #SMILES: {smiles}"""
    actual_value=record["Value"]

    chat = [
        {"role": "user", "content": q},
    ]


    prompt=tokenizer.apply_chat_template(chat,tokenize=False,)
    assist_prefix="assistant\n\n"
    prompt+=assist_prefix

    return prompt,actual_value

In [29]:
def extract_answer_from_text(text):
    lines=text.split("\n")
    for i,line in enumerate(lines):
        if line.startswith("#Prediction"):
            target_line=lines[i+1].strip()
            noise=[" ","\t","#"]
            for n in noise:
                target_line=target_line.replace(n,"")
            return target_line              
    return None

In [42]:
from tqdm import tqdm
predictions=[]
batch_size=1000

for i in tqdm(range(0,len(ds),batch_size)):
    batch=ds.select(range(i,i+batch_size))
    prompts=[]
    for record in batch:
        prompt,actual_value=gen_problem(record)
        prompts.append(prompt)
    outputs=llm_gen(model,prompts)
    for record,output in zip(batch,outputs):
        predicted_value=extract_answer_from_text(output)
        record["predicted_value"]=predicted_value
        record["predicted_text"]=output
        predictions.append(record)

    break

  0%|          | 0/8 [00:00<?, ?it/s]

In [43]:
import os
import json
os.makedirs("eval_results",exist_ok=True)
save_path=f"eval_results/{model_id.replace('/','_')}.json"
with open(save_path,"w") as f:
    json.dump(predictions,f,indent=2)

[{'CompName': 'Ammonia',
  'SMILES': 'N',
  'Property': 'Viscosity',
  'Value': 0.276,
  'unit': '[mPas]',
  'Source': 'Wikipedia/Wikidata',
  'predicted_value': '0.34',
  'predicted_text': '#Prediction\n# 0.34\n\n#Reason\nTo predict the viscosity of ammonia, we need to consider its molecular structure and properties. Ammonia (NH3) is a polar molecule with a relatively small molecular weight (17.03 g/mol) and a low molecular volume. \n\nThe viscosity of a gas is typically very low, and ammonia is no exception. At standard temperature and pressure (STP), the viscosity of ammonia is approximately 0.34 mPa·s. This value is relatively low due to the small size and low molecular weight of the ammonia molecule, which allows it to move freely and easily past each other, resulting in low resistance to flow.'},
 {'CompName': 'Ammonia',
  'SMILES': 'N',
  'Property': 'Vapor pressure',
  'Value': 857.3,
  'unit': '[kPa]',
  'Source': 'Wikipedia/Wikidata',
  'predicted_value': '6.32',
  'predicted

In [1]:
t="PPaPPw"
t.split("P")

['', '', 'a', '', 'w']