In [1]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"


In [2]:

model = transformers.AutoModelForCausalLM.from_pretrained(model_id,
torch_dtype=torch.bfloat16,  device_map="auto")

tokenizer=transformers.AutoTokenizer.from_pretrained(model_id)
pipeline = transformers.pipeline(
    "text-generation", model=model, 
    tokenizer=tokenizer
)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
#LoRA
from peft import get_peft_model, LoraConfig, TaskType
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         inference_mode=False, # 学習時はFalse
                         r=32, 
                         lora_alpha=64,
                         lora_dropout=0.05,
                         bias="none",
                         target_modules=["q_proj", "v_proj", "k_proj", "o_proj", 
                                         #"gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"
                                         ],
                         )

# モデルにLoRAアダプター適用、更新対象のパラメータ数の確認
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

trainable params: 131,072,000 || all params: 70,684,778,496 || trainable%: 0.1854


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
jsonl_path="data/20240918175257_llm_gen.jsonl"
error_threshold=0.1

df=pd.read_json(jsonl_path,lines=True)
df=df.drop(columns=["record","prompt"])
df=df.sort_values(by="error_rate")
df["cond"]=df["CompoundName"]+" "+df["SMILES"]+" "+df["Property"]
df=df.drop_duplicates(subset=["cond"])
df=df[df["error_rate"]<error_threshold]
df["q"]="Predict "+df["Property"]+" "+df["Unit"]+" for "+df["CompoundName"]+" (Compoun X) with SMILES "+df["SMILES"]+". The prediction consists of #Reason and #Prediction. The #Reason is the quantitative explanation of the prediction. The #Prediction is the predicted value and the unit of the prediction."
df["a"]="#Reason\n"+df["reason"]+"\n#Prediction\n"+df["predicted"].astype(str)+" "+df["Unit"]
q_list=df["q"].tolist()
a_list=df["a"].tolist()

In [5]:
train_text_list=[]
for i in range(len(q_list)):
    q=q_list[i]
    a=a_list[i]
    messages=[
        {"role": "user", "content": q},
        {"role": "assistant", "content": a}
    ]
    prompt=tokenizer.apply_chat_template(messages,tokenize=False)
    train_text_list.append(prompt)

train_text_list

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nPredict Boiling temperature [oC] for Octafluoropropane (Compoun X) with SMILES FC(F)(F)C(F)(F)C(F)(F)F. The prediction consists of #Reason and #Prediction. The #Reason is the quantitative explanation of the prediction. The #Prediction is the predicted value and the unit of the prediction.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n#Reason\nTo predict the boiling temperature of Compound X, we can use the concept of functional group contributions to boiling points. The boiling point of a molecule can be estimated by summing the contributions of its constituent functional groups.\n\nCompound X has the molecular formula C3F8. We can break it down into three functional groups:\n\n* Three CF3 groups (each contributing approximately 17.5 °C to the boiling point)\n* Two CF2 groups (each contributi

In [6]:
import datasets
ds=datasets.Dataset.from_list([{"text":t} for t in train_text_list])

In [7]:
from trl import DataCollatorForCompletionOnlyLM

# response_templateは必須指定
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['text'])):
        text = f"{example['text'][i]}"
        output_texts.append(text)
    return output_texts

In [8]:
from transformers import TrainingArguments
from trl import SFTTrainer

# SFTTrainerはTrainingArgumentsを使用することができる。
# 指定しない場合、TrainingArgumentsのデフォルトが指定される。
args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=1,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=8,
    save_strategy="no",
    logging_steps=1,
    lr_scheduler_type="constant",
    save_total_limit=1,
    fp16=True,
)


# data_collatorが指定されていない場合、以下のようにDataCollatorForLanguageModelingがmlm=Falseで使われる。
# つまり通常のCausal LMを学習することになる。
# if data_collator is None:
#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# packing=False（default）ではdataset_text_fieldかformatting_funcを指定する必要あり
trainer = SFTTrainer(
    lora_model,
    args=args,
    train_dataset=ds,
    formatting_func=formatting_prompts_func,
    max_seq_length=1024,
    data_collator=collator,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/993 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:

tokenizer.pad_token = tokenizer.eos_token

#loss maskのcheck
from torch.utils.data import DataLoader
print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))


loader = DataLoader(trainer.train_dataset, collate_fn=collator, batch_size=8)
batch = next(iter(loader))
print(batch['labels'][0])

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict Boiling temperature [oC] for Octafluoropropane (Compoun X) with SMILES FC(F)(F)C(F)(F)C(F)(F)F. The prediction consists of #Reason and #Prediction. The #Reason is the quantitative explanation of the prediction. The #Prediction is the predicted value and the unit of the prediction.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

#Reason
To predict the boiling temperature of Compound X, we can use the concept of functional group contributions to boiling points. The boiling point of a molecule can be estimated by summing the contributions of its constituent functional groups.

Compound X has the molecular formula C3F8. We can break it down into three functional groups:

* Three CF3 groups (each contributing approximately 17.5 °C to the boiling point)
* Two CF2 groups (each contributi

In [10]:
trainer.train()

Step,Training Loss
1,0.7197
2,0.6751


In [None]:

trainer.save_model()