In [1]:
# Auto-reload libraries: with autoreload mode 2, edits to imported modules
# (e.g. the local `scoring` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from scoring import generate_prompt,evaluate_answer,eval_model

In [2]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import pipeline

# Base checkpoint that is loaded here and wrapped with a LoRA adapter later on.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" lets the library decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Text-generation pipeline around the loaded model and tokenizer;
# every call generates at most 100 new tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
)

In [4]:
import json
# Load the QA dataset that is later passed to `eval_model`.
dataset_path="../database/output/qa_dataset.json"
# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    raw_dataset = json.load(f)


In [5]:
#eval_model(raw_dataset[:50],pipe,"outputs/0929test/original_model.csv")

# peft

In [6]:
from peft import LoraConfig, get_peft_model

In [7]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,  # the adapter is going to be trained
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the loaded base model with the adapter.
# NOTE: re-running this cell wraps the already wrapped model again.
model = get_peft_model(model, peft_config)

In [8]:
# Reuse the end-of-sequence token as the padding token so that batches can be
# padded by the data collator used for training.
# NOTE(review): assumes the tokenizer ships without its own pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [9]:
import transformers
from datasets import load_dataset

# Build the fine-tuning corpus: the contexts are written one per line to a
# plain-text file, which the `datasets` "text" loader reads back line by line
# and which is then tokenized.
context_path="../database/output/context0926.json"
# Read/write with an explicit UTF-8 encoding so the result does not depend on
# the platform's default locale encoding (the text loader reads UTF-8).
with open(context_path, 'r', encoding="utf-8") as f:
    context_list = json.load(f)

# Only the first 200 contexts are used for this run.
context_list=context_list[:200]

with open("trainset/temp_train.txt","w", encoding="utf-8") as f:
    for context in context_list:
        # The loader treats every line as a separate example, so collapse any
        # line breaks inside a context to keep exactly one example per context.
        f.write(" ".join(context.splitlines())+"\n")

train_dataset = load_dataset("text", data_files="trainset/temp_train.txt")
# Tokenize in batches; adds the tokenizer outputs as columns next to 'text'.
train_dataset=train_dataset.map(lambda samples: tokenizer(samples['text']), batched=True)



Downloading and preparing dataset text/default to /home/user/.cache/huggingface/datasets/text/default-7856b213db3cf53e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/user/.cache/huggingface/datasets/text/default-7856b213db3cf53e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:
# Fine-tune the LoRA-wrapped model on the tokenized contexts with a causal-LM
# objective (mlm=False).
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        # One optimizer step per training example. With batch size 1 and
        # gradient_accumulation_steps=4 each step consumes 4 examples, so this
        # amounts to about 4 passes over the data.
        # NOTE(review): confirm that 4 epochs (not 1) is intended.
        max_steps=len(train_dataset['train'])*1,
        learning_rate=2e-5,
        # NOTE(review): the base model was quantized with a bfloat16 compute
        # dtype while training uses fp16 mixed precision — confirm this mix.
        fp16=True,
        logging_steps=10,
        output_dir='outputs',
        #optim=""
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# The generation KV cache is not needed for training: switch it off while
# training and restore it afterwards (even if training fails) so that later
# inference cells still generate with the cache enabled.
model.config.use_cache = False  # silence the warnings; re-enabled below for inference
try:
    train_result = trainer.train()
finally:
    model.config.use_cache = True

# Keep the training summary as the cell's displayed output.
train_result

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.7599
20,2.7362
30,2.5939
40,2.5747
50,2.4708
60,2.4974
70,2.5603
80,2.5342
90,2.3639
100,2.4662


TrainOutput(global_step=200, training_loss=2.483585948944092, metrics={'train_runtime': 502.1699, 'train_samples_per_second': 1.593, 'train_steps_per_second': 0.398, 'total_flos': 7754540595609600.0, 'train_loss': 2.483585948944092, 'epoch': 4.0})

In [11]:
# Directory that receives the trained model's files together with the tokenizer files.
peft_name = "model/0929test"

# Persist the trained model held by the trainer.
trainer.model.save_pretrained(save_directory=peft_name)

# Save the tokenizer alongside it; the written file paths are the cell output.
tokenizer.save_pretrained(save_directory=peft_name)

('model/0929test/tokenizer_config.json',
 'model/0929test/special_tokens_map.json',
 'model/0929test/tokenizer.model',
 'model/0929test/added_tokens.json',
 'model/0929test/tokenizer.json')

In [13]:

# Trainer.train() leaves the model in training mode, which keeps the LoRA
# dropout (lora_dropout=0.1) active. Switch the model used by the pipeline to
# eval mode so generation during scoring is not perturbed by dropout.
# NOTE(review): `pipe` was created before the model was wrapped with PEFT, and
# the execution counts skip In [12] — confirm on a fresh "Restart & Run All"
# that `pipe.model` really contains the trained LoRA layers.
pipe.model.eval()

# Score the first 50 QA items and write the results to CSV.
eval_model(raw_dataset[:50],pipe,"res/0929test/peft200.csv")

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [05:56<00:00,  7.12s/it]


Unnamed: 0,problem,answer,pred,score,type,context
0,You are a great scientist. Anser the following...,3,3. A glucose biofuel cell (GBFC) consists of t...,1.0,multi,False
1,You are a great scientist. Anser the following...,1,1. LmrR has been shown to bind the compounds s...,1.0,multi,True
2,You are a great scientist. Anser the following...,1,"1. In the clinical trial, disease progression ...",1.0,multi,True
3,You are a great scientist. Anser the following...,1,1. SmMIT-LAMP is a LAMP (Loop-mediated isother...,1.0,multi,False
4,You are a great scientist. Anser the following...,The key components of a LEID device structure ...,A LEID device structure typically consists of...,0.485981,gen,True
5,You are a great scientist. Anser the following...,1,1. Point-contact spectroscopy and nuclear quad...,1.0,multi,True
6,You are a great scientist. Anser the following...,3,3. A QKD link requires both a quantum channel ...,1.0,multi,False
7,You are a great scientist. Anser the following...,1,1. The electromagnetic enhancement in SERS is ...,1.0,multi,True
8,You are a great scientist. Anser the following...,A TRA transmitter is constructed from two terr...,A TRA (Time-division duplex Radio) transmitte...,0.057143,gen,False
9,You are a great scientist. Anser the following...,1,4. A synthetic dimension allows realization of...,0.0,multi,True
