In [1]:
from typing import List, Dict, Sequence
import torch
import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer

from dataclasses import dataclass, field

from setting import OUTPUT_FOLDER
IGNORE_INDEX = -100
device = "cuda:0"

In [3]:
model = AutoModelForCausalLM.from_pretrained(OUTPUT_FOLDER)
model = model.half().to(device)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_FOLDER, padding_side="right")

In [5]:
import pandas as pd

file = "../data.xlsx"
df = pd.read_excel(file)


def build_prompt(name, text, label):
    instruct = f"请你给{name}写一首诗：{text}"
    label = f"{label}"
    return {"input_ids": instruct, "labels": instruct + label + tokenizer.eos_token}


data = []
for _, row in df.iterrows():
    data.append(build_prompt(row["name"], row["text"], row["label"]))
data[0]

{'input_ids': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
 'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'}

In [6]:
def infer(text):
    input_ids = tokenizer(text, return_tensors="pt").to(model.device)

    generated_ids = model.generate(**input_ids)
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(input_ids.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

In [7]:
data[0]

{'input_ids': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
 'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'}

In [8]:
infer("请你给太乙真人写一首诗：")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


'炉中炼术，指点神童。'

In [9]:
data

[{'input_ids': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。',
  'labels': '请你给哪吒写一首诗：哪吒降世，意气飞扬。\n逆天改命，破障冲霄。红绫缠腕，风火踏浪。\n不屈不悔，笑傲苍茫。<|endoftext|>'},
 {'input_ids': '请你给敖丙写一首诗：碧海生龙子，云中舞雪霜。',
  'labels': '请你给敖丙写一首诗：碧海生龙子，云中舞雪霜。恩仇难两忘，何处是家乡？<|endoftext|>'},
 {'input_ids': '请你给殷夫人写一首诗：十月怀胎盼子生，柔心铁骨两相承。',
  'labels': '请你给殷夫人写一首诗：十月怀胎盼子生，柔心铁骨两相承。甘将慈爱护天地，不惧风雷不惧征。<|endoftext|>'},
 {'input_ids': '请你给太乙真人写一首诗：仙风道骨，骑兽遨游。',
  'labels': '请你给太乙真人写一首诗：仙风道骨，骑兽遨游。炉中炼术，指点神童。<|endoftext|>'},
 {'input_ids': '请你给申公豹写一首诗：阴谋藏心，步步为营。\n狂傲不羁，志向高冥。',
  'labels': '请你给申公豹写一首诗：阴谋藏心，步步为营。\n狂傲不羁，志向高冥。欲翻天命，终难遂行。\n困局自招，悔恨难平。<|endoftext|>'}]

In [11]:
infer("请你给申公豹写一首诗：阴谋藏心")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


'，欲翻天命，终难遂行。\n困局自招，悔恨难平。'

In [12]:
infer('逆天改命，破障冲霄。红绫缠腕，')

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


'风火踏浪。\n不屈不悔，笑傲苍茫。'

In [7]:
tokenizer.pad_token, tokenizer.eos_token

('<|endoftext|>', '<|endoftext|>')