In [2]:
import torch
import transformers, evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel
import time
from datasets import load_dataset

In [3]:
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model_path = "Llama-3-Han"
dataset_path = "KaifengGGG/WenYanWen_English_Parrallel"
dataset_config = "instruct"

In [3]:
def load_peft_model_and_tokenizer(model_path, new_model_path):
    """Loads a PEFT model from the given path and merges it with the base model."""
    base_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    model = PeftModel.from_pretrained(base_model, new_model_path)
    model = model.merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer

def load_base_model_and_tokenizer(model_path):
    """Loads the base transformer model and tokenizer from the given path."""
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    return model, tokenizer

In [None]:
model, tokenizer = load_peft_model_and_tokenizer(model_path, new_model_path)
# model, tokenizer = load_base_model_and_tokenizer(model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16}
)

In [9]:
messages = [
    {"role": "user", "content": "Given the Classical Chinese text: 识者恶之。, provide a step-by-step translation process to arrive at the English translation."},
]

prompt = pipeline.tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)
prompt


'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGiven the Classical Chinese text: 识者恶之。, provide a step-by-step translation process to arrive at the English translation.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [10]:
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [11]:
outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [12]:
outputs[0]["generated_text"][len(prompt):]

'A great example! Here\'s a step-by-step translation process for the Classical Chinese text:\n\n**Text:** 识者恶之 (shí zhě è zhī)\n\n**Step 1: Break down the text into its constituent parts**\n\n* 识 (shí) - a verb meaning "to know" or "to recognize"\n* 者 (zhě) - a particle indicating the subject of the sentence\n* 恶 (è) - a verb meaning "to hate" or "to dislike"\n* 之 (zhī) - a particle indicating the object of the sentence\n\n**Step 2: Identify the grammatical structure**\n\n* The sentence is in the passive voice, with the subject (者) receiving the action of the verb (识).\n* The verb (识) is in the present tense, indicating a general truth or a habitual action.\n* The object (之) is the recipient of the action of the verb (恶).\n\n**Step 3: Translate each part**\n\n* 识 (shí) - "to know" or "to recognize"\n* 者 (zhě) - "the one who" or "the person who"\n* 恶 (è) - "hates" or "dislikes"\n* 之'

In [13]:
print(outputs)

[{'generated_text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGiven the Classical Chinese text: 识者恶之。, provide a step-by-step translation process to arrive at the English translation.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA great example! Here\'s a step-by-step translation process for the Classical Chinese text:\n\n**Text:** 识者恶之 (shí zhě è zhī)\n\n**Step 1: Break down the text into its constituent parts**\n\n* 识 (shí) - a verb meaning "to know" or "to recognize"\n* 者 (zhě) - a particle indicating the subject of the sentence\n* 恶 (è) - a verb meaning "to hate" or "to dislike"\n* 之 (zhī) - a particle indicating the object of the sentence\n\n**Step 2: Identify the grammatical structure**\n\n* The sentence is in the passive voice, with the subject (者) receiving the action of the verb (识).\n* The verb (识) is in the present tense, indicating a general truth or a habitual action.\n* The object (之) is the recipient of the action of the verb (恶).\n\n**St

In [23]:
dataset = load_dataset(dataset_path, dataset_config)
dataset_train, dataset_test = dataset.select_columns(["classical", "english"]).values()

In [44]:
for row in dataset_test.select(range(5)):
    print(row)
print(len(dataset_test))

{'classical': '冰益壮，地始坼，鹖鴠不鸣，虎始交。', 'english': "During this month, the ice becomes increasingly solid and cracks appear on the ground's surface. The oriole stops singing, and tigers begin mating."}
{'classical': '孟尝君闻冯驩烧券书，怒而使使召驩。', 'english': 'When Meng Changjun heard the news that Feng Xuan had burned the promissory notes, he was very angry and immediately sent someone to recall Feng Xuan.'}
{'classical': '本枝百世，继绪不忘。继绪不忘，休有烈光。', 'english': 'The direct lineage and collateral branches continue for a hundred generations, never forgetting to inherit the cause of their ancestors.'}
{'classical': '自今案《论语》之文，孔子之言多若笑弦歌之辞，弟子寡若子游之难，故孔子之言遂结不解。', 'english': "Even when examining the original text of the Analects, many of Confucius's remarks still sound like mockery of playing the qin and singing, but among his disciples, few dared to argue back like Ziyou, so Confucius's words have always been somewhat like a dead end and could not be untied."}
{'classical': '己卯，使持节、侍中、都督扬州诸军事、扬州刺史、骠骑将军、录尚书事、都乡侯何充

In [15]:
prompts = []
references = []
for message in dataset_test["messages"]:
    message = pipeline.tokenizer.apply_chat_template(
        message, 
        tokenize=False, 
        add_generation_prompt=False)
    prompt, separator, reference = message.partition("<|start_header_id|>assistant<|end_header_id|>\n\n")
    prompt = prompt + separator
    prompts.append(prompt)
    references.append(reference)

In [16]:
references[0]

'Step 1: Identify the individual characters and their meanings:\n识 (shí) - to know, to recognize\n者 (zhě) - a person who knows, a connoisseur\n恶 (wù) - to hate, to loathe\n之 (zhī) - it\n\nStep 2: Combine the characters into phrases and determine their grammatical function:\n识者 (shízhě) - people with good judgment\n恶之 (wùzhī) - to greatly loathe it\n\nStep 3: Form the complete sentence:\n识者恶之。(Shízhě wùzhī.) - People with good judgment greatly loathed this.\n\nFinal English translation: People with good judgment greatly loathed this.<|eot_id|>'

In [17]:
prompts, references = prompts[:10], references[:10]
prompts

['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGiven the Classical Chinese text: 识者恶之。, provide a step-by-step translation process to arrive at the English translation.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nOffer an English rendition of the specified Classical Chinese text, then elaborate on the grammatical and lexemic variances from Classical Chinese to English: 虽有急，弗畔；虽有离，弗畔。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n请提供这段文言文的现代汉语解释: 因锁闭堂中四十余日，益加爽秀，寰方惊骇焉。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n将以下现代汉语文本改写为文言文: 项羽派人回去报告楚怀王。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nTranspose the following Modern Chinese text into Classical Chinese: 马大同镇守京口，

In [18]:
outputs = pipeline(
    prompts,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [19]:
predictions = [output[0]['generated_text'][len(prompt):] for output, prompt in zip(outputs, prompts)]
print(predictions[0])

Here's a step-by-step translation process for the Classical Chinese text:



**Step 1: Identify the characters**

* (shí) - "to know" or "to recognize"
* (zhě) - "that" or "which"
* (è) - "evil" or "bad"
* (zhī) - "of" or "belonging to"

**Step 2: Analyze the grammatical structure**

* The sentence uses the verb-subject-object (VSO) word order, which is common in Classical Chinese.
* The verb (識) is in the present tense, indicating a general or habitual action.

**Step 3: Identify the subject**

* The subject of the sentence is (識), which is a nominalized verb meaning "to recognize" or "to know". In this context, it's a abstract noun representing the concept of recognition or understanding.

**Step 4: Identify the predicate**

* The predicate is (恶之), which consists of two characters:
	+ (è) - "evil" or "bad"
	+ (zhī) - "of" or "belonging to"

**Step 5: Determine the meaning of the predicate**

* The character (è)


In [20]:
sacrebleu = evaluate.load("sacrebleu")
sacrebleu_results=sacrebleu.compute(predictions=predictions, references=references)
sacrebleu_results

{'score': 1.675587669619537,
 'counts': [113, 43, 20, 7],
 'totals': [1729, 1719, 1709, 1699],
 'precisions': [6.5355696934644305,
  2.501454333915067,
  1.1702750146284377,
  0.4120070629782225],
 'bp': 1.0,
 'sys_len': 1729,
 'ref_len': 280}

In [54]:
!pip install evaluate



In [55]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-5.2.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading lxml-5.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading portal