In [None]:
from datasets import load_dataset

# Load the corpus without a `split` argument; the next cell indexes its "train" split.
# The "Parrallel" spelling is part of the repo id used throughout this notebook — keep it as is.
dataset = load_dataset("KaifengGGG/WenYanWen_English_Parrallel")

In [None]:
# Peek at one arbitrary training record to see which fields a sample carries.
dataset["train"][19]

In [None]:
import os
import google.generativeai as genai

# Read the Gemini API key from the environment so it never ends up in the
# notebook file or in version control.
# NOTE: the key that used to be hard-coded in this cell must be treated as
# leaked and revoked in the Google AI console.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=API_KEY)

# Every harm category is set to BLOCK_NONE, i.e. the safety filters are
# disabled for this model (presumably so that historical texts are not
# rejected — confirm this is intended).
model = genai.GenerativeModel(
    model_name='gemini-pro',
    safety_settings=[
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ],
)


In [None]:
# Instruction templates in the "[INST] prompt [/INST] response" layout.
# Each one is filled purely from the sample's {classical}, {modern} and
# {english} fields; none contains a "[...]" gap, so no LLM call is needed.
# NOTE(review): several prompts ask for analysis (key themes, historical
# background, philosophical implications, a dialogue, idiomatic expressions)
# but their response slot holds only the plain translation fields — confirm
# that this mismatch is intended for the training data.
TEMPLATES = [
    # Templates answered directly from the dataset fields (no LLM augmentation).
    "[INST] Translate the following Classical Chinese text into modern Chinese: {classical} [/INST] {modern}",
    "[INST] Translate the following Classical Chinese text into English: {classical} [/INST] {english}",
    "[INST] Explain the meaning of this Classical Chinese passage in English: {classical} [/INST] {english}",
    "[INST] Convert the following English text into Classical Chinese: {english} [/INST] {classical}",
    "[INST] First, translate the Classical Chinese text into modern Chinese and then into English. Classical Chinese: {classical} [/INST] {modern} Then, {english}", # chain-of-thought style: modern Chinese first, then English
    "[INST] Give the modern Chinese and English translation for the following Classical Chinese passage: {classical} [/INST] Modern Chinese: {modern} English: {english}",
    "[INST] Identify the key themes in this Classical Chinese text and explain them in English: {classical} [/INST] {english}",
    "[INST] For each of the following sets of Classical Chinese and Modern Chinese sentences, provide an English translation that captures the essence of both: Classical: {classical} Modern: {modern} [/INST] {english}",
    "[INST] Convert the following modern Chinese text into Classical Chinese: {modern} [/INST] {classical}",
    "[INST] Providing context, explain the historical and cultural background of the following Classical Chinese passage in English: {classical} [/INST] {english}",
    "[INST] Given the following Classical Chinese text, generate a modern interpretation that maintains the original's poetic tone or style: {classical} [/INST] {modern}",
    "[INST] Discuss the philosophical implications of this Classical Chinese passage in English: {classical} [/INST] {english}",
    "[INST] Create a modern dialogue based on the themes presented in this Classical Chinese passage: {classical} [/INST] {modern}",
    "[INST] Identify any idiomatic expressions in the Classical Chinese text and explain their meanings in both modern Chinese and English: {classical} [/INST] Modern Chinese: {modern} English: {english}",
    "[INST] Translate the following passage from Classical Chinese to both modern Chinese and English, focusing on preserving the emotional tone in both translations: {classical} [/INST] Modern Chinese: {modern} English: {english}",
]

# Templates whose response side contains "[...]" gaps. After the dataset
# fields are substituted, format_sample_with_template sends the text to the
# Gemini model and asks it to fill in those gaps.
TEMPLATES_AUGMENT = [
    # Templates completed by the LLM (each contains at least one "[...]" gap).
    "[INST] Analyze the grammatical structure of the following Classical Chinese sentence and provide its English translation: {classical} [/INST] Grammatical analysis: [...], English translation: {english}",
    "[INST] Given the Classical Chinese text: {classical}, provide a step-by-step translation process to arrive at the English translation. [/INST] Step 1: [...], Step 2: [...], ..., Final English translation: {english}",
    "[INST] Identify the tone and emotion conveyed in the Classical Chinese text and explain how it is achieved in English: {classical} [/INST] Tone and emotion: [...], Explanation in English: {english}",
    "[INST] Translate the following Classical Chinese proverb to English and explain its moral lesson: {classical} [/INST] English translation: {english}, Moral lesson: [...]",
    "[INST] Compare the Classical Chinese text with its English translation and discuss the challenges in preserving the original meaning and style: Classical Chinese: {classical}, English: {english} [/INST] Translation challenges: [...]",
    "[INST] Given the Classical Chinese text: {classical}, provide its Modern Chinese and English translations, and discuss the differences in grammar and vocabulary between Classical and Modern Chinese. [/INST] Modern Chinese: {modern}, English: {english}, Grammatical and vocabulary differences: [...]",
    "[INST] Given the Classical Chinese text: {classical}, provide its English translation and discuss the potential ambiguities or multiple interpretations of the text. [/INST] English translation: {english}, Potential ambiguities and interpretations: [...]",
    "[INST] Given the Classical Chinese text: {classical}, provide its Modern Chinese interpretation and English translation, along with a brief explanation of the translation process. [/INST] Modern Chinese: {modern}, English: {english}, Explanation: [...]",
    "[INST] Translate the following sentences from Classical Chinese to English, and describe any cultural or historical references present: {classical} [/INST] {english}, [...]",
    "[INST] Translate this English text into Classical Chinese. Then, explain any challenges encountered during the translation: {english} [/INST] {classical} Challenges: [...]",
]

In [None]:
import random

def format_sample_with_template(sample, augment):
    """Render one dataset sample into a single ``[INST] ... [/INST] ...`` string.

    A template is drawn at random from ``TEMPLATES`` (plus ``TEMPLATES_AUGMENT``
    when ``augment`` is true) and its ``{classical}``, ``{modern}`` and
    ``{english}`` placeholders are filled from ``sample``. When the chosen
    template contains ``...`` gaps, the rendered text is sent to the
    module-level Gemini ``model``, which is asked to fill the gaps in.

    Args:
        sample: Mapping that may hold the keys ``classical``, ``modern`` and
            ``english``. A missing or ``None`` field is rendered as an empty
            string.
        augment: When true, the LLM-completed templates are eligible as well.

    Returns:
        The formatted text. If the model returns no usable text for an
        augmented template, the sample is re-rendered with a plain template
        instead of failing or keeping unfilled ``[...]`` gaps.
    """
    pool = TEMPLATES + TEMPLATES_AUGMENT if augment else TEMPLATES
    selected_template = random.choice(pool)

    relevant_data = {}
    for placeholder in ("classical", "modern", "english"):
        if '{' + placeholder + '}' in selected_template:
            value = sample.get(placeholder)
            # A None field would otherwise be rendered as the literal text "None".
            relevant_data[placeholder] = '' if value is None else value

    # Best-effort: a KeyError can only come from a template that names a
    # placeholder other than the three handled above; the raw template is
    # kept in that case.
    try:
        formatted_text = selected_template.format_map(relevant_data)
    except KeyError as e:
        print(f"Error processing template: {e}")
        formatted_text = selected_template

    if "..." in selected_template:
        # Prompt wording is kept verbatim so generated data stays comparable.
        prompt = "I have some corrupted text. Text: '" + formatted_text + "'.Fill in the '[...]' in the text with your the best of your knowledge and the context. Only Return the text in original format with all parts filled in"
        response = model.generate_content(prompt)
        try:
            # `.text` raises ValueError when the response carries no valid
            # text part (e.g. the candidate was blocked or is empty).
            completed = response.text
        except ValueError as e:
            print(f"LLM completion unavailable, falling back to a plain template: {e}")
            completed = ""
        if not completed or not completed.strip():
            # Re-render without augmentation. This terminates as long as the
            # plain TEMPLATES contain no "..." gaps.
            return format_sample_with_template(sample, False)
        formatted_text = completed

    return formatted_text

In [None]:
def template_dataset(sample, augment):
    """Attach the templated training string to ``sample`` under ``"text"``.

    The text produced by ``format_sample_with_template`` is wrapped in the
    ``<s>`` / ``</s>`` markers. ``sample`` is modified in place and returned.
    """
    body = format_sample_with_template(sample, augment)
    sample["text"] = "".join(("<s>", body, "</s>"))
    return sample

In [None]:
# Work on the first 10,000 training rows only. The subset gets its own name:
# rebinding `dataset` here would replace the object loaded at the top of the
# notebook with a single split and break a re-run of the `dataset["train"]` cell.
train_subset = load_dataset("KaifengGGG/WenYanWen_English_Parrallel", split="train[0:10000]")

# Replace every original column by the single templated "text" column.
# augment=True makes the LLM-completed templates eligible, so this step may
# call the Gemini model for a share of the rows.
dataset_mistral = train_subset.map(
    template_dataset,
    remove_columns=list(train_subset.features),
    fn_kwargs={'augment': True},
)

In [7]:
from datasets import load_from_disk
from datasets import concatenate_datasets

In [10]:
# Load the ten previously saved shards (subset_1 … subset_10) from disk, in order.
ds = [
    load_from_disk(f"./dataset_instruct_ampp/subset_{index}")
    for index in range(1, 11)
]

In [11]:
# Merge all shards into one dataset.
dataset_instruct = concatenate_datasets(ds)

# Hold out 10% as a test split. The shuffle is seeded so the published
# train/test partition is the same on every run of the notebook.
dataset_final = dataset_instruct.train_test_split(test_size=0.1, seed=42)

In [12]:
import os

# The Hugging Face write token is taken from the HF_TOKEN environment variable
# instead of being embedded in the notebook. When the variable is unset, None
# is passed and the library falls back to the locally cached login.
# NOTE: the token that used to be hard-coded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
dataset_final.push_to_hub(
    "KaifengGGG/WenYanWen_English_Parrallel_Instruct",
    token=os.environ.get("HF_TOKEN"),
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/KaifengGGG/WenYanWen_English_Parrallel_Instruct/commit/bff828919187a53e42afe2bb46b8bcf9aa75514f', commit_message='Upload dataset', commit_description='', oid='bff828919187a53e42afe2bb46b8bcf9aa75514f', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
# Spot-check one merged record.
# NOTE(review): the recorded output for this cell uses an instruction wording
# that does not appear in TEMPLATES / TEMPLATES_AUGMENT, so the on-disk shards
# were presumably generated with a different version of the templates — confirm.
dataset_instruct[3]

{'text': "<s>[INST] Kindly elucidate the central themes in this passage in English: 《司马法》广陈三代，曰：古者六尺为步，步百为亩，亩百为夫，夫三为屋，屋三为井。 [/INST] The Book of Sima's Military Methods extensively described the systems of the Three Dynasties of antiquity, stating that: in ancient times, six chi (feet) equaled one bu (step), one hundred bu equaled one mu (acre), one hundred mu equaled one fu (man), three fu equaled one wu (group), and three wu equaled one jingtian (well-field).</s>"}

In [18]:
# Spot-check a second merged record (the recorded output shows the combined
# modern-Chinese + English translation template).
dataset_instruct[7]

{'text': '<s>[INST] Translate the following passage from Classical Chinese to both modern Chinese and English, focusing on preserving the emotional tone in both translations: 其所常居室壁，有笔洒墨迹者。 [/INST] Modern Chinese: 他常住的屋墙上有笔洒的墨迹。 English: There were traces of spilled ink on the walls of the house where he lived.</s>'}

In [17]:
# Show the sizes of the split that was actually created and pushed.
# Calling train_test_split() again here would draw a fresh random split that
# differs from `dataset_final`.
dataset_final

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})