# Classical-Vernacular Chinese Translation
In this project, we leverage the Taiwan-LLM v2.0 7B model for Classical-Vernacular Chinese Translation.
## Setup

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import login
login()

Mounted at /content/drive
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Load Model & PEFT Config.

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


model_id = "yentinglin/Taiwan-LLM-7B-v2.0-chat"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

tokenizer_config.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "lm_head", "down_proj", "up_proj"],
    lora_dropout=0.01
)

model = get_peft_model(model, config)
model.print_trainable_parameters()
# print(model)

trainable params: 10,138,624 || all params: 6,748,554,240 || trainable%: 0.1502340151599641
PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.01, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
   

## Load Datasets & Pre-processing

In [5]:
import pandas as pd
import datasets
from sklearn.model_selection import train_test_split

def get_prompt(instruction: str) -> str:
    instruction.replace('\n', '')
    return f"你是人工智慧助理，以下是用戶和人工智能助理之間的對話。你要對用戶的問題提供有用、安全、詳細和禮貌的回答。USER: {instruction} ASSISTANT:"

def reformat(df):
    df.rename(columns={'output': 'labels'}, inplace=True)
    df['instruction'] = df['instruction'].apply(lambda instruction: get_prompt(instruction))
    return df

raw_df = pd.read_json('/content/drive/MyDrive/ADL_HW3/data/train.json')
raw_df = reformat(raw_df)
train_df, valid_df = train_test_split(raw_df, test_size=0.2, random_state=42)

train_dataset = datasets.Dataset.from_pandas(train_df)
valid_dataset = datasets.Dataset.from_pandas(valid_df)

instr_column = 'instruction'
label_column = 'labels'
column_names = train_dataset.column_names

def preprocess_function(examples):
    inputs, targets = [], []
    for i in range(len(examples[instr_column])):
        inputs.append(examples[instr_column][i])
        targets.append(examples[label_column][i])

    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(text_target=targets, max_length=400, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function,batched=True,remove_columns=column_names,desc="Running tokenizer on train dataset")
valid_dataset = valid_dataset.map(preprocess_function,batched=True,remove_columns=column_names,desc="Running tokenizer on valid dataset")


Dataset({
    features: ['id', 'instruction', 'labels'],
    num_rows: 8000
})


Running tokenizer on train dataset:   0%|          | 0/8000 [00:00<?, ? examples/s]

Running tokenizer on valid dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [6]:
## Plot Distribution of Token

# import matplotlib.pyplot as plt
# import pandas as pd

# def preprocess_function(examples):

#     inputs, targets = [], []
#     inputs.append(examples[instr_column])
#     targets.append(examples[label_column])
#     model_inputs = tokenizer(inputs)
#     labels = tokenizer(text_target=targets)

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# lengths = []
# for example in train_dataset:
#     model_inputs = preprocess_function(example)
#     lengths.append(len(model_inputs['attention_mask'][0]))

# plt.hist(lengths, bins=range(min(lengths), max(lengths) + 1, 1), alpha=0.7, color='blue', edgecolor='black')
# plt.title('Distribution of Instruction Lengths')
# plt.xlabel('Length of Tokenized Input')
# plt.ylabel('Frequency')
# plt.show()

## Trainer

In [None]:
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=3e-5,
        # max_steps=101,
        num_train_epochs=2,
        do_eval=True,
        per_device_eval_batch_size=8,
        evaluation_strategy="steps",
        eval_steps=400,
        logging_dir='./logs',
        logging_steps=100,
        optim="paged_adamw_8bit",
        output_dir="outputs",
        save_strategy="steps",
        save_steps=400,
        hub_model_id="HomoLiang/ADL_HW3",
        push_to_hub=True,
        hub_strategy="every_save",
        fp16=True,
        remove_unused_columns=False
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss
100,0.2485,No log
200,0.3114,No log
300,0.2262,No log
400,2.1806,No log
500,0.813,No log
600,1.4323,No log
700,0.1568,No log
800,0.102,No log




Step,Training Loss,Validation Loss
100,0.2485,No log
200,0.3114,No log
300,0.2262,No log
400,2.1806,No log
500,0.813,No log
600,1.4323,No log
700,0.1568,No log
800,0.102,No log
900,0.1523,No log


In [None]:
!zip -r /content/outputs.zip /content/outputs/
from google.colab import files
files.download("/content/outputs.zip")

##Verification

In [None]:
del model
torch.cuda.empty_cache()

!python /content/drive/MyDrive/ADL_HW3/ppl.py --peft_path /content/drive/MyDrive/ADL_HW3/outputs_3modules/checkpoint-1200 --test_data_path /content/drive/MyDrive/ADL_HW3/data/public_test.json

## Inference

In [None]:
import pandas as pd
from transformers import GenerationConfig
import json

def get_prompt(instruction: str) -> str:
    instruction.replace('\n', '')
    return f"你是人工智慧助理，以下是用戶和人工智能助理之間的對話。你要對用戶的問題提供有用、安全、詳細和禮貌的回答。USER: {instruction} ASSISTANT:"

def reformat(df):
    df['instruction'] = df['instruction'].apply(lambda instruction: get_prompt(instruction))
    return df


inference_df = pd.read_json('/content/drive/MyDrive/ADL_HW3/data/private_test.json')
inference_df = reformat(inference_df)

id_list = []
output_list = []
device = "cuda:0"

pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff。，、]+')

for i in range(len(inference_df)):
    inputs = tokenizer(inference_df["instruction"][i], max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs, 
        generation_config=GenerationConfig(
            do_sample=True,
            max_new_tokens=256,
            num_beams=3
        )
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    sliced_output_text = output_text[output_text.rfind("ASSISTANT:") + len("ASSISTANT:"):].strip()
    sliced_output_text = ''.join(pattern.findall(sliced_output_text))
    
    id_list.append(inference_df["id"][i])
    output_list.append(sliced_output_text)

with open("prediction.json", 'w', encoding='utf-8') as f:
    for data_id, output in zip(id_list, output_list):
        data = {"id": data_id, "output": output}
        json.dump(data, f, ensure_ascii=False)
        f.write("\n")