## Cultural Alignment LLMs - *Part 2*

- Prompt Tuning and Model Testing.

In [1]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 0. Setup

- Import Dependencies

In [5]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftModel
from sklearn.model_selection import train_test_split
import sys
sys.path.append('/content/drive/MyDrive/CulturalAlignment')
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [7]:
# Directory passed to the Trainer as `output_dir`.
out_path = '/content/drive/MyDrive/CulturalAlignment/result'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(out_path, exist_ok=True)

### 1. Training

#### 1.1 Load Data & Preprocess

- Read the cleaned data from saved file 'exam.csv'

In [8]:
# Read the cleaned exam data, replace missing cells with empty strings,
# and discard the 'Unnamed: 0' column.
data_path = './Datasets/exam.csv'
#data_path = '/content/drive/MyDrive/CulturalAlignment/Datasets/exam.csv'
df = (
    pd.read_csv(data_path)
    .fillna('')
    .drop(columns=['Unnamed: 0'])
)
df.sample(3)

Unnamed: 0,instruction,input,output,task_type
2402,请选择正确的选项\n,"19世纪后期,日本明治政府规定:任何儿童就学年限不得少于3年。同时,又颁布《教育敕语》,宣扬...",题干的“宣扬‘忠于天皇’是最高的道德规范。”反映了明治维新保留了封建制度的残余，故③符合题意...,选择题
86,这句话的含义是什么：佛高一尺，魔高一丈,,原为佛家告诫修行者，要警惕外界诱惑。佛，指佛法；魔，指魔法。后用以比喻一方势力（多指正义的）...,问答
1167,自闭是否有明显的分界点？,,自闭症（Autism）是一种神经发育障碍，其表现形式复杂，症状各异。目前还没有明显的自闭症诊...,问答


In [None]:
def prep_text(text_df):
    """Return a new string Series in which every newline character has been removed.

    Args:
        text_df: pandas Series of strings.

    Returns:
        A Series of the same length; the argument itself is left unchanged.
    """
    newline, replacement = "\n", ""
    return text_df.str.replace(newline, replacement, regex=False)

# Strip newlines from the instruction column only.
df["instruction"] = df["instruction"].pipe(prep_text)

In [9]:
# Prepare Data for Training (8:1:1)
print("Data Splitting...")

# Fixed seed so the same rows land in each split on every run.
SPLIT_SEED = 42

# Hold out 20% of the rows first, then halve that hold-out into validation and test.
train_df, holdout_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
print("Size of Training data: ",len(train_df))

val_df, test_df = train_test_split(holdout_df, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)
print("Size of Validation data: ",len(val_df))
print("Size of Testing data: ",len(test_df))

Data Splitting...
Size of Validation data:  2688
Size of Validation data:  336
Size of Testing data:  337


- Pre-processing

Before training, the pandas DataFrame must be wrapped in a PyTorch `Dataset` and preprocessed into the model's expected input format: each example is tokenized and converted to tensors that can be assembled into batches.

In [10]:
# Label value written over the prompt positions in MyDataset.__getitem__;
# -100 is the default `ignore_index` of torch.nn.CrossEntropyLoss.
IGNORE_INDEX = -100

In [16]:
from torch.utils.data import Dataset

# Create a custom Dataset
class MyDataset(Dataset):
  """Turn DataFrame rows into fixed-length causal-LM training examples.

  Each row must provide the columns 'instruction', 'input' and 'output'.
  The prompt ("问题：…\\n答案：") and the answer (followed by the tokenizer's EOS
  token) are tokenized separately, each padded/truncated to `max_length`, and
  concatenated, so every example has length 2 * max_length.

  Args:
    data_csv: pandas DataFrame holding the examples.
    tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask'
      lists and exposing `eos_token`.
    max_length: token budget for the prompt and for the answer (default 512).
  """

  def __init__(self, data_csv, tokenizer, max_length=512):
    self.df = data_csv
    self.tokenizer = tokenizer
    self.max_length = max_length

  def _encode(self, text):
    """Tokenize `text`, padded or truncated to exactly `self.max_length` tokens."""
    return self.tokenizer(text,
                          max_length=self.max_length,
                          padding='max_length',
                          truncation=True)

  def __getitem__(self, index):
    """Return the 'input_ids', 'attention_mask' and 'labels' tensors for one row."""
    row = self.df.iloc[index]
    instruction, extra_input, output = row['instruction'], row['input'], row['output']

    # For QA questions, input == "" and only the instruction is used.
    if extra_input is not None and extra_input != "":
      instruction = instruction + '\n' + extra_input

    source = f"问题：{instruction}\n答案："
    target = f"{output}{self.tokenizer.eos_token}"
    # NOTE(review): truncation may cut the EOS off a very long answer, and a tokenizer
    # that prepends special tokens would place them mid-sequence — confirm.

    tokenized_source = self._encode(source)
    tokenized_target = self._encode(target)

    # torch.LongTensor(): int64 values, as expected for token ids.
    input_ids = torch.LongTensor(tokenized_source['input_ids'] + tokenized_target['input_ids'])
    attention_mask = torch.LongTensor(tokenized_source['attention_mask'] + tokenized_target['attention_mask'])

    # Supervise only real answer tokens. Padding is masked using the attention mask
    # (not the token id) so a genuine EOS survives even when pad id == EOS id.
    target_labels = [token_id if keep else IGNORE_INDEX
                     for token_id, keep in zip(tokenized_target['input_ids'],
                                               tokenized_target['attention_mask'])]
    labels = torch.LongTensor([IGNORE_INDEX] * len(tokenized_source['input_ids']) + target_labels)

    return {'input_ids':input_ids,
            'attention_mask':attention_mask,
            'labels':labels}

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.df)


#### 1.2 Load the pretrained LLM and its tokenizer

In [None]:
# 1. Load a Chinese-centric LLM as the base model.
MODEL_NAME = "m-a-p/CT-LLM-Base"

# Tokenizer first, then the weights; both are looked up under MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# Reuse the EOS id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# 2. Load Chinese Llama2
# Alternative base model: running this cell rebinds MODEL_NAME, model and tokenizer.
MODEL_NAME = "hfl/chinese-llama-2-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Reuse the EOS id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

In [17]:
# Wrap each split in MyDataset, sharing the one tokenizer across all three.
train_data, val_data, test_data = (
    MyDataset(split_df, tokenizer) for split_df in (train_df, val_df, test_df)
)

In [21]:
# Sanity check: shape of the input_ids tensor of the first training example.
train_data[0]['input_ids'].shape

torch.Size([1024])

- Test the Model with Zero Shot Inferencing.

In [24]:
# Pick one row and build a plain-text prompt around it for a zero-shot check.
idx = 1000

sample_row = df.iloc[idx]
instruction = sample_row['instruction']
# Named `input_text` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
input_text = sample_row['input']
output = sample_row['output']

prompt = f"""
结合中国文化背景和提示，回答问题。

问题：{instruction}\n{input_text}

答案：
"""

print(f'INPUT PROMPT:\n{prompt}')
print()
print(f'BASELINE HUMAN ANSWER:\n{output}\n')

INPUT PROMPT:

结合中国文化背景和提示，回答问题。

问题：视障人士是否应该被允许在所有场合自由行动，包括公共交通工具、商店、餐馆等？


答案：


BASELINE HUMAN ANSWER:
是的，这是一个基本的人权，视障人士应该被允许在公共交通工具、商店和餐馆等场合自由行动。这是因为视障人士是弱势群体，他们需要特殊的照顾和支持，以便他们能够正常地生活和工作。 
 
然而，在一些国家和地区，由于各种原因，视障人士可能无法在某些场合自由行动，例如在公共交通工具上，商店和餐馆里，或者在某些特定的环境中。因此，为了确保视障人士在这些场合自由行动，我们需要采取措施，例如提供特殊的设施，以便他们能够在这些场合中自由行动。 
 
总的来说，视障人士应该在所有场合获得平等的权利和机会，以便他们能够自由地生活和工作。



In [25]:
# Define model inference function
def model_inference(question,model,tokenizer,max_new_tokens=128,do_sample=True):
  """Generate a continuation for `question` and print it.

  Args:
    question: prompt text handed to the tokenizer.
    model: model exposing `.device` and `.generate(...)`.
    tokenizer: tokenizer used both to encode the prompt and decode the output.
    max_new_tokens: upper bound on generated tokens (default 128, as before).
    do_sample: sample instead of greedy decoding to avoid repetitive, boring
      outputs (default True, as before).

  Returns:
    None. The decoded sequence is printed, not returned.
  """
  ipt = tokenizer(question,return_tensors='pt').to(model.device)
  # Passing pad_token_id explicitly avoids the "Setting `pad_token_id` to
  # `eos_token_id`" notice that generate() emits when it has to pick one itself.
  opt = model.generate(**ipt,
                       max_new_tokens=max_new_tokens,
                       do_sample=do_sample,
                       pad_token_id=tokenizer.pad_token_id)
  answer = tokenizer.decode(opt[0], skip_special_tokens=True)

  print(f'MODEL GENERATION - ZERO SHOT:\n{answer}')

  # Drop the tensor references before asking PyTorch to release cached GPU memory.
  del ipt,opt
  torch.cuda.empty_cache()


In [29]:
# Move the model to `device` (the GPU when one is available, per the Setup cell).
model = model.to(device)

In [31]:
# Example: run one generation on an idiom-explanation prompt and print the result.
model_inference("暗度陈仓：成语释义：",model,tokenizer)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


MODEL GENERATION - ZERO SHOT:
暗度陈仓：成语释义：指暗中策划，准备行动。 出处：《晋书·王敦传》：“敦既以陈仓为事，又与诸将谋，欲以兵袭长安。” 示例： 近义词: 反义词: 语法: 作谓语、定语；指暗中策划 作宾语、定语；用于书面语 作谓语、定语；指暗中策划 作谓语、定语；指暗中策划 作谓语、定语；指暗中策划 作谓语、定语；指暗中策划 作谓语、定语；指


#### 1.3 Define Configs

In [26]:
# Hard/Soft Prompt config
# The soft prompt is initialised from this text; the number of virtual tokens
# is the length of its tokenization.
prompt_tuning_init_text = '结合中国文化背景和提示，回答问题。'
init_token_ids = tokenizer(prompt_tuning_init_text)['input_ids']

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text=prompt_tuning_init_text,
    num_virtual_tokens=len(init_token_ids),
    tokenizer_name_or_path=MODEL_NAME,
)

peft_config

PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, num_virtual_tokens=8, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, prompt_tuning_init=<PromptTuningInit.TEXT: 'TEXT'>, prompt_tuning_init_text='结合中国文化背景和提示，回答问题。', tokenizer_name_or_path='m-a-p/CT-LLM-Base', tokenizer_kwargs=None)

In [27]:
# Hyperparameters for the Trainer.
# NOTE(review): learning_rate is left unset, so the library default applies — confirm it suits prompt tuning.
# NOTE(review): no evaluation strategy is configured here — confirm that the validation set is actually evaluated during training.
args = TrainingArguments(
    seed = 42,                          # Random seed for the training run
    output_dir = out_path,              # Training outputs are written under out_path
    per_device_train_batch_size = 4,    # BATCH_SIZE
    per_device_eval_batch_size = 4,
    #gradient_accumulation_steps = 8,
    logging_steps = 10,                 # Logging interval, in steps
    #learning_rate = ,                   # Learning_rate
    num_train_epochs = 2,              # Epoch

)

#### 1.4 Prompt Tuning

- Initialize Model for prompt-tuning

In [28]:
# Wrap the base model with the prompt-tuning configuration and display its structure.
# NOTE(review): this rebinds `model`, so re-running the cell would wrap the already-wrapped model — reload the base model first.
model = get_peft_model(model,peft_config)
model

The repository for m-a-p/CT-LLM-Base contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/m-a-p/CT-LLM-Base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PeftModelForCausalLM(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(125824, 2048)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
            (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
            (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm)

In [30]:
# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()

trainable params: 16,384 || all params: 2,134,525,952 || trainable%: 0.0008
None


- Define Trainer & Start training

In [33]:
# Assemble the Trainer from the PEFT-wrapped model, the hyperparameters, and the train/validation datasets.
# NOTE(review): the two commented-out options at the bottom are TrainingArguments fields rather than
# Trainer arguments — if re-enabled they presumably belong in `args`; confirm.
trainer = Trainer(
    model = model,                    # Model
    args = args,                      # Hyperparameters
    train_dataset = train_data,       # Datasets
    eval_dataset = val_data,
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,padding=True),   # Batches examples; padding enabled
    #load_best_model_at_end = True,    # Save the best model after training
    #save_strategy = 'steps',
)

In [None]:
# Announce the run (message followed by one blank line), then launch training.
print("Start Training...", end="\n\n")
trainer.train()

### 2. Testing

- Load the trained model

In [None]:
# Base LLM: reload the base weights under MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,trust_remote_code=True)

# Trained Model: attach the saved PEFT weights to the base model.
# NOTE(review): `trained_path` is an empty placeholder — presumably it should point at a
# saved checkpoint directory under `out_path`; set it before running this cell.
trained_path = ''
cultural_model = PeftModel.from_pretrained(model=model,model_id=trained_path)

# Load Tokenizer (same settings as the CT-LLM loading cell in section 1.2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,use_fast=False,trust_remote_code=True)
# Reuse the EOS id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

- Zero-shot Inference

In [None]:
# Move the tuned model to the device chosen in Setup rather than calling .cuda()
# unconditionally, which fails on machines without a GPU.
cultural_model = cultural_model.to(device)
# TODO: fill in the question to ask the tuned model.
question = ""

model_inference(question,cultural_model,tokenizer)