In [None]:
! pip install datasets
! pip install evaluate
! pip install sacrebleu
! pip install bert_score

In [None]:
# Mount Google Drive in this Colab runtime; the dataset, the checkpoint and the
# helper notebook referenced below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
import evaluate
from tqdm.auto import tqdm

In [None]:
# The dataset and the checkpoint sit side by side in one Drive folder.
_project_dir = r'/content/drive/MyDrive/2024Spring/641NaturalLanguageProcessing/NewFolder'
data_path = _project_dir + r'/dataset.json'            # JSON file passed to load_dataset
model_path = _project_dir + r'/model_state_dict.pth'   # state_dict written / read by the training cell

In [None]:
# Execute the companion encoder_decoder notebook stored on Drive.
# NOTE(review): TorchTransformer, instantiated further down, is not defined in
# this notebook -- presumably it is provided by this %run; confirm.
%run "/content/drive/MyDrive/2024Spring/641NaturalLanguageProcessing/NewFolder/code/encoder_decoder.ipynb"

In [None]:
# Load the parallel corpus from JSON and hold out 1% of it for validation.
data_files = {'train':data_path}
data = load_dataset('json',data_files=data_files)
# A fixed seed keeps the split identical across kernel restarts. Without it, a
# checkpoint restored in a later session (load_model=True) would be validated
# on a different random 1%, which may contain rows it was trained on.
data = data['train'].train_test_split(train_size=0.99,shuffle=True,seed=42)
tokenizer_en = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer_zh = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
data.set_format('pandas')

# Keep only the pre-tokenized id columns and give them shorter names.
id_columns = {'tokenized_en_id':'english','tokenized_zh_id':'chinese'}
dataset_train = data['train'][:][list(id_columns)].rename(columns=id_columns)
dataset_val = data['test'][:][list(id_columns)].rename(columns=id_columns)
dataset_train.head()

Generating train split: 0 examples [00:00, ? examples/s]

Unnamed: 0,english,chinese
0,"[101, 1188, 7108, 1110, 1208, 6241, 1272, 4194...","[101, 2190, 6821, 671, 6237, 7025, 671, 4684, ..."
1,"[101, 146, 1577, 112, 189, 1920, 2008, 1164, 1...","[101, 2769, 2190, 6821, 702, 671, 1403, 679, 2..."
2,"[101, 1124, 2085, 170, 1289, 4330, 5264, 1166,...","[101, 800, 4558, 958, 1765, 4500, 2797, 2851, ..."
3,"[101, 1284, 112, 1325, 13967, 1128, 1106, 1129...","[101, 809, 1400, 4692, 3040, 2805, 6756, 3683,..."
4,"[101, 1398, 146, 1180, 1202, 1108, 1198, 3465,...","[101, 2769, 1372, 5543, 1777, 1762, 6929, 511,..."


In [None]:
# hyperparameters
# model
vocab_size_en = len(tokenizer_en.vocab)   # source (English) vocabulary size
vocab_size_zh = len(tokenizer_zh.vocab)   # target (Chinese) vocabulary size
print(vocab_size_en,vocab_size_zh)
max_length = 512        # max length of the input sequence
n_emb = 512             # embedding size
n_head = 8              # number of heads in multi-head attention
head_size = 64          # number of 'features' output by a single-head self-attention
n_blocks = 3            # number of blocks in a encoder or decoder
n_hidden = 1024         # passed to TorchTransformer as n_hidden; presumably the feed-forward width -- confirm
# The per-head outputs must add up to exactly the embedding width.
assert head_size*n_head == n_emb, (
    f'head_size*n_head must equal n_emb, got {head_size}*{n_head} != {n_emb}')

# training
num_epochs = 15
batch_size = 128
learning_rate = 8e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

28996 21128


In [None]:
class textData(Dataset):
  """Map-style dataset over a DataFrame of pre-tokenized sentence pairs.

  Args:
    data: pandas DataFrame with an 'english' and a 'chinese' column, each cell
      holding a sequence of token ids.
  """
  def __init__(self,data):
    super().__init__()
    self.data = data

  def __getitem__(self, index):
    """Return the (english_ids, chinese_ids) tensors of the row at position `index`."""
    # .iloc is positional. Plain `series[index]` is a label lookup, which only
    # coincides with the position when the frame has a default RangeIndex and
    # raises KeyError (or returns the wrong row) otherwise.
    english_sentence = torch.tensor(self.data['english'].iloc[index])
    chinese_sentence = torch.tensor(self.data['chinese'].iloc[index])
    return english_sentence,chinese_sentence

  def __len__(self):
    """Number of sentence pairs (rows of the DataFrame)."""
    return len(self.data)

In [None]:
def collate(batch):
  """Pad a list of (english_ids, chinese_ids) pairs and build the attention masks.

  Returns a dict with:
    'input', 'target': id tensors padded to the longest sequence of the batch
      with the respective tokenizer's pad id.
    'attention_mask_input': all-False square bool mask over source positions.
    'attention_mask_input_padding': True where 'input' holds the pad id.
    'attention_mask_target': square bool mask, True strictly above the
      diagonal, sized for the target with its last position dropped.
    'attention_mask_target_padding': True where target[:, :-1] holds the pad id.
  """
  source_ids = [english for english, _ in batch]
  target_ids = [chinese for _, chinese in batch]

  # Source side: nothing is masked except padding.
  pad_en = tokenizer_en.pad_token_id
  input_padded = pad_sequence(source_ids, batch_first=True, padding_value=pad_en)
  source_len = input_padded.shape[1]
  source_mask = torch.zeros((source_len, source_len), dtype=torch.bool)
  source_padding_mask = input_padded.eq(pad_en)

  # Target side: masks are one position shorter than the padded target.
  pad_zh = tokenizer_zh.pad_token_id
  target_padded = pad_sequence(target_ids, batch_first=True, padding_value=pad_zh)
  decoder_len = target_padded.shape[1] - 1
  target_mask = torch.triu(torch.ones(decoder_len, decoder_len), diagonal=1).bool()
  target_padding_mask = target_padded[:, :-1].eq(pad_zh)

  return {
      'input': input_padded,
      'attention_mask_input': source_mask,
      'attention_mask_input_padding': source_padding_mask,
      'target': target_padded,
      'attention_mask_target': target_mask,
      'attention_mask_target_padding': target_padding_mask,
  }

In [None]:
# Training batches are shuffled on every pass over the data.
data_train = textData(dataset_train)
dataloader_train = DataLoader(data_train,batch_size=batch_size,shuffle=True,collate_fn=collate)
# Validation runs one sentence per batch (the evaluation cells decode only
# row 0 of each batch). It is not shuffled, so the per-example scores come out
# in the same order on every run.
data_val = textData(dataset_val)
dataloader_val = DataLoader(data_val,batch_size=1,shuffle=False,collate_fn=collate)

In [None]:
# Build the model, loss, optimizer and learning-rate schedule.
model = TorchTransformer(n_emb,head_size,n_head,n_blocks,vocab_size_en,vocab_size_zh,n_hidden,max_length).to(device)
# Kaiming-uniform initialisation for every parameter with more than one
# dimension; 1-D parameters keep whatever the modules set by default.
for p in model.parameters():
  if p.dim() > 1:
    nn.init.kaiming_uniform_(p, nonlinearity='relu')
# The loss targets are Chinese ids, which collate pads with
# tokenizer_zh.pad_token_id, so that is the id the loss has to ignore.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer_zh.pad_token_id)
optim = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
# Stepped once per epoch by the training loop, hence T_max=num_epochs.
# The spelling `schedular` is kept because the training cell refers to it.
schedular = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=num_epochs)

In [None]:
# Report model and dataset sizes, then load the two evaluation metrics.
n_parameters = sum(p.numel() for p in model.parameters())
print('parameters of this model: ',n_parameters)
n_train, n_val = len(data_train), len(data_val)
print('number of examples in training set: ',n_train, ', in validation set: ',n_val)
num_training_steps = num_epochs*len(dataloader_train)   # total optimizer steps over all epochs
metric_bleu = evaluate.load('sacrebleu')
metric_bert = evaluate.load('bertscore')

parameters of this model:  52275848
number of examples in training set:  99000 , in validation set:  1000


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Score the model on the validation set with BLEU and BERTScore. This cell sits
# before the training cell, so on a top-to-bottom run it measures the freshly
# initialised model.
start = time.time()
model.eval()
progress_bar = tqdm(dataloader_val)
with torch.no_grad():
  # Iterating the tqdm object already advances the bar once per batch; an extra
  # progress_bar.update(1) would count every batch twice.
  for batch in progress_bar:
    input_ids = batch['input'].to(device)
    attention_mask_input = batch['attention_mask_input'].to(device)
    attention_mask_input_padding = batch['attention_mask_input_padding'].to(device)

    output = model.generate(input_ids,attention_mask_input,attention_mask_input_padding,device=device)
    prediction = tokenizer_zh.decode(output,skip_special_tokens=True)
    # dataloader_val uses batch_size=1, so row 0 is the only reference. It is
    # decoded straight from the CPU tensor; no device transfer is needed.
    reference = tokenizer_zh.decode(batch['target'][0],skip_special_tokens=True)
    metric_bleu.add_batch(predictions=[prediction],references=[reference])
    metric_bert.add_batch(predictions=[prediction],references=[reference])
result_bleu = metric_bleu.compute()
result_bert = metric_bert.compute(lang='zh')
print(result_bleu)
print(result_bert)
print('time: ',time.time()-start)

  0%|          | 0/1000 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

{'score': 0.00016440325194080712, 'counts': [3, 0, 0, 0], 'totals': [284519, 283519, 282529, 281542], 'precisions': [0.0010544111289579959, 0.00017635502382556373, 8.848649165218438e-05, 4.439834909178737e-05], 'bp': 1.0, 'sys_len': 284519, 'ref_len': 18711}
{'precision': [0.5781648755073547, 0.24324005842208862, 0.23829790949821472, 0.16423892974853516, 0.28232264518737793, 0.4003729820251465, 0.2822442352771759, 0.2004343867301941, 0.31197798252105713, 0.16041360795497894, 0.23542329668998718, 0.25404173135757446, 0.22069227695465088, 0.2812125086784363, 0.27945297956466675, 0.2523030936717987, 0.2833980321884155, 0.22686046361923218, 0.15816543996334076, 0.25783228874206543, 0.26295918226242065, 0.24981343746185303, 0.2466575801372528, 0.4851374626159668, 0.23823250830173492, 0.3100055456161499, 0.2744683623313904, 0.29015377163887024, 0.22502091526985168, 0.2581726014614105, 0.2029334306716919, 0.14556315541267395, 0.25220736861228943, 0.2691265940666199, 0.11037871241569519, 0.203

In [None]:
# Either restore a saved checkpoint (load_model=True) or train the model and
# save its state_dict to model_path.
load_model = False
if load_model:
  # map_location lets a checkpoint written on a GPU runtime be restored on a
  # CPU-only one; without it torch.load tries to recreate the CUDA tensors.
  model.load_state_dict(torch.load(model_path, map_location=device))
else:
  count = 0      # optimizer steps taken so far, across epochs
  lossi = []     # per-step training loss, plotted at the end
  start = time.time()
  for epoch in range(num_epochs):
    # train
    model.train()
    for batch in dataloader_train:
      inputs = batch['input'].to(device)
      attention_mask_input = batch['attention_mask_input'].to(device)
      attention_mask_input_padding = batch['attention_mask_input_padding'].to(device)
      targets = batch['target'].to(device)
      attention_mask_target = batch['attention_mask_target'].to(device)
      attention_mask_target_padding = batch['attention_mask_target_padding'].to(device)
      # The decoder is fed the target without its last token and is scored
      # against the target without its first token (shift by one position).
      y_pred = model(inputs,targets[:,:-1],
                     attention_mask_input,attention_mask_target,
                     attention_mask_input_padding,attention_mask_target_padding,
                     attention_mask_input_padding)

      # CrossEntropyLoss wants the class dimension second; presumably y_pred is
      # (batch, seq, vocab) before the transpose -- confirm against the model.
      loss = loss_fn(y_pred.transpose(1,2), targets[:,1:])
      optim.zero_grad()
      loss.backward()
      optim.step()

      lossi.append(loss.item())
      count += 1
      if count % 200 == 0:
        print('loss: ',loss.item())
    # The cosine schedule advances once per epoch.
    schedular.step()
    # Reports the loss of the last batch of the epoch, not an epoch average.
    print('epoch: ',epoch,' loss: ',loss.item())
  # post-process after training
  torch.save(model.state_dict(), model_path)
  end = time.time()
  print(f'time for {num_epochs} epoches: ',end-start)
  plt.plot(lossi)

In [None]:
# Score the model on the validation set with BLEU and BERTScore. This cell sits
# after the load-or-train cell, so it measures the trained or restored weights.
start = time.time()
model.eval()
progress_bar = tqdm(dataloader_val)
with torch.no_grad():
  # Iterating the tqdm object already advances the bar once per batch; an extra
  # progress_bar.update(1) would count every batch twice.
  for batch in progress_bar:
    input_ids = batch['input'].to(device)
    attention_mask_input = batch['attention_mask_input'].to(device)
    attention_mask_input_padding = batch['attention_mask_input_padding'].to(device)

    output = model.generate(input_ids,attention_mask_input,attention_mask_input_padding,device=device)
    prediction = tokenizer_zh.decode(output,skip_special_tokens=True)
    # dataloader_val uses batch_size=1, so row 0 is the only reference. It is
    # decoded straight from the CPU tensor; no device transfer is needed.
    reference = tokenizer_zh.decode(batch['target'][0],skip_special_tokens=True)
    metric_bleu.add_batch(predictions=[prediction],references=[reference])
    metric_bert.add_batch(predictions=[prediction],references=[reference])
result_bleu = metric_bleu.compute()
result_bert = metric_bert.compute(lang='zh')
print(result_bleu)
print(result_bert)
print('time: ',time.time()-start)

In [None]:
model.eval()
def generate_fn(input_sentence, skip_special_tokens=False):
  """Translate one English sentence with the global `model`.

  Args:
    input_sentence: English text to translate.
    skip_special_tokens: forwarded to tokenizer_zh.decode. The default (False)
      keeps special tokens in the returned string, as this function always did.

  Returns:
    The output of model.generate decoded to a string by tokenizer_zh.
  """
  tokens = tokenizer_en.tokenize(input_sentence,add_special_tokens=True)
  inputs = tokenizer_en.convert_tokens_to_ids(tokens)
  input_ids = torch.tensor(inputs).unsqueeze(0).to(device)   # add a batch dimension of 1
  seq_len = input_ids.shape[1]
  # A single unpadded sentence: both masks are all False.
  attention_mask_input = (torch.ones([seq_len,seq_len])==0).to(device)
  attention_mask_input_padding = (torch.ones(seq_len)==0).unsqueeze(0).to(device)

  # Inference only: disable autograd tracking, as the evaluation loops do.
  with torch.no_grad():
    output = model.generate(input_ids,attention_mask_input,attention_mask_input_padding,device=device)
  output = tokenizer_zh.decode(output,skip_special_tokens=skip_special_tokens)
  return output

In [None]:
# Translate one example sentence and display the decoded result as the cell output.
model.eval()
input_sentence = 'can you tell me the truth?'
output = generate_fn(input_sentence)
output