In [1]:
# Mount Google Drive into the runtime filesystem at /content/drive.
# NOTE(review): later cells read relative paths such as './MERGE/...' and './MODELS/...';
# confirm the working directory is set so that those paths resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd

# Read the sentences from each article CSV file.

In [2]:
# Load the training split and pull each language column out as a plain Python list.
df_train_transcript = pd.read_csv('./MERGE/TRAIN_ALL.csv')
TRAIN_ESENT, TRAIN_KSENT, TRAIN_ZSENT = (
    df_train_transcript[column].tolist() for column in ('EN', 'KO', 'ZH-TW')
)
# Rebind the name to an empty list so the DataFrame is no longer referenced.
df_train_transcript = []

In [3]:
# Load the test split and pull each language column out as a plain Python list.
df_test_transcript = pd.read_csv('./MERGE/TEST_ALL.csv')
TEST_ESENT, TEST_KSENT, TEST_ZSENT = (
    df_test_transcript[column].tolist() for column in ('EN', 'KO', 'ZH-TW')
)
# Rebind the name to an empty list so the DataFrame is no longer referenced.
df_test_transcript = []

# Pretrain Model

In [4]:
import torch
from transformers import BertTokenizer, BertModel, BertConfig, BertForPreTraining, AutoModel, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertForMaskedLM
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import time

In [5]:
# Training hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when CUDA is available, otherwise CPU
max_length = 64  # max_length handed to both tokenizers (padding/truncation target)
batch_size = 16  # batch size used by the train and test DataLoaders
#batch_size = 32
learning_rate = 1e-4  # learning rate of the Adam optimizer
num_epochs = 50  # number of iterations of the training loop

In [6]:
# Load the pretrained multilingual BERT model and tokenizer (currently disabled)
#tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#bert_model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

In [7]:
# Tokenizer applied to the Chinese source sentences in train(), evaluate() and the metrics.
# NOTE(review): no Chinese encoder is loaded (the line below is disabled); the ids produced
# here are fed to the Korean BERT wrapped by TranslationModel — confirm that pairing a
# bert-base-chinese vocabulary with a Korean embedding table is intended.
zh_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
#zh_bert_model = AutoModel.from_pretrained('ckiplab/bert-base-chinese')

In [8]:
# Korean tokenizer: encodes the target sentences and decodes the model's predictions.
ko_tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
# Korean BERT that is passed to TranslationModel as its encoder.
ko_bert_model = BertModel.from_pretrained("kykim/bert-kor-base")

In [9]:
# Parallel-corpus dataset: each item pairs a source sentence with its translation.
class TranslationDataset(Dataset):
    """Map-style dataset over two aligned sequences of sentences.

    Args:
        source_sentences: indexable sequence of source-language sentences.
        target_sentences: indexable sequence of target-language sentences,
            aligned position by position with ``source_sentences``.

    Raises:
        ValueError: if the two sequences do not have the same length. Without
            this check a shorter target list only fails with an ``IndexError``
            in the middle of an epoch, and a longer one is silently truncated.
    """

    def __init__(self, source_sentences, target_sentences):
        if len(source_sentences) != len(target_sentences):
            raise ValueError(
                'source_sentences and target_sentences must have the same length '
                '(got {} and {})'.format(len(source_sentences), len(target_sentences))
            )
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.length = len(source_sentences)

    def __len__(self):
        """Number of sentence pairs."""
        return self.length

    def __getitem__(self, index):
        """Return the ``(source, target)`` pair stored at ``index``."""
        return self.source_sentences[index], self.target_sentences[index]


In [10]:
# Model definition
class TranslationModel(nn.Module):
    """Encoder followed by a per-position linear projection onto a vocabulary.

    Args:
        bert_model: encoder module that exposes ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask``, returns an object
            with a ``last_hidden_state`` attribute.
        vocab_size: number of output classes per position. When omitted, the
            notebook-level ``ko_tokenizer.vocab_size`` is used, which keeps
            existing ``TranslationModel(bert_model)`` calls and saved
            checkpoints working unchanged.
    """

    def __init__(self, bert_model, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            # Fall back to the Korean tokenizer defined earlier in the notebook.
            vocab_size = ko_tokenizer.vocab_size
        # Attribute names are part of the state_dict keys; do not rename them.
        self.bert = bert_model
        self.linear = nn.Linear(self.bert.config.hidden_size, vocab_size)

    def forward(self, input_ids, attention_mask):
        """Return the projection of the encoder's last hidden state for every position."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(outputs.last_hidden_state)


In [11]:
# Load the training dataset
train_source_sentences = TRAIN_ZSENT  # Chinese sentences
train_target_sentences = TRAIN_KSENT  # Korean sentences
dataset = TranslationDataset(train_source_sentences, train_target_sentences)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [12]:
# Held-out test split: Chinese sources paired with their Korean references.
test_source_sentences = TEST_ZSENT  # Chinese sentences
test_target_sentences = TEST_KSENT  # Korean sentences
test_dataset = TranslationDataset(test_source_sentences, test_target_sentences)
# Evaluation gains nothing from shuffling. A fixed order keeps the composition of every
# batch (including a final partial one) the same on each run, so the mean of per-batch
# losses computed by evaluate() is reproducible.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
# Initialise the model, the loss function and the optimizer
model = TranslationModel(ko_bert_model).to(device)
# NOTE(review): no ignore_index is set, so the padded target positions produced with
# padding='max_length' in train()/evaluate() also contribute to the loss — confirm this
# is intended before changing it.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
# Load the previously trained model weights.
# map_location remaps the stored tensors onto the active device, so a checkpoint that was
# written on a GPU machine still loads when only a CPU is available.
#model = TranslationModel(zh_bert_model).to(device)
model.load_state_dict(torch.load('./MODELS/translation_model(ZK4_ko).pth', map_location=device))

<All keys matched successfully>

In [15]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: timestamp in seconds taken before the work.
        end_time: timestamp in seconds taken after the work.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to ``int``.
    """
    duration = end_time - start_time
    return int(duration / 60), int(duration % 60)

In [16]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable


In [17]:
from tqdm import tqdm

In [18]:
def train(path):
    """Run one training epoch over ``dataloader`` and checkpoint the model.

    Uses the notebook-level ``model``, ``dataloader``, ``optimizer``, ``criterion``,
    ``zh_tokenizer``, ``ko_tokenizer``, ``max_length`` and ``device``.

    Args:
        path: file the model's ``state_dict`` is written to once the epoch is done.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    # Both tokenizers pad/truncate to the same fixed length.
    encode_options = dict(padding='max_length', truncation=True, return_tensors='pt', max_length=max_length)
    batch_losses = []

    for source_batch, target_batch in tqdm(dataloader, total=len(dataloader)):
        source_tokens = zh_tokenizer(source_batch, **encode_options).to(device)
        target_tokens = ko_tokenizer(target_batch, **encode_options).to(device)

        optimizer.zero_grad()
        logits = model(input_ids=source_tokens.input_ids, attention_mask=source_tokens.attention_mask)
        # CrossEntropyLoss wants the class dimension second, hence the transpose.
        loss = criterion(logits.transpose(1, 2), target_tokens.input_ids)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    torch.save(model.state_dict(), path)
    return sum(batch_losses) / len(dataloader)

In [19]:
def evaluate():
    """Compute the average loss over ``test_dataloader`` without updating weights.

    Uses the notebook-level ``model``, ``test_dataloader``, ``criterion``,
    ``zh_tokenizer``, ``ko_tokenizer``, ``max_length`` and ``device``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    # Both tokenizers pad/truncate to the same fixed length.
    encode_options = dict(padding='max_length', truncation=True, return_tensors='pt', max_length=max_length)
    batch_losses = []

    with torch.no_grad():
        for source_batch, target_batch in tqdm(test_dataloader, total=len(test_dataloader)):
            source_tokens = zh_tokenizer(source_batch, **encode_options).to(device)
            target_tokens = ko_tokenizer(target_batch, **encode_options).to(device)

            logits = model(input_ids=source_tokens.input_ids, attention_mask=source_tokens.attention_mask)
            # CrossEntropyLoss wants the class dimension second, hence the transpose.
            loss = criterion(logits.transpose(1, 2), target_tokens.input_ids)
            batch_losses.append(loss.item())

    return sum(batch_losses) / len(test_dataloader)

In [20]:
def loss_history(train_loss_values, test_loss_values, path='', to_show=False):
    """Append new loss values to the stored history, then re-plot and re-save it.

    The history lives in ``./MODELS/LOSS_HISTORY/loss_plot_<path>.csv``. Values found
    there are placed in front of the ones passed in, the combined curves are drawn
    and written to ``loss_plot_<path>.png``, and the CSV is overwritten with the
    combined series.

    Args:
        train_loss_values: list of new training losses (one per epoch).
        test_loss_values: list of new test losses, same length as ``train_loss_values``.
        path: suffix that identifies the history files (for example the model name).
        to_show: when true, also display the figure with ``plt.show()``.
    """
    csv_path = './MODELS/LOSS_HISTORY/loss_plot_{}.csv'.format(path)
    png_path = './MODELS/LOSS_HISTORY/loss_plot_{}.png'.format(path)

    # A missing, empty or differently-shaped history file simply means "start fresh";
    # any other failure is a real error and is allowed to surface.
    try:
        df_loss = pd.read_csv(csv_path)
        before_train_loss = df_loss['TRAIN_LOSS'].tolist()
        before_test_loss = df_loss['TEST_LOSS'].tolist()
    except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
        before_train_loss = []
        before_test_loss = []

    if len(before_train_loss) > 0:
        train_loss_values = before_train_loss + train_loss_values
        test_loss_values = before_test_loss + test_loss_values

    # One x position per recorded epoch
    epochs = range(1, len(train_loss_values) + 1)
    plt.clf()

    # Draw the training and test loss curves
    plt.plot(epochs, train_loss_values, 'b', label='Training loss')
    plt.plot(epochs, test_loss_values, 'r', label='Test loss')
    plt.title('Training and Test Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Save the figure
    plt.savefig(png_path)
    if to_show:
        plt.show()

    df_loss = pd.DataFrame({'TRAIN_LOSS': train_loss_values, 'TEST_LOSS': test_loss_values})
    df_loss.to_csv(csv_path, index=False)

In [21]:
def bleu_history(bleu_score_values, path='', to_show=False):
    """Append new BLEU scores to the stored history, then re-plot and re-save it.

    The history lives in ``./MODELS/LOSS_HISTORY/blue_plot_<path>.csv`` (the file name
    is kept as-is so existing history files keep being picked up). Values found there
    are placed in front of the ones passed in, the combined curve is drawn and written
    to ``blue_plot_<path>.png``, and the CSV is overwritten with the combined series.

    Args:
        bleu_score_values: list of new BLEU scores (one per epoch).
        path: suffix that identifies the history files (for example the model name).
        to_show: when true, also display the figure with ``plt.show()``.
    """
    csv_path = './MODELS/LOSS_HISTORY/blue_plot_{}.csv'.format(path)
    png_path = './MODELS/LOSS_HISTORY/blue_plot_{}.png'.format(path)

    # A missing, empty or differently-shaped history file simply means "start fresh";
    # any other failure is a real error and is allowed to surface.
    try:
        df_loss = pd.read_csv(csv_path)
        before_bleu_score = df_loss['BLEU_SCORE'].tolist()
    except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
        before_bleu_score = []

    if len(before_bleu_score) > 0:
        bleu_score_values = before_bleu_score + bleu_score_values

    # One x position per recorded epoch
    epochs = range(1, len(bleu_score_values) + 1)
    plt.clf()

    # Draw the BLEU score curve
    plt.plot(epochs, bleu_score_values, 'b', label='BLEU score')
    plt.title('BLEU SCORE')
    plt.xlabel('Epochs')
    plt.ylabel('Score')
    plt.legend()

    # Save the figure
    plt.savefig(png_path)
    if to_show:
        plt.show()

    df_loss = pd.DataFrame({'BLEU_SCORE': bleu_score_values})
    df_loss.to_csv(csv_path, index=False)

In [22]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

def bleu_score(source_sentences=None, translation_model=None, max_len=None,
               target_device=None, reference_sentences=None):
    """Translate the source sentences one by one and score them with corpus BLEU.

    Every argument is optional; an omitted argument falls back to the matching
    notebook-level object, so a plain ``bleu_score()`` call behaves as before.

    Args:
        source_sentences: sentences to translate (default ``test_source_sentences``).
        translation_model: model to evaluate (default ``model``).
        max_len: tokenizer truncation length (default ``max_length``).
        target_device: device the inputs are moved to (default ``device``).
        reference_sentences: reference translations aligned with
            ``source_sentences`` (default ``test_target_sentences``).

    Returns:
        The smoothed (method4) corpus-level BLEU score.
    """
    if source_sentences is None:
        source_sentences = test_source_sentences
    if translation_model is None:
        translation_model = model
    if max_len is None:
        max_len = max_length
    if target_device is None:
        target_device = device
    if reference_sentences is None:
        reference_sentences = test_target_sentences

    translation_model.eval()

    # Generate a target sentence for every source sentence
    generated_sentences = []
    for source_sentence in tqdm(source_sentences, total=len(source_sentences)):
        source_tokens = zh_tokenizer(
            source_sentence, padding=True, truncation=True, return_tensors='pt', max_length=max_len
        ).to(target_device)
        with torch.no_grad():
            output = translation_model(
                input_ids=source_tokens.input_ids, attention_mask=source_tokens.attention_mask
            )
        # Greedy choice per position; special tokens are dropped when decoding.
        generated_sentences.append(
            ko_tokenizer.decode(output[0].argmax(dim=-1), skip_special_tokens=True)
        )

    # Compute the BLEU score
    # NOTE(review): references and hypotheses are passed as plain strings, which NLTK
    # iterates element by element, so n-grams appear to be formed over characters rather
    # than words — confirm this is the intended granularity before comparing scores.
    smoothie = SmoothingFunction().method4
    return corpus_bleu(
        [[ref] for ref in reference_sentences], generated_sentences, smoothing_function=smoothie
    )

In [23]:
!pip install rouge

Defaulting to user installation because normal site-packages is not writeable


In [24]:
from rouge import Rouge
def rouge_score():
    """Average ROUGE-L F1 of the model's translations over the test set.

    Each test source sentence is translated, compared with its reference using the
    ``rouge`` package, and the ROUGE-L F-score is accumulated. Uses the notebook-level
    ``model``, ``zh_tokenizer``, ``ko_tokenizer``, ``test_source_sentences``,
    ``test_target_sentences``, ``max_length`` and ``device``.

    Returns:
        Mean ROUGE-L F1 over all test sentences (``0.0`` when the test set is empty).
        Pairs the scorer rejects count as zero.
    """
    model.eval()
    rouge = Rouge()

    num_sentences = len(test_source_sentences)
    if num_sentences == 0:
        return 0.0

    total_score = 0.0
    sentence_pairs = zip(test_source_sentences, test_target_sentences)
    for source_sentence, reference in tqdm(sentence_pairs, total=num_sentences):
        source_tokens = zh_tokenizer(
            source_sentence, padding=True, truncation=True, return_tensors='pt', max_length=max_length
        ).to(device)
        with torch.no_grad():
            output = model(input_ids=source_tokens.input_ids, attention_mask=source_tokens.attention_mask)
        # Greedy choice per position; special tokens are dropped when decoding.
        generated_sentence = ko_tokenizer.decode(output[0].argmax(dim=-1), skip_special_tokens=True)

        try:
            # Score this single hypothesis (not the whole list) against its reference.
            scores = rouge.get_scores(generated_sentence, reference, avg=True)
        except ValueError:
            # The rouge package raises ValueError for an empty hypothesis/reference;
            # such a pair contributes a score of zero.
            continue
        total_score += scores['rouge-l']['f']

    avg_score = total_score / num_sentences
    print('Rouge Score:', avg_score)
    return avg_score

In [25]:
# Starting thresholds for model selection in the training loop: a "best" checkpoint is
# written only when the test loss drops below best_loss; best_score is overwritten with
# the BLEU score of that epoch.
# NOTE(review): presumably the best values reached in earlier runs — confirm.
best_loss = 2.100
best_score = 0.119

In [26]:
# Per-epoch loss buffers.
# NOTE(review): the training loop rebinds both names to fresh lists at the start of every
# epoch, so the values assigned here are never read.
train_losses = []
test_losses = []

In [27]:
# Train the model
model_name = 'ZK4_ko'
last_checkpoint_path = './MODELS/translation_model({}).pth'.format(model_name)
best_checkpoint_path = './MODELS/best_translation_model({}).pth'.format(model_name)

for epoch in range(num_epochs):
    # loss_history()/bleu_history() prepend the values already stored on disk, so each
    # epoch must hand them only its own values; hence the fresh lists every iteration.
    train_losses = []
    test_losses = []
    bleu_scores = []

    start_time = time.time()
    train_loss = train(path=last_checkpoint_path)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch {epoch+1:02} | Train Time: {epoch_mins}m {epoch_secs}s')

    start_time = time.time()
    test_loss = evaluate()

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch {epoch+1:02} | Test Time: {epoch_mins}m {epoch_secs}s')

    start_time = time.time()
    score = bleu_score()
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch {epoch+1:02} | Eval Time: {epoch_mins}m {epoch_secs}s')

    # Save the model when the test loss improves
    # (alternative criterion, disabled: select on the BLEU score instead)
    #if(score > best_score):
    if test_loss < best_loss:
        best_score = score
        best_loss = test_loss
        torch.save(model.state_dict(), best_checkpoint_path)
        print('== save model ==')
    print('\tTrain Loss: {:.3f} | Test Loss: {:.3f}'.format(train_loss, test_loss))
    print('\tBLEU Score: {:.3f}'.format(score))

    # If the BLEU score recorded for the best checkpoint is essentially zero, roll the
    # weights back to that checkpoint when one exists.
    if best_score < 0.001:
        try:
            model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
        except FileNotFoundError:
            # No best checkpoint has been written yet; keep the current weights.
            pass

    train_losses.append(train_loss)
    test_losses.append(test_loss)
    bleu_scores.append(score)

    loss_history(train_losses, test_losses, path=model_name, to_show=False)
    bleu_history(bleu_scores, path=model_name, to_show=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.47it/s]


Epoch 01 | Train Time: 8m 50s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.31it/s]


Epoch 01 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:57<00:00, 203.52it/s]


Epoch 01 | Eval Time: 3m 0s
	Train Loss: 1.070 | Test Loss: 2.722
	BLEU Score: 0.116


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.47it/s]


Epoch 02 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.54it/s]


Epoch 02 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:59<00:00, 200.90it/s]


Epoch 02 | Eval Time: 3m 2s
	Train Loss: 1.035 | Test Loss: 2.745
	BLEU Score: 0.117


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 03 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.50it/s]


Epoch 03 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:57<00:00, 203.18it/s]


Epoch 03 | Eval Time: 3m 0s
	Train Loss: 1.007 | Test Loss: 2.771
	BLEU Score: 0.122


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 04 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.47it/s]


Epoch 04 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [03:00<00:00, 199.88it/s]


Epoch 04 | Eval Time: 3m 3s
	Train Loss: 0.982 | Test Loss: 2.813
	BLEU Score: 0.123


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 05 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.51it/s]


Epoch 05 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:58<00:00, 201.91it/s]


Epoch 05 | Eval Time: 3m 1s
	Train Loss: 0.956 | Test Loss: 2.854
	BLEU Score: 0.125


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 06 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.46it/s]


Epoch 06 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:58<00:00, 201.65it/s]


Epoch 06 | Eval Time: 3m 2s
	Train Loss: 0.933 | Test Loss: 2.862
	BLEU Score: 0.128


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.48it/s]


Epoch 07 | Train Time: 8m 49s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.43it/s]


Epoch 07 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:58<00:00, 202.70it/s]


Epoch 07 | Eval Time: 3m 1s
	Train Loss: 0.910 | Test Loss: 2.919
	BLEU Score: 0.126


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 08 | Train Time: 8m 47s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.48it/s]


Epoch 08 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [03:01<00:00, 198.57it/s]


Epoch 08 | Eval Time: 3m 4s
	Train Loss: 0.887 | Test Loss: 2.916
	BLEU Score: 0.133


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.49it/s]


Epoch 09 | Train Time: 8m 50s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.44it/s]


Epoch 09 | Test Time: 0m 36s


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36089/36089 [02:59<00:00, 200.55it/s]


Epoch 09 | Eval Time: 3m 3s
	Train Loss: 0.867 | Test Loss: 2.963
	BLEU Score: 0.133


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9167/9167 [08:44<00:00, 17.48it/s]


Epoch 10 | Train Time: 8m 48s


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2256/2256 [00:36<00:00, 62.44it/s]


Epoch 10 | Test Time: 0m 36s


 18%|████████████████████▍                                                                                            | 6518/36089 [00:32<02:26, 201.57it/s]

In [None]:
# Called without arguments: bleu_score() falls back to the notebook-level test sentences,
# model, max_length and device.
bleu_score()