# 掛載 Google 雲端硬碟

In [None]:
# Colab helper module that exposes Google Drive inside the runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/drive; the data and model paths used by this notebook
# are located under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Working directory on the mounted Drive; the dataset JSON files are read from here.
current_dir = '/content/drive/MyDrive/tutorial/BERT'
print(current_dir)
# The model checkpoint, predictions and scores are written under <current_dir>/model.
model_path = os.path.join(current_dir,'model')
# exist_ok=True creates the directory only when it is missing, so the cell can be
# re-run safely without a separate (racy) existence check.
os.makedirs(model_path, exist_ok=True)

/content/drive/MyDrive/tutorial/bert


# 載入模型所需套件

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel

from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

import pandas as pd
import json

# 讀取訓練資料及測試資料

In [None]:

def data_load(data_path):
    """Load a JSON annotation file into a flat, one-row-per-aspect DataFrame.

    The file must contain a JSON list of records. Each record is read through
    the keys ``ID``, ``Sentence``, ``Aspect`` (a list) and ``Intensity`` (a list
    of ``"<valence>#<arousal>"`` strings indexed in parallel with ``Aspect``).

    Records whose ``Aspect`` list contains duplicate entries are skipped
    entirely.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        pandas.DataFrame with the columns ``ID``, ``Text``, ``Aspect``,
        ``Valence`` and ``Arousal``; the last two hold floats.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        IndexError: If ``Intensity`` is shorter than ``Aspect`` or an entry
            has no ``#`` separator.
        ValueError: If a valence/arousal part cannot be parsed as a float.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read correctly regardless
    # of the platform's default encoding.
    with open(data_path, encoding='utf-8') as f:
        data = json.load(f)

    columns = ['ID', 'Text', 'Aspect', 'Valence', 'Arousal']
    rows = []
    for sentence in data:
        aspects = sentence['Aspect']
        # Skip sentences in which the same aspect term appears more than once.
        if len(set(aspects)) != len(aspects):
            continue
        for num, aspect in enumerate(aspects):
            parts = sentence['Intensity'][num].split('#')
            rows.append((
                sentence['ID'],
                sentence['Sentence'],
                aspect,
                float(parts[0]),
                float(parts[1]),
            ))
    # Passing `columns` keeps the schema stable even when no row was produced.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Read the training and testing JSON files from current_dir into DataFrames.
train_data = data_load(os.path.join(current_dir,'SIGHAN2024_dimABSA_TrainingSet1_Traditional.json'))
# The test file is parsed by the same loader as the training file.
test_data = data_load(os.path.join(current_dir,'SIGHAN2024_dimABSA_Testing_Task1_Traditional.json'))

In [None]:
def preprocess_data(data, tokenizer, option, max_length=128):
    """Tokenize (sentence, aspect) pairs and build the regression targets.

    Args:
        data: DataFrame with ``Text`` and ``Aspect`` columns plus the numeric
            column named by ``option``.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention ``tokenizer(text, text_pair, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping with
            ``input_ids`` and ``attention_mask``.
        option: Name of the target column in ``data`` (e.g. ``'Valence'``).
        max_length: Maximum length passed to the tokenizer for truncation.

    Returns:
        Tuple ``(input_ids, attention_mask, intensity)`` where ``intensity`` is
        a float32 tensor of shape ``(len(data), 1)``.
    """
    # Pull whole columns at once instead of indexing the frame row by row.
    sentences = data['Text'].astype(str).tolist()
    aspects = data['Aspect'].astype(str).tolist()

    # Encode sentence and aspect as a text pair and let the tokenizer insert the
    # special tokens. Do not write "[CLS]"/"[SEP]" into the text by hand: the
    # tokenizer adds special tokens by default, so hand-written markers would
    # end up duplicated in the encoded sequence.
    inputs = tokenizer(
        sentences,
        aspects,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Shape (N,) -> (N, 1) so the targets line up with a single-output regressor.
    intensity = torch.tensor(data[option].to_list(), dtype=torch.float32).unsqueeze(1)
    return inputs['input_ids'], inputs['attention_mask'], intensity

In [None]:
# Tokenizer belonging to the pretrained 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Target column to regress; it is also embedded in the names of the model,
# prediction and score files written later in the notebook.
option = 'Valence'

# Encode the training split into tensors.
input_ids, attention_mask, intensity = preprocess_data(
    data=train_data,
    tokenizer=tokenizer,
    option = option
    )

# Training batches are reshuffled on every pass over the loader.
train_dataset = TensorDataset(input_ids, attention_mask, intensity)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# NOTE: input_ids / attention_mask are rebound here, so from this point on the
# training tensors are reachable only through train_dataset.
input_ids, attention_mask, test_intensity = preprocess_data(
    data=test_data,
    tokenizer=tokenizer,
    option = option
    )
# shuffle=False keeps the batches in the same order as test_intensity, which the
# evaluation code pairs element-by-element with the predictions.
test_dataset = TensorDataset(input_ids, attention_mask, test_intensity)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

# 建立 BERT 模型

In [None]:
# 定義模型
class BERT_MLP(nn.Module):
    """Pretrained BERT encoder followed by a two-layer MLP regression head.

    Args:
        bert_model_name: Identifier handed to ``BertModel.from_pretrained``.
        hidden_size: Width of the hidden layer inside the MLP head.
    """

    def __init__(self, bert_model_name='bert-base-chinese', hidden_size=128):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        encoder_dim = self.bert.config.hidden_size
        # Head layout: Linear -> ReLU -> Linear producing a single value.
        head_layers = [
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return one predicted value per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the BERT encoder.

        Returns:
            Output of the MLP head applied to the encoder state at sequence
            position 0; the last dimension has size 1.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 along the sequence axis is used as the sequence summary
        # (the [CLS] position for BERT-style inputs).
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.mlp(first_token_state)

In [None]:
 #建立模型
# Select the compute device first, then report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Build the network and move its parameters onto the selected device.
model = BERT_MLP()
model = model.to(device)

# Adam with a learning rate of 2e-5, optimizing mean squared error.
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

# Checkpoint file name carries the target column name as its prefix.
model_save_path = os.path.join(model_path, "{}_best_model.pth".format(option))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

# 訓練模型

In [None]:
# Lowest epoch loss seen so far.
# NOTE(review): despite its name, this value is compared against the *training*
# loss below; no validation split is evaluated anywhere in this loop.
best_val_loss = float('inf')
epochs = 1
for epoch in range(epochs):
    # Switch the model to training mode before iterating over the batches.
    model.train()
    train_loss = 0.0
    for input_ids, attention_mask, labels in tqdm(train_loader):
        # Move the batch onto the same device as the model.
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        # One optimization step: clear old gradients, forward pass, loss,
        # backpropagation, parameter update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        # Accumulate the scalar batch loss for the epoch average.
        train_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    train_loss /= len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")

    # Save a checkpoint whenever the epoch's mean training loss improves on the
    # best value seen so far.
    if train_loss < best_val_loss:
        best_val_loss = train_loss
        torch.save(model.state_dict(), model_save_path)
        print(f"Best model saved at epoch {epoch + 1}")

100%|██████████| 94/94 [00:27<00:00,  3.46it/s]


Epoch 1/1, Train Loss: 4.8227
Best model saved at epoch 1


# 測試模型

In [None]:
# Restore the saved checkpoint. map_location places the loaded tensors on the
# current device, so this also works when the checkpoint was written from a
# different device than the one in use now.
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.eval()
outputs = []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        # Only the model inputs are needed on the device; the labels are not
        # used during inference.
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        output = model(input_ids, attention_mask)
        # Flatten (batch, 1) -> (batch,) and collect one tensor per batch, so the
        # concatenated result is a 1-D tensor with one prediction per sample.
        outputs.append(output.squeeze(1))

outputs = torch.cat(outputs)

# Write "<gold> <prediction>" per line; the context manager guarantees that the
# file is closed even if writing fails part-way through.
with open(os.path.join(model_path,f'{option}_predict.txt'),'w') as predict_file:
    for y,pred in zip(test_intensity, outputs):
        print(y.item(),pred.item(),file=predict_file)

In [None]:
# Convert gold values and predictions to 1-D NumPy arrays once and reuse them
# for both metrics. reshape(-1) flattens the (N, 1) gold tensor and, unlike a
# bare squeeze(), still yields a 1-D array when there is only one sample.
y_true = test_intensity.reshape(-1).cpu().numpy()
y_pred = outputs.reshape(-1).cpu().numpy()

mae = mean_absolute_error(y_true, y_pred)
# pearsonr returns (statistic, p-value); only the correlation is kept.
pr = pearsonr(y_true, y_pred)[0]

print('MAE: %.3f' % (mae))
print('Pearsonr: %.3f' % (pr))

# Persist the scores next to the model; the context manager closes the file even
# if a write raises.
with open(os.path.join(model_path,f'{option}_score.txt'),'w') as score:
    score.write('MAE: %.3f' % (mae)+'\n')
    score.write('Pearsonr: %.3f' % (pr)+'\n')

MAE: 0.778
Pearsonr: 0.053
