In [1]:
!pip install datasets # HuggingFace Datasets
#!pip install wandb #如果要使用wandb請把相關註解打開
#import wandb
#wandb.login()



In [2]:
import torch

# Check up front whether the Colab environment is already using a GPU.
# If it is not, the runtime may have to be restarted later, so find out
# before running anything else.
torch.cuda.is_available() # The displayed result should be `True`.

False

In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


In [4]:
import random
# One seed shared by every random number generator used in this notebook.
random_seed = 42

# Seed PyTorch, NumPy and the standard library, in that order.
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(random_seed)

In [5]:
#!pip list

Package                               Version
------------------------------------- -------------------
absl-py                               1.4.0
accelerate                            1.6.0
aiohappyeyeballs                      2.6.1
aiohttp                               3.11.15
aiosignal                             1.3.2
alabaster                             1.0.0
albucore                              0.0.24
albumentations                        2.0.6
ale-py                                0.11.0
altair                                5.5.0
annotated-types                       0.7.0
anyio                                 4.9.0
argon2-cffi                           23.1.0
argon2-cffi-bindings                  21.2.0
array_record                          0.7.2
arviz                                 0.21.0
astropy                               7.0.1
astropy-iers-data                     0.2025.4.28.0.37.27
astunparse                            1.6.3
atpublic                              5

In [45]:
# Identifier of the dataset that is loaded with `load_dataset`.
DATA_NAME = "dair-ai/emotion"
# Identifier of the pretrained checkpoint used for the tokenizer and the model.
MODEL_NAME = "bert-base-uncased" # You can try other models.
#MODEL_NAME = "distilbert-base-uncased"


In [46]:
# Load the train / validation / test splits of the dataset, in that order.
train_data, valid_data, test_data = (
    load_dataset(DATA_NAME, split=split_name)
    for split_name in ("train", "validation", "test")
)

In [47]:
# Inspect the distinct class ids in the training split and count them.
label_ids = np.unique(train_data["label"])
print(label_ids)

num_labels = len(label_ids)

[0 1 2 3 4 5]


In [48]:
# Let the checkpoint select its own tokenizer class, so that changing
# MODEL_NAME (for example to a DistilBERT checkpoint) does not pair the
# checkpoint with a BERT-only tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [49]:
# Show the vocabulary id of every special token known to the tokenizer.
for special_token in tokenizer.special_tokens_map.values():
    special_token_id = tokenizer.convert_tokens_to_ids(special_token)
    print(f"token {special_token} 的 ID 為：{special_token_id}")

token [UNK] 的 ID 為：100
token [SEP] 的 ID 為：102
token [PAD] 的 ID 為：0
token [CLS] 的 ID 為：101
token [MASK] 的 ID 為：103


In [50]:
# TODO1: Pre-process sentences with `tokenizer`.
# Configure the `tokenizer` call so that sentences longer than
# BERT's maximum input length are truncated.

# Please note that we don't need to perform padding at this step,
# because we will perform dynamic padding later with DataCollator.

def preprocess_function(examples):
    """Tokenize the ``text`` entry of a batch of examples.

    Truncation is enabled and padding is disabled, so every sentence keeps
    its own length (dynamic padding is left to a later step).

    Args:
        examples: Mapping whose ``"text"`` entry holds the raw sentences.

    Returns:
        The value returned by ``tokenizer`` for those sentences.
    """
    sentences = examples["text"]
    # Static-padding alternative:
    #   tokenizer(sentences, truncation=True, padding="max_length", max_length=128)
    encoded = tokenizer(sentences, truncation=True, padding=False)
    return encoded

In [51]:
# Tokenize every split; the raw "text" column is removed from the encoded copies.
#
# Note: `batched` is not the same thing as a batch size.
# `batched=True` makes `map` process the data in batches,
# which usually computes faster.
encoded_train, encoded_valid, encoded_test = (
    split.map(preprocess_function, batched=True, remove_columns=["text"])
    for split in (train_data, valid_data, test_data)
)

In [52]:
# Observation with a tokenized example: its ids, the ids decoded back to
# text, and the raw sentence it came from.

first_token_ids = encoded_test[0]["input_ids"]
decoded_text = tokenizer.decode(first_token_ids, skip_special_tokens=False)
original_text = test_data[0]["text"]

print(f"第一筆資料被轉換成 IDs 的結果: {first_token_ids}")
print(f"把 IDs 換回原本文字: {decoded_text}")
print(f"原始文字: {original_text}")

# You can also set `padding=True` in `preprocess_function` to observe the difference.

第一筆資料被轉換成 IDs 的結果: [101, 10047, 3110, 2738, 11083, 2061, 10047, 2025, 2200, 12479, 2157, 2085, 102]
把 IDs 換回原本文字: [CLS] im feeling rather rotten so im not very ambitious right now [SEP]
原始文字: im feeling rather rotten so im not very ambitious right now


In [53]:
# Set up the DataCollator for dynamic padding: sequences are padded when a
# batch is assembled rather than during tokenization.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [54]:
# Observation: dynamic padding with the DataCollator.
# The padded lengths are different in different batches.

tmp_batch1, tmp_batch2 = (
    data_collator(encoded_test[start:start + 5]) for start in (0, 5)
)

for padded_batch in (tmp_batch1, tmp_batch2):
    print(len(padded_batch["input_ids"][0]))

25
44


In [55]:
# Set up the pretrained model for sequence classification.
# The Auto class picks the architecture that matches MODEL_NAME, so trying
# another checkpoint does not load its weights into a BERT-only class.
# The classification head is sized for the `num_labels` classes of the dataset.

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
# TODO2: Please finish `TrainingArguments` (it holds the training hyper-parameters).

# Derive every artifact name from MODEL_NAME so that checkpoints, logs and the
# run name always identify the checkpoint that is actually being fine-tuned.
# "/" is replaced so that namespaced checkpoints do not create nested folders.
run_name = f"{MODEL_NAME.replace('/', '_')}_dynamic"

training_args = TrainingArguments(
    output_dir=f"{run_name}/results",
    run_name=run_name,

    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,  # effective train batch: 16 * 2 per device
    warmup_steps=500,
    weight_decay=0.01,

    # Evaluate and checkpoint once per epoch. `save_steps` / `eval_steps` are
    # intentionally not set: they only apply to the "steps" strategies.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    save_total_limit=10,

    # Logging
    logging_dir=f"{run_name}/logs",
    logging_steps=10,
    seed=random_seed,
    report_to='none'
    #report_to='wandb' # You can use wandb
)

# To log to Weights & Biases, switch `report_to` above and uncomment:
# wandb.init(
#     project="CGU_DL_HW3",
#     name=run_name,
#     reinit=True,
# )

'wandb.init(\n    project="CGU_DL_HW3",\n    name="bert-base-uncased-static",\n    reinit=True\n)'

In [57]:
def compute_metrics(model_eval_pred):
    """Score the predictions of one evaluation pass.

    Args:
        model_eval_pred: Object exposing ``predictions`` (per-class scores,
            reduced with ``argmax`` over axis 1) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use the ``"weighted"`` average.
    """
    preds = model_eval_pred.predictions.argmax(axis=1)
    labels = model_eval_pred.label_ids

    # TODO3: Write scoring functions to get acc, precision, recall, and f1-score.
    # `zero_division=0` keeps the score at 0 (without a warning) for a class
    # that is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        # Key is "f1" (digit one); it used to be misspelled "fl" (letter L).
        'f1': f1,
    }

In [58]:
# TODO4: Please finish `Trainer`

trainer = Trainer(
    # What to train and with which hyper-parameters.
    model=model,
    args=training_args,
    # How examples are assembled into batches.
    data_collator=data_collator,
    # Data used for training and for evaluation.
    train_dataset=encoded_train,
    eval_dataset=encoded_valid,
    # How evaluation predictions are scored.
    compute_metrics=compute_metrics,
)

In [59]:
# Use 1 GPU for training.
# NOTE(review): this overwrites the private `_n_gpu` attribute of the
# training arguments; presumably it keeps the Trainer from spreading the
# model over several GPUs. Confirm the installed transformers version still
# honors it.
trainer.args._n_gpu=1

In [61]:
trainer.train() # Start training

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the encoded test split and keep the resulting metrics.
metrics = trainer.evaluate(encoded_test)

In [None]:
# Show the raw metrics returned by the evaluation.
print(metrics)

In [None]:
# Lay the metrics out as a two-column table and print it as Markdown.
# NOTE(review): `DataFrame.to_markdown` needs the optional `tabulate`
# package; confirm it is installed in the target environment.
df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
print(df.to_markdown(index=False))