<a href="https://colab.research.google.com/github/Jane0731/Homework/blob/main/%E9%87%91%E8%9E%8D%E5%A4%A7%E6%95%B8%E6%93%9A_W10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
pip install transformers datasets torch



In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [9]:
# 設置設備
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# 1. 加載資料集
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree", split="train")
texts = dataset["sentence"]
labels = dataset["label"]

The repository for takala/financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/takala/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [11]:
# 2. 切分訓練和驗證集
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [12]:
# 3. 加載 BERT 分詞器
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
# Tokenization
def preprocess_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    return encodings, torch.tensor(labels)

train_encodings, train_labels = preprocess_data(train_texts, train_labels)
val_encodings, val_labels = preprocess_data(val_texts, val_labels)

In [14]:
# 封裝為 DataLoader
class FinancialDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

In [15]:
train_dataset = FinancialDataset(train_encodings, train_labels)
val_dataset = FinancialDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [16]:
# 4. 加載 BERT 模型
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # 3 classes: Positive, Neutral, Negative
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
# 5. 訓練設置
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)



In [18]:
# 6. 訓練模型
def train_model():
    model.train()
    for epoch in range(3):  # 訓練 3 個 Epoch
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch}, Loss: {loss.item()}")

In [19]:
# 7. 驗證模型
def evaluate_model():
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    print(f"Validation Accuracy: {correct / total:.2f}")

In [20]:
# 8. 推論
def predict_sentiments(sentences):
    model.eval()
    encodings = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**encodings)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return [label_map[pred] for pred in preds]


In [21]:
# 執行
train_model()
evaluate_model()

# 測試推論
test_sentences = [
    "The company's profit has increased significantly this quarter.",
    "The increase in costs negatively affected the revenue.",
    "The company's performance remained stable."
]
predictions = predict_sentiments(test_sentences)
print(predictions)

Epoch: 0, Loss: 1.193615436553955
Epoch: 0, Loss: 1.1528668403625488
Epoch: 0, Loss: 1.1786421537399292
Epoch: 0, Loss: 1.068195104598999
Epoch: 0, Loss: 1.0791122913360596
Epoch: 0, Loss: 1.0092804431915283
Epoch: 0, Loss: 0.9987706542015076
Epoch: 0, Loss: 0.8508639335632324
Epoch: 0, Loss: 1.0161759853363037
Epoch: 0, Loss: 1.0573246479034424
Epoch: 0, Loss: 0.8521026372909546
Epoch: 0, Loss: 0.7401472330093384
Epoch: 0, Loss: 0.8023922443389893
Epoch: 0, Loss: 0.8138043880462646
Epoch: 0, Loss: 0.8528403043746948
Epoch: 0, Loss: 1.0233588218688965
Epoch: 0, Loss: 0.7179135680198669
Epoch: 0, Loss: 0.5526652336120605
Epoch: 0, Loss: 0.8177506923675537
Epoch: 0, Loss: 0.8667941093444824
Epoch: 0, Loss: 0.7934136390686035
Epoch: 0, Loss: 0.8136261701583862
Epoch: 0, Loss: 0.7212497591972351
Epoch: 0, Loss: 1.004144310951233
Epoch: 0, Loss: 0.7242832183837891
Epoch: 0, Loss: 0.6828690767288208
Epoch: 0, Loss: 0.8784319162368774
Epoch: 0, Loss: 0.7078714370727539
Epoch: 0, Loss: 0.84965