In [11]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 讀取 SICK.txt，並只保留 sentence_A, sentence_B, entailment_label
df = pd.read_csv("SICK.txt", sep="\t")
df = df[["sentence_A", "sentence_B", "entailment_label"]]
print("原始資料形狀:", df.shape)

# 可存檔整理後的資料（選用）
#df.to_csv("SICK_filtered.tsv", sep="\t", index=False)

# 切分資料（80% 訓練, 20% 測試），以 entailment_label 做 stratify
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["entailment_label"])
print("訓練集形狀:", df_train.shape, "測試集形狀:", df_test.shape)

原始資料形狀: (9840, 3)
訓練集形狀: (7872, 3) 測試集形狀: (1968, 3)


In [12]:

def tokenize(text):
    return text.lower().split()

# 建立詞彙（以訓練集兩個句子合併計算詞頻）
all_texts = df_train["sentence_A"].tolist() + df_train["sentence_B"].tolist()
from collections import Counter
counter = Counter()
for t in all_texts:
    counter.update(tokenize(t))
    
max_vocab_size = 10000
# 保留 <PAD> 與 <UNK>
vocab = {"<PAD>": 0, "<UNK>": 1}
for word, freq in counter.most_common(max_vocab_size - 2):
    vocab[word] = len(vocab)
vocab_size = len(vocab)
print("詞彙大小:", vocab_size)

def text_to_ids(text, vocab, max_length=50):
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    if len(ids) < max_length:
        ids = ids + [vocab["<PAD>"]] * (max_length - len(ids))
    else:
        ids = ids[:max_length]
    return ids

# 設定最大序列長度
max_length = 50

詞彙大小: 2439


In [13]:
class SICKDataset(Dataset):
    def __init__(self, df, vocab, max_length):
        self.df = df.reset_index(drop=True)
        self.vocab = vocab
        self.max_length = max_length
        self.sentences_A = df["sentence_A"].tolist()
        self.sentences_B = df["sentence_B"].tolist()
        self.labels = df["entailment_label"].tolist()
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        sent_A = self.sentences_A[idx]
        sent_B = self.sentences_B[idx]
        # 轉換成 token id 列表
        ids_A = torch.tensor(text_to_ids(sent_A, self.vocab, self.max_length), dtype=torch.long)
        ids_B = torch.tensor(text_to_ids(sent_B, self.vocab, self.max_length), dtype=torch.long)
        label = self.labels[idx]
        return ids_A, ids_B, label


le = LabelEncoder()
df_train["label_enc"] = le.fit_transform(df_train["entailment_label"])
df_test["label_enc"] = le.transform(df_test["entailment_label"])
num_classes = len(le.classes_)
print("標籤類別:", le.classes_)


def create_dataloader(df, vocab, max_length, batch_size, shuffle=True):
    dataset = SICKDataset(df, vocab, max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

標籤類別: ['CONTRADICTION' 'ENTAILMENT' 'NEUTRAL']


In [14]:
class EmbeddingLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, max_length, dropout=0.5):
        super(EmbeddingLSTMClassifier, self).__init__()
        # 分別為 sentence_A 與 sentence_B 建立獨立的 Embedding 與 LSTM encoder
        self.embedding_A = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm_A = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        
        self.embedding_B = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm_B = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        
        # 全連接層：將兩個 LSTM 的最後隱藏狀態串接後輸入
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )
    
    def forward(self, ids_A, ids_B):
        # sentence_A branch
        emb_A = self.embedding_A(ids_A)  # shape: (batch, max_length, embed_dim)
        out_A, (h_A, _) = self.lstm_A(emb_A)  # h_A shape: (num_layers, batch, hidden_dim)
        # 取最末層隱藏狀態
        h_A = h_A[-1]  # shape: (batch, hidden_dim)
        
        # sentence_B branch
        emb_B = self.embedding_B(ids_B)  # shape: (batch, max_length, embed_dim)
        out_B, (h_B, _) = self.lstm_B(emb_B)
        h_B = h_B[-1]  # shape: (batch, hidden_dim)
        
        # 串接兩邊的向量（不相加）
        features = torch.cat([h_A, h_B], dim=1)  # shape: (batch, hidden_dim*2)
        
        logits = self.fc(features)
        return logits

In [18]:
def train_and_evaluate(model, train_loader, test_loader, num_epochs, lr):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    epoch_list = []
    
    for epoch in range(1, num_epochs+1):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for ids_A, ids_B, labels in train_loader:
            # 轉換標籤為 tensor
            labels = torch.tensor(le.transform(labels), dtype=torch.long).to(device)
            ids_A, ids_B = ids_A.to(device), ids_B.to(device)
            
            optimizer.zero_grad()
            outputs = model(ids_A, ids_B)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * ids_A.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
        
        train_loss = running_loss / total
        train_acc = correct / total
        
        # 測試階段
        model.eval()
        test_running_loss = 0.0
        test_correct = 0
        test_total = 0
        with torch.no_grad():
            for ids_A, ids_B, labels in test_loader:
                labels = torch.tensor(le.transform(labels), dtype=torch.long).to(device)
                ids_A, ids_B = ids_A.to(device), ids_B.to(device)
                outputs = model(ids_A, ids_B)
                loss = criterion(outputs, labels)
                test_running_loss += loss.item() * ids_A.size(0)
                _, preds = torch.max(outputs, 1)
                test_correct += (preds == labels).sum().item()
                test_total += labels.size(0)
        
        test_loss = test_running_loss / test_total
        test_acc = test_correct / test_total
        
        epoch_list.append(epoch)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        
        print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")
    
    return epoch_list, train_losses, test_losses, train_accuracies, test_accuracies


batch_size = 32
num_epochs = 20
learning_rate = 0.0001

X = df_train.copy()

In [19]:
fractions = [1.0, 0.5, 0.25, 0.1]

for frac in fractions:
    num_samples = int(len(X) * frac)
    df_train_subset = X.iloc[:num_samples].reset_index(drop=True)
    subset_percentage = int(frac*100)
    
    train_loader = create_dataloader(df_train_subset, vocab, max_length, batch_size, shuffle=True)
    test_loader = create_dataloader(df_test, vocab, max_length, batch_size, shuffle=False)
    
    print(f"\n[比例 {subset_percentage}%] 訓練資料筆數: {len(df_train_subset)}")
    
    model = EmbeddingLSTMClassifier(vocab_size, embed_dim=128, hidden_dim=128, num_classes=num_classes, max_length=max_length, dropout=0.5)
    model.to(device)
    
    epoch_list, train_losses, test_losses, train_accs, test_accs = train_and_evaluate(model, train_loader, test_loader, num_epochs, learning_rate)
    
    # 畫圖
    fig, axs = plt.subplots(1, 2, figsize=(12,5))
    axs[0].plot(epoch_list, train_losses, label="Train Loss")
    axs[0].plot(epoch_list, test_losses, label="Test Loss")
    axs[0].set_xlabel("Epoch")
    axs[0].set_ylabel("Loss")
    axs[0].set_title(f"Loss vs Epoch (Train Subset {subset_percentage}%)")
    axs[0].legend()
    
    axs[1].plot(epoch_list, train_accs, label="Train Acc")
    axs[1].plot(epoch_list, test_accs, label="Test Acc")
    axs[1].set_xlabel("Epoch")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_title(f"Accuracy vs Epoch (Train Subset {subset_percentage}%)")
    axs[1].legend()
    
    plt.tight_layout()
    plt.savefig(f"EmbeddingLSTM_SICK_{subset_percentage}.png")
    plt.close()
    
    # 使用測試集評估最終模型並印出分類報告
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for ids_A, ids_B, labels in test_loader:
            labels_tensor = torch.tensor(le.transform(labels), dtype=torch.long).to(device)
            ids_A, ids_B = ids_A.to(device), ids_B.to(device)
            outputs = model(ids_A, ids_B)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels_tensor.cpu().numpy())
    
    print(f"分類報告 (Train Subset {subset_percentage}%):")
    print(classification_report(all_labels, all_preds, target_names=le.classes_))




[比例 100%] 訓練資料筆數: 7872
Epoch 1: Train Loss=0.9955, Train Acc=0.5683 | Test Loss=0.9603, Test Acc=0.5686
Epoch 2: Train Loss=0.9654, Train Acc=0.5686 | Test Loss=0.9588, Test Acc=0.5686
Epoch 3: Train Loss=0.9619, Train Acc=0.5686 | Test Loss=0.9556, Test Acc=0.5686
Epoch 4: Train Loss=0.9495, Train Acc=0.5686 | Test Loss=0.9367, Test Acc=0.5686
Epoch 5: Train Loss=0.9350, Train Acc=0.5686 | Test Loss=0.9407, Test Acc=0.5686
Epoch 6: Train Loss=0.9269, Train Acc=0.5686 | Test Loss=0.9284, Test Acc=0.5686
Epoch 7: Train Loss=0.9200, Train Acc=0.5686 | Test Loss=0.9218, Test Acc=0.5686
Epoch 8: Train Loss=0.9205, Train Acc=0.5686 | Test Loss=0.9150, Test Acc=0.5686
Epoch 9: Train Loss=0.9163, Train Acc=0.5687 | Test Loss=0.9199, Test Acc=0.5686
Epoch 10: Train Loss=0.9122, Train Acc=0.5722 | Test Loss=0.9094, Test Acc=0.5681
Epoch 11: Train Loss=0.9067, Train Acc=0.5804 | Test Loss=0.8971, Test Acc=0.5833
Epoch 12: Train Loss=0.8990, Train Acc=0.5854 | Test Loss=0.8820, Test Acc=0.5864
E

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1: Train Loss=1.0223, Train Acc=0.5650 | Test Loss=0.9594, Test Acc=0.5686
Epoch 2: Train Loss=0.9608, Train Acc=0.5704 | Test Loss=0.9590, Test Acc=0.5686
Epoch 3: Train Loss=0.9605, Train Acc=0.5704 | Test Loss=0.9596, Test Acc=0.5686
Epoch 4: Train Loss=0.9633, Train Acc=0.5704 | Test Loss=0.9596, Test Acc=0.5686
Epoch 5: Train Loss=0.9592, Train Acc=0.5704 | Test Loss=0.9623, Test Acc=0.5686
Epoch 6: Train Loss=0.9525, Train Acc=0.5704 | Test Loss=0.9502, Test Acc=0.5686
Epoch 7: Train Loss=0.9439, Train Acc=0.5704 | Test Loss=0.9680, Test Acc=0.5686
Epoch 8: Train Loss=0.9403, Train Acc=0.5704 | Test Loss=0.9548, Test Acc=0.5686
Epoch 9: Train Loss=0.9292, Train Acc=0.5704 | Test Loss=0.9521, Test Acc=0.5686
Epoch 10: Train Loss=0.9248, Train Acc=0.5704 | Test Loss=0.9582, Test Acc=0.5686
Epoch 11: Train Loss=0.9173, Train Acc=0.5704 | Test Loss=0.9552, Test Acc=0.5686
Epoch 12: Train Loss=0.9177, Train Acc=0.5704 | Test Loss=0.9622, Test Acc=0.5686
Epoch 13: Train Loss=0.91

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1: Train Loss=1.0932, Train Acc=0.3877 | Test Loss=1.0623, Test Acc=0.5686
Epoch 2: Train Loss=0.9998, Train Acc=0.5788 | Test Loss=0.9638, Test Acc=0.5686
Epoch 3: Train Loss=0.9605, Train Acc=0.5808 | Test Loss=0.9644, Test Acc=0.5686
Epoch 4: Train Loss=0.9560, Train Acc=0.5808 | Test Loss=0.9591, Test Acc=0.5686
Epoch 5: Train Loss=0.9518, Train Acc=0.5803 | Test Loss=0.9585, Test Acc=0.5686
Epoch 6: Train Loss=0.9543, Train Acc=0.5808 | Test Loss=0.9595, Test Acc=0.5686
Epoch 7: Train Loss=0.9555, Train Acc=0.5808 | Test Loss=0.9549, Test Acc=0.5686
Epoch 8: Train Loss=0.9492, Train Acc=0.5808 | Test Loss=0.9536, Test Acc=0.5686
Epoch 9: Train Loss=0.9357, Train Acc=0.5803 | Test Loss=0.9527, Test Acc=0.5686
Epoch 10: Train Loss=0.9239, Train Acc=0.5808 | Test Loss=0.9589, Test Acc=0.5686
Epoch 11: Train Loss=0.9289, Train Acc=0.5803 | Test Loss=0.9593, Test Acc=0.5686
Epoch 12: Train Loss=0.9104, Train Acc=0.5803 | Test Loss=0.9593, Test Acc=0.5686
Epoch 13: Train Loss=0.90

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2: Train Loss=1.0792, Train Acc=0.4587 | Test Loss=1.0636, Test Acc=0.5686
Epoch 3: Train Loss=1.0497, Train Acc=0.5515 | Test Loss=1.0286, Test Acc=0.5686
Epoch 4: Train Loss=1.0030, Train Acc=0.5616 | Test Loss=0.9613, Test Acc=0.5686
Epoch 5: Train Loss=0.9782, Train Acc=0.5616 | Test Loss=0.9602, Test Acc=0.5686
Epoch 6: Train Loss=0.9778, Train Acc=0.5616 | Test Loss=0.9650, Test Acc=0.5686
Epoch 7: Train Loss=0.9772, Train Acc=0.5616 | Test Loss=0.9599, Test Acc=0.5686
Epoch 8: Train Loss=0.9707, Train Acc=0.5616 | Test Loss=0.9595, Test Acc=0.5686
Epoch 9: Train Loss=0.9767, Train Acc=0.5616 | Test Loss=0.9601, Test Acc=0.5686
Epoch 10: Train Loss=0.9711, Train Acc=0.5616 | Test Loss=0.9602, Test Acc=0.5686
Epoch 11: Train Loss=0.9768, Train Acc=0.5616 | Test Loss=0.9616, Test Acc=0.5686
Epoch 12: Train Loss=0.9716, Train Acc=0.5616 | Test Loss=0.9597, Test Acc=0.5686
Epoch 13: Train Loss=0.9689, Train Acc=0.5616 | Test Loss=0.9586, Test Acc=0.5686
Epoch 14: Train Loss=0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(df_train["entailment_label"].value_counts())
print(df_test["entailment_label"].value_counts())


entailment_label
NEUTRAL          4476
ENTAILMENT       2257
CONTRADICTION    1139
Name: count, dtype: int64
entailment_label
NEUTRAL          1119
ENTAILMENT        564
CONTRADICTION     285
Name: count, dtype: int64
