In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Seed both RNG sources (NumPy and PyTorch) and pick the compute device.
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Load the tab-separated sentence-pair table.
df = pd.read_csv("SICK_filtered.tsv", sep="\t")
print("原始資料形狀:", df.shape)

# Hold out 20% of the rows as the test split. Stratifying on the label keeps the
# class proportions equal in both splits; the fixed random_state makes it repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["entailment_label"],
    random_state=42,
)
print("訓練集形狀:", df_train.shape, "測試集形狀:", df_test.shape)


def tokenize(text):
    """Lower-case `text` and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Build the vocabulary from word frequencies over the training split only
# (sentence_A and sentence_B pooled together).
all_texts = df_train["sentence_A"].tolist() + df_train["sentence_B"].tolist()
counter = Counter()
for t in all_texts:
    counter.update(tokenize(t))

max_vocab_size = 10000
# Ids 0 and 1 are reserved for the <PAD> and <UNK> tokens, so at most
# max_vocab_size - 2 real words are kept (most frequent first).
vocab = {"<PAD>": 0, "<UNK>": 1}
for word, _freq in counter.most_common(max_vocab_size - 2):
    vocab[word] = len(vocab)
vocab_size = len(vocab)
print("詞彙大小:", vocab_size)

def text_to_ids(text, vocab, max_length=50):
    """Encode `text` as a list of exactly `max_length` vocabulary ids.

    Tokens missing from `vocab` map to the "<UNK>" id. Short sequences are
    right-padded with the "<PAD>" id; long sequences are truncated.
    """
    encoded = []
    for tok in tokenize(text):
        encoded.append(vocab.get(tok, vocab["<UNK>"]))

    shortfall = max_length - len(encoded)
    if shortfall > 0:
        return encoded + [vocab["<PAD>"]] * shortfall
    return encoded[:max_length]

# Fixed number of token ids every sentence is padded or truncated to.
max_length = 50

In [None]:
class SICKDataset(Dataset):
    """Map-style dataset over sentence pairs stored in a DataFrame.

    Each item is ``(ids_A, ids_B, label)``: two long tensors of `max_length`
    token ids and the raw value of the ``entailment_label`` column.
    """

    def __init__(self, df, vocab, max_length):
        self.vocab = vocab
        self.max_length = max_length
        self.df = df.reset_index(drop=True)
        # Plain lists are accessed by position, so the original index of `df` does not matter.
        self.sentences_A = df["sentence_A"].tolist()
        self.sentences_B = df["sentence_B"].tolist()
        self.labels = df["entailment_label"].tolist()

    def __len__(self):
        return self.df.shape[0]

    def _encode(self, sentence):
        """Turn one sentence into a long tensor of token ids."""
        token_ids = text_to_ids(sentence, self.vocab, self.max_length)
        return torch.tensor(token_ids, dtype=torch.long)

    def __getitem__(self, idx):
        first = self._encode(self.sentences_A[idx])
        second = self._encode(self.sentences_B[idx])
        return first, second, self.labels[idx]


# Fit the encoder on the training labels only and reuse that mapping for the test split.
le = LabelEncoder()
# `.assign` builds new frames rather than writing a column into the objects
# returned by train_test_split; in-place column assignment on those can raise
# pandas' SettingWithCopyWarning on some versions. Rebinding also keeps this
# cell safe to re-run.
df_train = df_train.assign(label_enc=le.fit_transform(df_train["entailment_label"]))
df_test = df_test.assign(label_enc=le.transform(df_test["entailment_label"]))
num_classes = len(le.classes_)
print("標籤類別:", le.classes_)

def create_dataloader(df, vocab, max_length, batch_size, shuffle=True):
    """Wrap `df` in a SICKDataset and return a DataLoader that batches it.

    `shuffle` is forwarded to the DataLoader unchanged.
    """
    return DataLoader(
        SICKDataset(df, vocab, max_length),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [None]:
class EmbeddingDNNClassifier(nn.Module):
    """Bag-of-embeddings classifier for sentence pairs.

    Each sentence is embedded with its own embedding table and averaged over
    its non-padding tokens; the two pooled vectors are concatenated and passed
    through a small feed-forward head that outputs class logits.

    Args:
        vocab_size: number of rows in each embedding table.
        embed_dim: embedding width, also used as the hidden width of the head.
        num_classes: number of output logits.
        dropout: dropout probability applied after the hidden ReLU.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, dropout=0.5):
        super(EmbeddingDNNClassifier, self).__init__()
        # Independent embedding tables for sentence_A and sentence_B; id 0 is the padding id.
        self.embedding_A = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_B = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Feed-forward head; its input is embed_dim * 2 because the two pooled
        # sentence vectors (embed_dim each) are concatenated.
        self.fc = nn.Sequential(
            nn.Linear(embed_dim * 2, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, num_classes)
        )

    @staticmethod
    def _masked_mean(emb, ids, padding_idx):
        """Average `emb` (batch, seq, dim) over the positions where `ids` is not padding."""
        mask = (ids != padding_idx).unsqueeze(-1).to(emb.dtype)
        summed = (emb * mask).sum(dim=1)
        # clamp avoids 0/0 for a row that consists only of padding.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, ids_A, ids_B):
        """Return logits of shape (batch, num_classes) for two (batch, seq) id tensors."""
        # Embeddings have shape (batch, seq_length, embed_dim).
        emb_A = self.embedding_A(ids_A)
        emb_B = self.embedding_B(ids_B)
        # Pool over real tokens only. A plain mean over the sequence axis would
        # also count the padding positions, shrinking every sentence vector by
        # real_length / seq_length and making the output depend on the pad length.
        pooled_A = self._masked_mean(emb_A, ids_A, self.embedding_A.padding_idx)
        pooled_B = self._masked_mean(emb_B, ids_B, self.embedding_B.padding_idx)
        # Concatenate (not add) the two sentence vectors: (batch, embed_dim * 2).
        features = torch.cat([pooled_A, pooled_B], dim=1)
        logits = self.fc(features)
        return logits


def _run_epoch(model, loader, criterion, optimizer=None):
    """Make one pass over `loader`, training if `optimizer` is given, else evaluating.

    Returns:
        (mean_loss, accuracy), both averaged per sample over the whole pass.
    """
    training = optimizer is not None
    model.train(training)
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for ids_A, ids_B, labels in loader:
            # Labels arrive un-encoded; the module-level LabelEncoder `le` maps them to class ids.
            labels = torch.tensor(le.transform(labels), dtype=torch.long).to(device)
            ids_A, ids_B = ids_A.to(device), ids_B.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(ids_A, ids_B)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # The criterion yields a batch mean, so weight it by the batch size
            # to obtain a correct per-sample average when batches differ in size.
            running_loss += loss.item() * ids_A.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return running_loss / total, correct / total


def train_and_evaluate(model, train_loader, test_loader, num_epochs, lr):
    """Train `model` with Adam and cross-entropy, evaluating on `test_loader` after each epoch.

    Args:
        model: module called as ``model(ids_A, ids_B)`` that returns class logits.
        train_loader: loader yielding ``(ids_A, ids_B, labels)`` batches for training.
        test_loader: loader yielding the same batch structure for evaluation.
        num_epochs: number of passes over `train_loader`.
        lr: Adam learning rate.

    Returns:
        (epoch_list, train_losses, test_losses, train_accuracies, test_accuracies),
        five lists with one entry per epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    epoch_list = []

    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, optimizer)
        # Evaluation pass: no optimizer, so the model is in eval mode and no gradients are kept.
        test_loss, test_acc = _run_epoch(model, test_loader, criterion)

        epoch_list.append(epoch)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")

    return epoch_list, train_losses, test_losses, train_accuracies, test_accuracies


# Hyper-parameters shared by every training run.
batch_size, num_epochs, learning_rate = 512, 10, 1e-3

# Independent copy of the training frame; the subset experiments slice from it.
X = df_train.copy()

In [None]:
# Train one fresh model per training-set fraction, then plot its curves and
# print a per-class report on the full test split.
fractions = [1.0, 0.5, 0.25, 0.1]

# The test split is the same for every run and is not shuffled, so build its loader once.
test_loader = create_dataloader(df_test, vocab, max_length, batch_size, shuffle=False)
# Fixed list of class ids so the report always has one row per known class.
class_ids = list(range(len(le.classes_)))

for frac in fractions:
    num_samples = int(len(X) * frac)
    df_train_subset = X.iloc[:num_samples].reset_index(drop=True)
    # round() rather than int(): frac * 100 can land just below a whole number
    # (e.g. 0.29 * 100 == 28.999...), which int() would truncate.
    subset_percentage = round(frac * 100)
    #df_train_subset.to_csv(f"SICK_train_subset_{subset_percentage}.tsv", sep="\t", index=False)

    train_loader = create_dataloader(df_train_subset, vocab, max_length, batch_size, shuffle=True)

    print(f"\n[比例 {subset_percentage}%] 訓練資料筆數: {len(df_train_subset)}")

    model = EmbeddingDNNClassifier(vocab_size, embed_dim=128, num_classes=num_classes, dropout=0.5)
    model.to(device)

    epoch_list, train_losses, test_losses, train_accs, test_accs = train_and_evaluate(model, train_loader, test_loader, num_epochs, learning_rate)

    # Loss (left) and accuracy (right) curves for this subset size.
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    axs[0].plot(epoch_list, train_losses, label="Train Loss")
    axs[0].plot(epoch_list, test_losses, label="Test Loss")
    axs[0].set_xlabel("Epoch")
    axs[0].set_ylabel("Loss")
    axs[0].set_title(f"Loss vs Epoch (Train Subset {subset_percentage}%)")
    axs[0].legend()

    axs[1].plot(epoch_list, train_accs, label="Train Acc")
    axs[1].plot(epoch_list, test_accs, label="Test Acc")
    axs[1].set_xlabel("Epoch")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_title(f"Accuracy vs Epoch (Train Subset {subset_percentage}%)")
    axs[1].legend()

    fig.tight_layout()
    fig.savefig(f"EmbeddingDNN_SICK_{subset_percentage}.png")
    # Close this specific figure so figures do not pile up across iterations.
    plt.close(fig)

    # Collect test-set predictions from the trained model for the per-class report.
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for ids_A, ids_B, labels in test_loader:
            ids_A, ids_B = ids_A.to(device), ids_B.to(device)
            preds = model(ids_A, ids_B).argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            # The encoded labels are only compared on the host, so they never need to visit the device.
            all_labels.extend(le.transform(labels))

    print(f"分類報告 (Train Subset {subset_percentage}%):")
    # zero_division=0 reports 0.0 for classes the model never predicts, without
    # emitting an undefined-metric warning for each one.
    print(classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=le.classes_,
        zero_division=0,
    ))


原始資料形狀: (9840, 3)
訓練集形狀: (7872, 3) 測試集形狀: (1968, 3)
詞彙大小: 2439
標籤類別: ['CONTRADICTION' 'ENTAILMENT' 'NEUTRAL']

[比例 100%] 訓練資料筆數: 7872
Epoch 1: Train Loss=1.0268, Train Acc=0.5487 | Test Loss=0.9674, Test Acc=0.5686
Epoch 2: Train Loss=0.9552, Train Acc=0.5686 | Test Loss=0.9403, Test Acc=0.5686
Epoch 3: Train Loss=0.9387, Train Acc=0.5686 | Test Loss=0.9283, Test Acc=0.5686
Epoch 4: Train Loss=0.9207, Train Acc=0.5686 | Test Loss=0.9122, Test Acc=0.5686
Epoch 5: Train Loss=0.9021, Train Acc=0.5696 | Test Loss=0.8944, Test Acc=0.5696
Epoch 6: Train Loss=0.8786, Train Acc=0.5742 | Test Loss=0.8742, Test Acc=0.5732
Epoch 7: Train Loss=0.8563, Train Acc=0.5833 | Test Loss=0.8535, Test Acc=0.5793
Epoch 8: Train Loss=0.8298, Train Acc=0.5910 | Test Loss=0.8342, Test Acc=0.5930
Epoch 9: Train Loss=0.8054, Train Acc=0.6004 | Test Loss=0.8177, Test Acc=0.5955
Epoch 10: Train Loss=0.7838, Train Acc=0.6133 | Test Loss=0.8050, Test Acc=0.5981
分類報告 (Train Subset 100%):
               precision    r

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss=1.0929, Train Acc=0.3863 | Test Loss=1.0778, Test Acc=0.5417
Epoch 4: Train Loss=1.0768, Train Acc=0.5235 | Test Loss=1.0617, Test Acc=0.5655
Epoch 5: Train Loss=1.0589, Train Acc=0.5464 | Test Loss=1.0456, Test Acc=0.5681
Epoch 6: Train Loss=1.0456, Train Acc=0.5578 | Test Loss=1.0293, Test Acc=0.5691
Epoch 7: Train Loss=1.0265, Train Acc=0.5591 | Test Loss=1.0133, Test Acc=0.5691
Epoch 8: Train Loss=1.0135, Train Acc=0.5616 | Test Loss=0.9979, Test Acc=0.5686
Epoch 9: Train Loss=1.0007, Train Acc=0.5616 | Test Loss=0.9839, Test Acc=0.5686
Epoch 10: Train Loss=0.9894, Train Acc=0.5616 | Test Loss=0.9718, Test Acc=0.5686
分類報告 (Train Subset 10%):
               precision    recall  f1-score   support

CONTRADICTION       0.00      0.00      0.00       285
   ENTAILMENT       0.00      0.00      0.00       564
      NEUTRAL       0.57      1.00      0.72      1119

     accuracy                           0.57      1968
    macro avg       0.19      0.33      0.24     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
