In [None]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 設定隨機種子與裝置
torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


df = pd.read_csv("SICK.txt", sep="\t")
# 只保留所需欄位
df = df[["sentence_A", "sentence_B", "entailment_label"]]
print("原始資料形狀:", df.shape)

# 若需要可先存檔整理後的資料
df.to_csv("SICK_filtered.tsv", sep="\t", index=False)

# 載入 BERT tokenizer 與模型（英文資料集，此處以 bert-base-uncased 為例）
pretrained_model = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_model)
bert_model = BertModel.from_pretrained(pretrained_model)
bert_model.to(device)
bert_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


原始資料形狀: (9840, 3)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [3]:
from tqdm import tqdm


# 將 sentence_A 與 sentence_B 轉成列表
sentences_A = df["sentence_A"].tolist()
sentences_B = df["sentence_B"].tolist()

batch_size_encode = 2048  # 可根據硬體資源調整
embeddings_A = []
embeddings_B = []

with torch.no_grad():
    # 逐批編碼 sentence_A
    for i in tqdm(range(0, len(sentences_A), batch_size_encode), desc="Encoding sentence_A"):
        batch_texts = sentences_A[i:i+batch_size_encode]
        encodings = tokenizer.batch_encode_plus(
            batch_texts,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].to(device)
        attention_mask = encodings["attention_mask"].to(device)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # 取 pooler_output (shape: [batch, 768])
        embeddings_A.append(outputs.pooler_output.cpu().numpy())
        
    # 逐批編碼 sentence_B
    for i in tqdm(range(0, len(sentences_B), batch_size_encode), desc="Encoding sentence_B"):
        batch_texts = sentences_B[i:i+batch_size_encode]
        encodings = tokenizer.batch_encode_plus(
            batch_texts,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].to(device)
        attention_mask = encodings["attention_mask"].to(device)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings_B.append(outputs.pooler_output.cpu().numpy())

# 合併所有批次的結果
embeddings_A = np.concatenate(embeddings_A, axis=0)
embeddings_B = np.concatenate(embeddings_B, axis=0)

# 將 sentence_A 與 sentence_B 的表示串接（concatenate，非相加）
X_features = np.concatenate([embeddings_A, embeddings_B], axis=1)  # shape: (n_samples, 1536)
print("Feature matrix shape:", X_features.shape)

# 標籤編碼（entailment_label）
le = LabelEncoder()
y = le.fit_transform(df["entailment_label"])
print("Label classes:", le.classes_)

# 存檔 embeddings 與標籤
np.save("SICK_embeddings.npy", X_features)
np.save("SICK_labels.npy", y)

Encoding sentence_A: 100%|██████████| 5/5 [00:24<00:00,  4.97s/it]
Encoding sentence_B: 100%|██████████| 5/5 [00:24<00:00,  4.89s/it]


Feature matrix shape: (9840, 1536)
Label classes: ['CONTRADICTION' 'ENTAILMENT' 'NEUTRAL']


In [None]:
# 載入先前存好的 embeddings 與標籤
X_features = np.load("SICK_embeddings.npy")
y = np.load("SICK_labels.npy")
print("Loaded embeddings shape:", X_features.shape)

# 以 80%/20% 切分訓練與測試集（stratify 依據標籤）
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)
print("Train shape:", X_train_full.shape, "Test shape:", X_test.shape)

# 可選：另存完整訓練與測試集
#np.save("SICK_X_train_full.npy", X_train_full)
#np.save("SICK_X_test.npy", X_test)
#np.save("SICK_y_train_full.npy", y_train_full)
#np.save("SICK_y_test.npy", y_test)



In [6]:

# 定義要使用的訓練資料比例（100%, 50%, 25%, 10%）
fractions = [1.0, 0.5, 0.25, 0.1]
final_n_estimators = 200  # 逐步增加樹數

for frac in fractions:
    num_train_samples = int(X_train_full.shape[0] * frac)
    # 取出該比例的子資料集
    X_train = X_train_full[:num_train_samples]
    y_train = y_train_full[:num_train_samples]
    
    print(f"\n[比例 {int(frac*100)}%] 訓練資料筆數: {num_train_samples}")
    
    # 可另存該子資料集
    np.save(f"SICK_X_train_{int(frac*100)}.npy", X_train)
    np.save(f"SICK_y_train_{int(frac*100)}.npy", y_train)
    
    # 使用 warm_start 模式建立 Random Forest
    model_rf = RandomForestClassifier(
        warm_start=True, 
        n_estimators=0,   
        min_samples_split=5, 
        min_samples_leaf=2, 
        max_features="sqrt",
        random_state=42
    )
    
    n_estimators_list = []
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []
    
    for i in range(1, final_n_estimators + 1):
        model_rf.n_estimators = i
        model_rf.fit(X_train, y_train)
        
        y_train_proba = model_rf.predict_proba(X_train)
        y_test_proba = model_rf.predict_proba(X_test)
        train_loss = log_loss(y_train, y_train_proba)
        test_loss = log_loss(y_test, y_test_proba)
        
        y_train_pred = model_rf.predict(X_train)
        y_test_pred = model_rf.predict(X_test)
        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        
        n_estimators_list.append(i)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
    
    # 繪製 Loss 與 Accuracy 變化圖
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    axs[0].plot(n_estimators_list, train_losses, label="Train Loss")
    axs[0].plot(n_estimators_list, test_losses, label="Test Loss")
    axs[0].set_xlabel("Number of Trees")
    axs[0].set_ylabel("Log Loss")
    axs[0].set_title(f"Loss vs Trees (Train Subset {int(frac*100)}%)")
    axs[0].legend()
    
    axs[1].plot(n_estimators_list, train_accuracies, label="Train Accuracy")
    axs[1].plot(n_estimators_list, test_accuracies, label="Test Accuracy")
    axs[1].set_xlabel("Number of Trees")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_title(f"Accuracy vs Trees (Train Subset {int(frac*100)}%)")
    axs[1].legend()
    
    plt.tight_layout()
    plt.savefig(f"RF_SICK_{int(frac*100)}.png")
    plt.close()
    
    final_pred = model_rf.predict(X_test)
    print(classification_report(y_test, final_pred, target_names=le.classes_))



[比例 100%] 訓練資料筆數: 7872
               precision    recall  f1-score   support

CONTRADICTION       0.57      0.32      0.41       285
   ENTAILMENT       0.55      0.20      0.29       564
      NEUTRAL       0.60      0.87      0.71      1119

     accuracy                           0.60      1968
    macro avg       0.58      0.46      0.47      1968
 weighted avg       0.58      0.60      0.55      1968


[比例 50%] 訓練資料筆數: 3936
               precision    recall  f1-score   support

CONTRADICTION       0.56      0.25      0.34       285
   ENTAILMENT       0.51      0.15      0.23       564
      NEUTRAL       0.59      0.89      0.71      1119

     accuracy                           0.58      1968
    macro avg       0.55      0.43      0.43      1968
 weighted avg       0.56      0.58      0.52      1968


[比例 25%] 訓練資料筆數: 1968
               precision    recall  f1-score   support

CONTRADICTION       0.51      0.19      0.28       285
   ENTAILMENT       0.52      0.11      0.1