In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed torch and NumPy's global RNGs up front so the notebook is reproducible.
torch.manual_seed(42)
np.random.seed(42)


# Column names for the header-less TSV files (14 tab-separated fields per row).
columns = ['id', 'label', 'statement', 'subjects', 'speaker', 'job_title', 'state', 'party', 
           'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

# Shared reader options for the three split files.
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
# With the default quoting, a stray double quote inside a free-text field opens
# a "quoted field" that swallows the following tabs and newlines, so several
# records are silently merged into one row.
# NOTE(review): the previously recorded merged shape was (12791, 14); confirm
# that the row count after this change matches the raw line count of the files.
tsv_read_options = dict(sep='\t', header=None, names=columns, quoting=csv.QUOTE_NONE)

# Read each TSV file (the file names must match the files on disk).
df_train = pd.read_csv('train.tsv', **tsv_read_options)
df_valid = pd.read_csv('valid.tsv', **tsv_read_options)
df_test = pd.read_csv('test.tsv', **tsv_read_options)

# Merge the three files into a single frame with a fresh 0..n-1 index.
df_total = pd.concat([df_train, df_valid, df_test], ignore_index=True)
print("合併後資料形狀:", df_total.shape)

# Persist the merged data as data.tsv.
df_total.to_csv('data.tsv', sep='\t', index=False)


# Re-split the merged data 80/20, stratified on the label so both parts keep
# the same class proportions. random_state pins the shuffle.
df_train_full, df_test_split = train_test_split(
    df_total, test_size=0.2, random_state=42, stratify=df_total['label'])
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)

# Persist the training and test splits separately.
df_train_full.to_csv('train_full.tsv', sep='\t', index=False)
df_test_split.to_csv('test_split.tsv', sep='\t', index=False)

In [None]:
def _plot_training_progress(n_trees, train_losses, test_losses,
                            train_accuracies, test_accuracies, subset_percentage):
    """Save a two-panel figure of log loss and accuracy versus forest size.

    Parameters
    ----------
    n_trees : list of int
        Number of trees in the forest at each recorded step (x axis).
    train_losses, test_losses : list of float
        Log loss on the training subset / test set at each step.
    train_accuracies, test_accuracies : list of float
        Accuracy on the training subset / test set at each step.
    subset_percentage : int
        Size of the training subset in percent; used in titles and file name.

    The figure is written to ``training_progress_<subset_percentage>.png``
    and then closed so figures do not accumulate across loop iterations.
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # (axis, train curve, test curve, metric name used in legend/title, y label)
    panels = [
        (axs[0], train_losses, test_losses, "Loss", "Log Loss"),
        (axs[1], train_accuracies, test_accuracies, "Accuracy", "Accuracy"),
    ]
    for ax, train_curve, test_curve, metric_name, y_label in panels:
        ax.plot(n_trees, train_curve, label=f"Train {metric_name}")
        ax.plot(n_trees, test_curve, label=f"Test {metric_name}")
        ax.set_xlabel("Number of Trees")
        ax.set_ylabel(y_label)
        ax.set_title(f"{metric_name} vs Trees (Train Subset {subset_percentage}%)")
        ax.legend()

    fig.tight_layout()
    fig.savefig(f'training_progress_{subset_percentage}.png')
    plt.close(fig)


vectorizer = TfidfVectorizer(max_features=5000)
# Fit the vocabulary/IDF on the full training split only, then transform the
# 'statement' column of both the training and the test split.
X_train_full = vectorizer.fit_transform(df_train_full['statement']).toarray()
X_test = vectorizer.transform(df_test_split['statement']).toarray()


# Encode the string labels as integers; the encoder is fit on training labels.
le = LabelEncoder()
y_train_full = le.fit_transform(df_train_full['label'])
y_test = le.transform(df_test_split['label'])

# Integer codes and readable names of every class, used for the final report.
report_labels = np.arange(len(le.classes_))
report_names = [str(name) for name in le.classes_]


# Fractions of the training split to experiment with.
fractions = [1.0, 0.5, 0.25, 0.1]

# Final forest size; the forest is grown one tree at a time up to this number
# so the metrics can be tracked after every added tree.
final_n_estimators = 200

for frac in fractions:
    num_train_samples = int(X_train_full.shape[0] * frac)
    
    # Take the leading rows as the subset (frame, features and labels stay aligned).
    df_train_subset = df_train_full.iloc[:num_train_samples]
    X_train = X_train_full[:num_train_samples]
    y_train = y_train_full[:num_train_samples]
    
    # round() before int() so float error (e.g. 0.29 * 100 == 28.999...) cannot
    # shift the percentage used in file names and titles.
    subset_percentage = int(round(frac * 100))
    df_train_subset.to_csv(f'train_subset_{subset_percentage}.tsv', sep='\t', index=False)
    
    print(f"\n[比例 {frac}] 訓練資料筆數: {num_train_samples}")
    
    # warm_start=True keeps the already-fitted trees between fit() calls, so
    # raising n_estimators and refitting only trains the newly added trees.
    model = RandomForestClassifier(
        warm_start=True,
        n_estimators=1,     # placeholder; overwritten before every fit below
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42
    )
    
    # Per-step metric history for this training fraction.
    n_estimators_list = []
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []
    
    # Grow the forest tree by tree to emulate a training curve.
    for i in range(1, final_n_estimators + 1):
        model.n_estimators = i
        model.fit(X_train, y_train)
        
        # Class probabilities are needed for log loss ...
        y_train_proba = model.predict_proba(X_train)
        y_test_proba = model.predict_proba(X_test)
        
        train_loss = log_loss(y_train, y_train_proba)
        test_loss = log_loss(y_test, y_test_proba)
        
        # ... and the hard predictions are the arg-max class of those same
        # probabilities, which is what predict() computes internally. Deriving
        # them here avoids running the whole forest a second time per step.
        y_train_pred = model.classes_[np.argmax(y_train_proba, axis=1)]
        y_test_pred = model.classes_[np.argmax(y_test_proba, axis=1)]
        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        
        n_estimators_list.append(i)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
    
    # Left panel: loss curves, right panel: accuracy curves; saved to PNG.
    _plot_training_progress(
        n_estimators_list, train_losses, test_losses,
        train_accuracies, test_accuracies, subset_percentage)
    
    # Classification report of the complete forest. The predictions from the
    # last loop step already belong to the final model, so they are reused.
    # zero_division=0 reports 0.0 for classes that were never predicted
    # (the value the default mode also reports) without the warning.
    final_pred = y_test_pred
    print(classification_report(
        y_test, final_pred,
        labels=report_labels, target_names=report_names, zero_division=0))



合併後資料形狀: (12791, 14)
完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)

[比例 1.0] 訓練資料筆數: 10232


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      0.00      0.01       421
           1       0.28      0.31      0.29       501
           2       0.22      0.74      0.33       526
           3       0.29      0.11      0.16       491
           4       1.00      0.00      0.01       209
           5       0.00      0.00      0.00       411

    accuracy                           0.24      2559
   macro avg       0.46      0.20      0.14      2559
weighted avg       0.40      0.24      0.16      2559


[比例 0.5] 訓練資料筆數: 5116


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      0.00      0.00       421
           1       0.24      0.51      0.33       501
           2       0.23      0.51      0.32       526
           3       0.31      0.19      0.23       491
           4       0.00      0.00      0.00       209
           5       0.00      0.00      0.00       411

    accuracy                           0.24      2559
   macro avg       0.30      0.20      0.15      2559
weighted avg       0.32      0.24      0.17      2559


[比例 0.25] 訓練資料筆數: 2558


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.67      0.00      0.01       421
           1       0.24      0.45      0.32       501
           2       0.23      0.49      0.31       526
           3       0.27      0.29      0.28       491
           4       0.00      0.00      0.00       209
           5       0.40      0.00      0.01       411

    accuracy                           0.24      2559
   macro avg       0.30      0.20      0.15      2559
weighted avg       0.32      0.24      0.18      2559


[比例 0.1] 訓練資料筆數: 1023
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       421
           1       0.22      0.80      0.34       501
           2       0.27      0.08      0.12       526
           3       0.25      0.26      0.26       491
           4       0.00      0.00      0.00       209
           5       0.24      0.03      0.05       411

    accuracy                           0.23      2559


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
