In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder

torch.manual_seed(42)
np.random.seed(42)

columns = ['id', 'label', 'statement', 'subjects', 'speaker', 'job_title', 'state', 'party', 
           'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

# 讀取各個 TSV 檔案（注意檔案名稱需與實際檔名相符）
df_train = pd.read_csv('train.tsv', sep='\t', header=None, names=columns)
df_valid = pd.read_csv('valid.tsv', sep='\t', header=None, names=columns)
df_test = pd.read_csv('test.tsv', sep='\t', header=None, names=columns)

# 合併資料
df_total = pd.concat([df_train, df_valid, df_test], ignore_index=True)
print("合併後資料形狀:", df_total.shape)

# 篩選出 label 為 'true', 'half-true', 'false' 的資料
df_total = df_total[df_total['label'].isin(['true', 'half-true', 'false'])]
print("篩選後資料形狀:", df_total.shape)

# 存檔：合併後的資料存成 data.tsv
df_total.to_csv('data.tsv', sep='\t', index=False)

df_train_full, df_test_split = train_test_split(
    df_total, test_size=0.2, random_state=42, stratify=df_total['label'])
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)

# 存檔：訓練與測試集分別存檔
df_train_full.to_csv('train_full.tsv', sep='\t', index=False)
df_test_split.to_csv('test_split.tsv', sep='\t', index=False)

vectorizer = TfidfVectorizer(max_features=5000)
# 先在完整訓練集上 fit，再轉換訓練與測試集的「statement」欄位
X_train_full = vectorizer.fit_transform(df_train_full['statement']).toarray()
X_test = vectorizer.transform(df_test_split['statement']).toarray()

le = LabelEncoder()
y_train_full = le.fit_transform(df_train_full['label'])
y_test = le.transform(df_test_split['label'])


# 定義要使用的訓練集比例
fractions = [1.0, 0.5, 0.25, 0.1]

# 設定最終樹的數量與每次新增的樹數（這裡以 1 為單位追蹤變化）
final_n_estimators = 200

for frac in fractions:
    num_train_samples = int(X_train_full.shape[0] * frac)
    
    # 取出對應比例的子資料集（DataFrame 與特徵、標籤）
    df_train_subset = df_train_full.iloc[:num_train_samples]
    X_train = X_train_full[:num_train_samples]
    y_train = y_train_full[:num_train_samples]
    
    subset_percentage = int(frac * 100)
    df_train_subset.to_csv(f'train_subset_{subset_percentage}.tsv', sep='\t', index=False)
    
    print(f"\n[比例 {frac}] 訓練資料筆數: {num_train_samples}")
    
    # 使用 warm_start 模式建立 RandomForestClassifier
    model = RandomForestClassifier(
        warm_start=True,    # 啟用 warm_start 以便逐步增加樹的數量
        n_estimators=0,     # 從 0 棵樹開始，後續依次累加
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42
    )
    
    # 用來紀錄每個迭代步驟的指標
    n_estimators_list = []
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []
    
    # 逐步增加樹的數量，模擬訓練過程
    for i in range(1, final_n_estimators + 1):
        model.n_estimators = i
        model.fit(X_train, y_train)
        
        # 計算預測機率，用以計算 log loss
        y_train_proba = model.predict_proba(X_train)
        y_test_proba = model.predict_proba(X_test)
        
        # 計算 log loss (注意：若某些類別預測機率過低，log_loss 可能會警告)
        train_loss = log_loss(y_train, y_train_proba)
        test_loss = log_loss(y_test, y_test_proba)
        
        # 計算 accuracy
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        
        # 紀錄指標
        n_estimators_list.append(i)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
    
    # 畫圖：建立一個圖表，左圖為 Loss 變化，右圖為 Accuracy 變化
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    
    axs[0].plot(n_estimators_list, train_losses, label="Train Loss")
    axs[0].plot(n_estimators_list, test_losses, label="Test Loss")
    axs[0].set_xlabel("Number of Trees")
    axs[0].set_ylabel("Log Loss")
    axs[0].set_title(f"Loss vs Trees (Train Subset {subset_percentage}%)")
    axs[0].legend()
    
    axs[1].plot(n_estimators_list, train_accuracies, label="Train Accuracy")
    axs[1].plot(n_estimators_list, test_accuracies, label="Test Accuracy")
    axs[1].set_xlabel("Number of Trees")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_title(f"Accuracy vs Trees (Train Subset {subset_percentage}%)")
    axs[1].legend()
    
    plt.tight_layout()
    # 存檔圖表
    plt.savefig(f'RF_{subset_percentage}.png')
    plt.close()
    
    # 另外也印出最終模型的分類報告
    final_pred = model.predict(X_test)
    print(classification_report(y_test, final_pred))



合併後資料形狀: (12791, 14)
篩選後資料形狀: (7187, 14)
完整訓練集形狀: (5749, 14)
測試集形狀: (1438, 14)

[比例 1.0] 訓練資料筆數: 5749
              precision    recall  f1-score   support

           0       0.47      0.38      0.42       501
           1       0.39      0.77      0.52       526
           2       1.00      0.00      0.00       411

    accuracy                           0.41      1438
   macro avg       0.62      0.38      0.31      1438
weighted avg       0.59      0.41      0.34      1438


[比例 0.5] 訓練資料筆數: 2874
              precision    recall  f1-score   support

           0       0.49      0.22      0.30       501
           1       0.39      0.89      0.54       526
           2       0.00      0.00      0.00       411

    accuracy                           0.40      1438
   macro avg       0.29      0.37      0.28      1438
weighted avg       0.31      0.40      0.30      1438


[比例 0.25] 訓練資料筆數: 1437
              precision    recall  f1-score   support

           0       0.42      0.27 

In [3]:
import torch
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# 設定隨機種子
torch.manual_seed(42)
np.random.seed(42)

# 定義欄位名稱（依 LIAR 資料集格式）
columns = ['id', 'label', 'statement', 'subjects', 'speaker', 'job_title', 'state', 'party', 
           'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

# 讀取三個 TSV 資料集
df_train = pd.read_csv('train.tsv', sep='\t', header=None, names=columns)
df_valid = pd.read_csv('valid.tsv', sep='\t', header=None, names=columns)
df_test = pd.read_csv('test.tsv', sep='\t', header=None, names=columns)

# 合併資料
df_total = pd.concat([df_train, df_valid, df_test], ignore_index=True)
print("合併後資料形狀:", df_total.shape)

# 只保留標籤為 "true", "half-true", "false" 的資料
df_total = df_total[df_total['label'].isin(['true', 'half-true', 'false'])]
print("篩選後資料形狀:", df_total.shape)

# 存檔合併後的資料（data.tsv）
#df_total.to_csv('data.tsv', sep='\t', index=False)

# 切分成訓練與測試集（80% / 20%）
df_train_full, df_test_split = train_test_split(
    df_total, test_size=0.2, random_state=42, stratify=df_total['label'])
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)

# 存檔訓練與測試集
#df_train_full.to_csv('train_full.tsv', sep='\t', index=False)
#df_test_split.to_csv('test_split.tsv', sep='\t', index=False)

# 建立 TF-IDF 向量化器，並轉換文本（以 "statement" 欄位為例）
vectorizer = TfidfVectorizer(max_features=5000)
X_train_full = vectorizer.fit_transform(df_train_full['statement']).toarray()
X_test = vectorizer.transform(df_test_split['statement']).toarray()

# ----------------------------
# 建立 Classifier A: "true" vs "not true"
# ----------------------------
# 定義轉換函數：如果原始標籤為 "true"，保留；否則轉為 "not true"
def map_label_A(label):
    return 'true' if label == 'true' else 'not true'

df_train_full['label_A'] = df_train_full['label'].apply(map_label_A)
df_test_split['label_A'] = df_test_split['label'].apply(map_label_A)

le_A = LabelEncoder()
y_train_A = le_A.fit_transform(df_train_full['label_A'])
y_test_A = le_A.transform(df_test_split['label_A'])

# ----------------------------
# 建立 Classifier B: 區分 "half-true" 與 "false"
# ----------------------------
# 只挑出原始標籤不為 "true" 的資料來訓練 classifier B
df_train_B = df_train_full[df_train_full['label'] != 'true'].reset_index(drop=True)
df_test_B = df_test_split[df_test_split['label'] != 'true'].reset_index(drop=True)

le_B = LabelEncoder()
y_train_B = le_B.fit_transform(df_train_B['label'])
y_test_B = le_B.transform(df_test_B['label'])

# 分別取得對應的 TF-IDF 特徵 (利用相同 vectorizer)
X_train_B = vectorizer.transform(df_train_B['statement']).toarray()
X_test_B = vectorizer.transform(df_test_B['statement']).toarray()

# ----------------------------
# 訓練 Classifier A (區分 "true" vs "not true")
# ----------------------------
model_A = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)
model_A.fit(X_train_full, y_train_A)
pred_A = model_A.predict(X_test)

print("Classifier A (true vs not true):")
print(classification_report(y_test_A, pred_A, target_names=le_A.classes_))

# ----------------------------
# 訓練 Classifier B (區分 "half-true" vs "false")
# ----------------------------
model_B = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)
model_B.fit(X_train_B, y_train_B)
pred_B = model_B.predict(X_test_B)

print("Classifier B (half-true vs false):")
print(classification_report(y_test_B, pred_B, target_names=le_B.classes_))

# ----------------------------
# 結合兩階段分類器進行最終預測
# ----------------------------
final_predictions = []
for i in range(len(X_test)):
    # 先用 classifier A 預測
    pred_A_label = le_A.inverse_transform([model_A.predict(X_test[i].reshape(1, -1))[0]])[0]
    if pred_A_label == 'true':
        final_predictions.append('true')
    else:
        # 若預測為 "not true"，則用 classifier B 進行進一步細分
        pred_B_label = le_B.inverse_transform([model_B.predict(X_test[i].reshape(1, -1))[0]])[0]
        final_predictions.append(pred_B_label)

# 評估最終組合後的預測結果（使用原始三分類標籤）
print("最終組合分類器效能:")
print(classification_report(df_test_split['label'], final_predictions))


合併後資料形狀: (12791, 14)
篩選後資料形狀: (7187, 14)
完整訓練集形狀: (5749, 14)
測試集形狀: (1438, 14)
Classifier A (true vs not true):
              precision    recall  f1-score   support

    not true       0.71      1.00      0.83      1027
        true       0.00      0.00      0.00       411

    accuracy                           0.71      1438
   macro avg       0.36      0.50      0.42      1438
weighted avg       0.51      0.71      0.60      1438



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classifier B (half-true vs false):
              precision    recall  f1-score   support

       false       0.62      0.45      0.52       501
   half-true       0.58      0.73      0.65       526

    accuracy                           0.59      1027
   macro avg       0.60      0.59      0.59      1027
weighted avg       0.60      0.59      0.59      1027

最終組合分類器效能:
              precision    recall  f1-score   support

       false       0.45      0.45      0.45       501
   half-true       0.41      0.73      0.53       526
        true       0.00      0.00      0.00       411

    accuracy                           0.42      1438
   macro avg       0.29      0.39      0.33      1438
weighted avg       0.31      0.42      0.35      1438



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
