In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df_train_full = pd.read_csv('train_full.tsv', sep='\t')
df_test_split = pd.read_csv('test_split.tsv', sep='\t')

In [2]:
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)
print("訓練集標籤分布:\n", df_train_full['label'].value_counts())
print("測試集標籤分布:\n", df_test_split['label'].value_counts())
print("訓練集前5筆資料:\n", df_train_full.head())
print("測試集前5筆資料:\n", df_test_split.head())


完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)
訓練集標籤分布:
 label
half-true      2101
false          2006
mostly-true    1963
barely-true    1682
true           1642
pants-fire      838
Name: count, dtype: int64
測試集標籤分布:
 label
half-true      526
false          501
mostly-true    491
barely-true    421
true           411
pants-fire     209
Name: count, dtype: int64
訓練集前5筆資料:
           id        label                                          statement  \
0  7193.json         true  Northern Virginia is the most heavily traffick...   
1  1608.json    half-true  The Democratic health care bill will "collect ...   
2  3844.json  mostly-true  Says for the first time ever, Texas lawmakers ...   
3  1104.json    half-true  Only 15 percent of drug users are African-Amer...   
4  3875.json  barely-true  Part of his ride was to warn the British that ...   

                                subjects             speaker  \
0                         transportation       bob-mcdonnell   
1                      

In [3]:
le = LabelEncoder()

y_train_full = le.fit_transform(df_train_full['label'])
y_test = le.transform(df_test_split['label'])
num_classes = len(le.classes_)

def tokenize(text):
    return text.lower().split()

from collections import Counter
max_vocab_size = 10000
all_tokens = []
for text in df_train_full['statement']:
    all_tokens.extend(tokenize(text))
counter = Counter(all_tokens)
most_common = counter.most_common(max_vocab_size - 2)
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, freq in most_common:
    vocab[word] = len(vocab)
vocab_size = len(vocab)

max_length = 50
def text_to_ids(text, vocab, max_length=50):
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(ids) < max_length:
        ids = ids + [vocab['<PAD>']] * (max_length - len(ids))
    else:
        ids = ids[:max_length]
    return ids

class LiarDataset(Dataset):
    def __init__(self, df, vocab, max_length=50, label_encoder=None):
        self.df = df
        self.vocab = vocab
        self.max_length = max_length
        self.label_encoder = label_encoder
        if label_encoder is not None:
            self.labels = label_encoder.transform(df['label'])
        else:
            self.labels = df['label'].values
        self.texts = df['statement'].values

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        input_ids = torch.tensor(text_to_ids(text, self.vocab, self.max_length), dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, label

def create_dataloader(df, vocab, max_length, label_encoder, batch_size, shuffle=True):
    dataset = LiarDataset(df, vocab, max_length, label_encoder)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            bidirectional=bidirectional, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        
    def forward(self, x):
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]
        out = self.dropout(hidden)
        out = self.fc(out)
        return out


embed_dim = 128
hidden_dim = 128
num_layers = 1
bidirectional = False
dropout = 0.5
num_epochs = 30
batch_size = 4096
output_dim = num_classes

In [None]:
fractions = [1.0, 0.5, 0.25, 0.1]

for frac in fractions:
    num_train_samples = int(df_train_full.shape[0] * frac) 
    df_train_subset = df_train_full.iloc[:num_train_samples].reset_index(drop=True)
    subset_percentage = int(frac * 100)
    # 存檔子資料集（若需要另存）
    df_train_subset.to_csv(f'train_subset_{subset_percentage}.tsv', sep='\t', index=False)
    
    print(f"\n[比例 {frac}] 訓練資料筆數: {num_train_samples}")
    
    # DataLoader
    train_loader = create_dataloader(df_train_subset, vocab, max_length, le, batch_size, shuffle=True)
    test_loader = create_dataloader(df_test_split, vocab, max_length, le, batch_size, shuffle=False)
    
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    epoch_list = []
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []
    
    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_loss = running_loss / total
        train_acc = correct / total
        
        # 驗證階段
        model.eval()
        test_loss_total = 0.0
        test_correct = 0
        test_total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss_total += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
        test_loss = test_loss_total / test_total
        test_acc = test_correct / test_total
        
        epoch_list.append(epoch)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        
        print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")
    
    # 畫出訓練過程圖表：左側 Loss 變化，右側 Accuracy 變化
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    
    axs[0].plot(epoch_list, train_losses, label="Train Loss")
    axs[0].plot(epoch_list, test_losses, label="Test Loss")
    axs[0].set_xlabel("Epoch")
    axs[0].set_ylabel("Loss")
    axs[0].set_title(f"Loss vs Epoch (Train Subset {subset_percentage}%)")
    axs[0].legend()
    
    axs[1].plot(epoch_list, train_accuracies, label="Train Accuracy")
    axs[1].plot(epoch_list, test_accuracies, label="Test Accuracy")
    axs[1].set_xlabel("Epoch")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_title(f"Accuracy vs Epoch (Train Subset {subset_percentage}%)")
    axs[1].legend()
    
    plt.tight_layout()
    plt.savefig(f'training_progress_{subset_percentage}_lstm.png')
    plt.close()
    
    # 使用最終模型對測試集做預測，印出分類報告
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
    print("分類報告 (Train Subset {}%):".format(subset_percentage))
    print(classification_report(all_labels, all_preds, target_names=le.classes_))


[比例 1.0] 訓練資料筆數: 10232
Epoch 1: Train Loss=1.7890, Train Acc=0.1944 | Test Loss=1.7831, Test Acc=0.1962
Epoch 2: Train Loss=1.7814, Train Acc=0.1936 | Test Loss=1.7762, Test Acc=0.1966
Epoch 3: Train Loss=1.7743, Train Acc=0.2001 | Test Loss=1.7689, Test Acc=0.2055
Epoch 4: Train Loss=1.7668, Train Acc=0.2011 | Test Loss=1.7612, Test Acc=0.2055
Epoch 5: Train Loss=1.7616, Train Acc=0.2044 | Test Loss=1.7597, Test Acc=0.2055
Epoch 6: Train Loss=1.7605, Train Acc=0.2038 | Test Loss=1.7567, Test Acc=0.2055
Epoch 7: Train Loss=1.7570, Train Acc=0.1993 | Test Loss=1.7576, Test Acc=0.1954
Epoch 8: Train Loss=1.7576, Train Acc=0.2091 | Test Loss=1.7578, Test Acc=0.1958
Epoch 9: Train Loss=1.7577, Train Acc=0.1990 | Test Loss=1.7572, Test Acc=0.1954
Epoch 10: Train Loss=1.7577, Train Acc=0.2014 | Test Loss=1.7566, Test Acc=0.2055
Epoch 11: Train Loss=1.7570, Train Acc=0.1998 | Test Loss=1.7564, Test Acc=0.2055
Epoch 12: Train Loss=1.7571, Train Acc=0.1998 | Test Loss=1.7563, Test Acc=0.2059
E

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2: Train Loss=1.7863, Train Acc=0.1865 | Test Loss=1.7811, Test Acc=0.1923
Epoch 3: Train Loss=1.7806, Train Acc=0.1880 | Test Loss=1.7758, Test Acc=0.1923
Epoch 4: Train Loss=1.7759, Train Acc=0.1972 | Test Loss=1.7704, Test Acc=0.1958
Epoch 5: Train Loss=1.7695, Train Acc=0.1980 | Test Loss=1.7647, Test Acc=0.1966
Epoch 6: Train Loss=1.7650, Train Acc=0.1988 | Test Loss=1.7597, Test Acc=0.1962
Epoch 7: Train Loss=1.7601, Train Acc=0.2050 | Test Loss=1.7602, Test Acc=0.1966
Epoch 8: Train Loss=1.7628, Train Acc=0.1931 | Test Loss=1.7584, Test Acc=0.2059
Epoch 9: Train Loss=1.7602, Train Acc=0.1935 | Test Loss=1.7572, Test Acc=0.2063
Epoch 10: Train Loss=1.7574, Train Acc=0.2056 | Test Loss=1.7575, Test Acc=0.2063
Epoch 11: Train Loss=1.7579, Train Acc=0.2052 | Test Loss=1.7578, Test Acc=0.2063
Epoch 12: Train Loss=1.7584, Train Acc=0.2009 | Test Loss=1.7578, Test Acc=0.2067
Epoch 13: Train Loss=1.7577, Train Acc=0.2060 | Test Loss=1.7575, Test Acc=0.2067
Epoch 14: Train Loss=1.7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2: Train Loss=1.7985, Train Acc=0.1701 | Test Loss=1.7957, Test Acc=0.1606
Epoch 3: Train Loss=1.7961, Train Acc=0.1607 | Test Loss=1.7929, Test Acc=0.1610
Epoch 4: Train Loss=1.7933, Train Acc=0.1642 | Test Loss=1.7901, Test Acc=0.1614
Epoch 5: Train Loss=1.7907, Train Acc=0.1579 | Test Loss=1.7874, Test Acc=0.1657
Epoch 6: Train Loss=1.7870, Train Acc=0.1732 | Test Loss=1.7847, Test Acc=0.2059
Epoch 7: Train Loss=1.7841, Train Acc=0.1849 | Test Loss=1.7819, Test Acc=0.2059
Epoch 8: Train Loss=1.7813, Train Acc=0.1833 | Test Loss=1.7790, Test Acc=0.2063
Epoch 9: Train Loss=1.7788, Train Acc=0.1959 | Test Loss=1.7761, Test Acc=0.2063
Epoch 10: Train Loss=1.7755, Train Acc=0.1998 | Test Loss=1.7730, Test Acc=0.2063
Epoch 11: Train Loss=1.7724, Train Acc=0.1978 | Test Loss=1.7697, Test Acc=0.2063
Epoch 12: Train Loss=1.7707, Train Acc=0.2029 | Test Loss=1.7664, Test Acc=0.2063
Epoch 13: Train Loss=1.7652, Train Acc=0.2084 | Test Loss=1.7632, Test Acc=0.1981
Epoch 14: Train Loss=1.7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss=1.7816, Train Acc=0.1799 | Test Loss=1.7783, Test Acc=0.1923
Epoch 4: Train Loss=1.7792, Train Acc=0.1935 | Test Loss=1.7764, Test Acc=0.1923
Epoch 5: Train Loss=1.7764, Train Acc=0.1857 | Test Loss=1.7745, Test Acc=0.1923
Epoch 6: Train Loss=1.7744, Train Acc=0.2004 | Test Loss=1.7726, Test Acc=0.1923
Epoch 7: Train Loss=1.7721, Train Acc=0.2004 | Test Loss=1.7708, Test Acc=0.1927
Epoch 8: Train Loss=1.7702, Train Acc=0.1877 | Test Loss=1.7688, Test Acc=0.1927
Epoch 9: Train Loss=1.7676, Train Acc=0.2170 | Test Loss=1.7668, Test Acc=0.1927
Epoch 10: Train Loss=1.7674, Train Acc=0.1877 | Test Loss=1.7648, Test Acc=0.1927
Epoch 11: Train Loss=1.7627, Train Acc=0.1877 | Test Loss=1.7628, Test Acc=0.1970
Epoch 12: Train Loss=1.7602, Train Acc=0.2102 | Test Loss=1.7611, Test Acc=0.1966
Epoch 13: Train Loss=1.7601, Train Acc=0.1975 | Test Loss=1.7604, Test Acc=0.1966
Epoch 14: Train Loss=1.7595, Train Acc=0.2209 | Test Loss=1.7614, Test Acc=0.1966
Epoch 15: Train Loss=1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
