In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df_train_full = pd.read_csv('train_full.tsv', sep='\t')
df_test_split = pd.read_csv('test_split.tsv', sep='\t')
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)
print(df_train_full.head())
print(df_test_split.head())

完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)
          id        label                                          statement  \
0  7193.json         true  Northern Virginia is the most heavily traffick...   
1  1608.json    half-true  The Democratic health care bill will "collect ...   
2  3844.json  mostly-true  Says for the first time ever, Texas lawmakers ...   
3  1104.json    half-true  Only 15 percent of drug users are African-Amer...   
4  3875.json  barely-true  Part of his ride was to warn the British that ...   

                                subjects             speaker  \
0                         transportation       bob-mcdonnell   
1                      health-care,taxes         todd-tiahrt   
2  education,state-budget,state-finances         wendy-davis   
3   crime,legal-issues,marijuana,pundits  arianna-huffington   
4                        history,pundits         sarah-palin   

                      job_title     state       party  barely_true  false  \
0                 

In [None]:
le = LabelEncoder()
y_train_full = le.fit_transform(df_train_full['label'])
y_test = le.transform(df_test_split['label'])
num_classes = len(le.classes_)

def tokenize(text):
    # 簡易做法：以空白為分隔
    return text.lower().split()

# 建立詞彙表
max_vocab_size = 10000
all_tokens = []
for text in df_train_full['statement']:
    all_tokens.extend(tokenize(text))

from collections import Counter
counter = Counter(all_tokens)
# 預留 2 個位置給 <PAD> 和 <UNK>
most_common = counter.most_common(max_vocab_size - 2)
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, freq in most_common:
    vocab[word] = len(vocab)
vocab_size = len(vocab)
print("詞彙大小:", vocab_size)

max_length = 50

def text_to_ids(text, vocab, max_length=50):
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(ids) < max_length:
        ids = ids + [vocab['<PAD>']] * (max_length - len(ids))
    else:
        ids = ids[:max_length]
    return ids

In [None]:
class LiarDataset(Dataset):
    def __init__(self, df, vocab, max_length, label_encoder):
        self.df = df.reset_index(drop=True)
        self.vocab = vocab
        self.max_length = max_length
        self.label_encoder = label_encoder
        self.texts = df['statement'].values
        self.labels = label_encoder.transform(df['label'])

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        input_ids = torch.tensor(text_to_ids(text, self.vocab, self.max_length), dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, label

def create_dataloader(df, vocab, max_length, label_encoder, batch_size, shuffle=True):
    dataset = LiarDataset(df, vocab, max_length, label_encoder)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
class EmbeddingDNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.5):
        super(EmbeddingDNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        # 平均池化 (mean pooling) -> (batch_size, embed_dim)
        pooled = embedded.mean(dim=1)
        
        # 全連接層
        out = self.dropout(pooled)
        out = self.fc1(out)
        out = torch.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out


def train_and_evaluate_model(df_train, df_test, embed_dim, hidden_dim, output_dim, dropout, num_epochs, batch_size, lr):
    # 建立 DataLoader
    train_loader = create_dataloader(df_train, vocab, max_length, le, batch_size, shuffle=True)
    test_loader = create_dataloader(df_test, vocab, max_length, le, batch_size, shuffle=False)
    
    model = EmbeddingDNN(vocab_size, embed_dim, hidden_dim, output_dim, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # 紀錄訓練過程
    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    epoch_list = []
    
    for epoch in range(1, num_epochs+1):
        # --- Training ---
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        
        train_loss = running_loss / total
        train_acc = correct / total
        
        # --- Testing ---
        model.eval()
        test_running_loss = 0.0
        test_correct = 0
        test_total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_running_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                test_correct += (predicted == labels).sum().item()
                test_total += labels.size(0)
        
        test_loss = test_running_loss / test_total
        test_acc = test_correct / test_total
        
        epoch_list.append(epoch)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        
        print(f"Epoch {epoch}: "
              f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
              f"Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")
    
    return model, epoch_list, train_losses, test_losses, train_accuracies, test_accuracies


In [None]:
fractions = [1.0, 0.5, 0.25, 0.1]

# 設定超參數
embed_dim = 128
hidden_dim = 128
output_dim = num_classes
dropout = 0.5
num_epochs = 100
batch_size = max_vocab_size
learning_rate = 0.001

for frac in fractions:
    num_train_samples = int(df_train_full.shape[0] * frac)
    df_train_subset = df_train_full.iloc[:num_train_samples].reset_index(drop=True)
    
    subset_percentage = int(frac * 100)
    df_train_subset.to_csv(f'train_subset_{subset_percentage}.tsv', sep='\t', index=False)
    
    print(f"\n=== 比例 {frac} (約 {num_train_samples} 筆資料) ===")
    model, epoch_list, train_losses, test_losses, train_accuracies, test_accuracies = \
        train_and_evaluate_model(
            df_train_subset, df_test_split,
            embed_dim, hidden_dim, output_dim,
            dropout, num_epochs, batch_size, learning_rate
        )
    
    # 繪製 Loss 與 Accuracy 圖表
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    
    axs[0].plot(epoch_list, train_losses, label='Train Loss')
    axs[0].plot(epoch_list, test_losses, label='Test Loss')
    axs[0].set_xlabel('Epoch')
    axs[0].set_ylabel('Loss')
    axs[0].set_title(f'Loss vs Epoch (Train Subset {subset_percentage}%)')
    axs[0].legend()
    
    axs[1].plot(epoch_list, train_accuracies, label='Train Accuracy')
    axs[1].plot(epoch_list, test_accuracies, label='Test Accuracy')
    axs[1].set_xlabel('Epoch')
    axs[1].set_ylabel('Accuracy')
    axs[1].set_title(f'Accuracy vs Epoch (Train Subset {subset_percentage}%)')
    axs[1].legend()
    
    plt.tight_layout()
    plt.savefig(f'training_progress_{subset_percentage}_dnn.png')
    plt.close()
    
    # 最終模型在測試集上的分類報告
    test_loader = create_dataloader(df_test_split, vocab, max_length, le, batch_size, shuffle=False)
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
    print("分類報告 (Train Subset {}%):".format(subset_percentage))
    print(classification_report(all_labels, all_preds, target_names=le.classes_))


完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)
詞彙大小: 10000

=== 比例 1.0 (約 10232 筆資料) ===
Epoch 1: Train Loss=1.7945, Train Acc=0.1642 | Test Loss=1.7896, Test Acc=0.1649
Epoch 2: Train Loss=1.7908, Train Acc=0.1716 | Test Loss=1.7864, Test Acc=0.1719
Epoch 3: Train Loss=1.7871, Train Acc=0.1825 | Test Loss=1.7836, Test Acc=0.1872
Epoch 4: Train Loss=1.7839, Train Acc=0.1877 | Test Loss=1.7812, Test Acc=0.1977
Epoch 5: Train Loss=1.7808, Train Acc=0.1977 | Test Loss=1.7789, Test Acc=0.2032
Epoch 6: Train Loss=1.7794, Train Acc=0.1931 | Test Loss=1.7768, Test Acc=0.2040
Epoch 7: Train Loss=1.7766, Train Acc=0.2018 | Test Loss=1.7747, Test Acc=0.2036
Epoch 8: Train Loss=1.7743, Train Acc=0.2006 | Test Loss=1.7728, Test Acc=0.2044
Epoch 9: Train Loss=1.7721, Train Acc=0.2039 | Test Loss=1.7710, Test Acc=0.2040
Epoch 10: Train Loss=1.7699, Train Acc=0.2071 | Test Loss=1.7694, Test Acc=0.2036
Epoch 11: Train Loss=1.7693, Train Acc=0.2059 | Test Loss=1.7678, Test Acc=0.2036
Epoch 12: Train Loss=1.767

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2: Train Loss=1.7932, Train Acc=0.1789 | Test Loss=1.7904, Test Acc=0.1938
Epoch 3: Train Loss=1.7904, Train Acc=0.1826 | Test Loss=1.7886, Test Acc=0.1946
Epoch 4: Train Loss=1.7896, Train Acc=0.1918 | Test Loss=1.7869, Test Acc=0.1954
Epoch 5: Train Loss=1.7872, Train Acc=0.1964 | Test Loss=1.7852, Test Acc=0.1938
Epoch 6: Train Loss=1.7863, Train Acc=0.1935 | Test Loss=1.7837, Test Acc=0.1930
Epoch 7: Train Loss=1.7839, Train Acc=0.1970 | Test Loss=1.7822, Test Acc=0.1915
Epoch 8: Train Loss=1.7821, Train Acc=0.2088 | Test Loss=1.7807, Test Acc=0.1927
Epoch 9: Train Loss=1.7808, Train Acc=0.1962 | Test Loss=1.7792, Test Acc=0.1946
Epoch 10: Train Loss=1.7785, Train Acc=0.1984 | Test Loss=1.7778, Test Acc=0.1993
Epoch 11: Train Loss=1.7770, Train Acc=0.2002 | Test Loss=1.7765, Test Acc=0.1989
Epoch 12: Train Loss=1.7757, Train Acc=0.2035 | Test Loss=1.7752, Test Acc=0.1973
Epoch 13: Train Loss=1.7743, Train Acc=0.2084 | Test Loss=1.7739, Test Acc=0.1989
Epoch 14: Train Loss=1.7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss=1.8003, Train Acc=0.1118 | Test Loss=1.7967, Test Acc=0.1032
Epoch 4: Train Loss=1.7953, Train Acc=0.1286 | Test Loss=1.7945, Test Acc=0.1204
Epoch 5: Train Loss=1.7943, Train Acc=0.1403 | Test Loss=1.7923, Test Acc=0.1391
Epoch 6: Train Loss=1.7911, Train Acc=0.1673 | Test Loss=1.7902, Test Acc=0.1719
Epoch 7: Train Loss=1.7900, Train Acc=0.1673 | Test Loss=1.7882, Test Acc=0.1899
Epoch 8: Train Loss=1.7869, Train Acc=0.1884 | Test Loss=1.7862, Test Acc=0.2005
Epoch 9: Train Loss=1.7844, Train Acc=0.2029 | Test Loss=1.7843, Test Acc=0.2036
Epoch 10: Train Loss=1.7827, Train Acc=0.2142 | Test Loss=1.7824, Test Acc=0.2118
Epoch 11: Train Loss=1.7810, Train Acc=0.2029 | Test Loss=1.7805, Test Acc=0.2122
Epoch 12: Train Loss=1.7777, Train Acc=0.2142 | Test Loss=1.7787, Test Acc=0.2145
Epoch 13: Train Loss=1.7759, Train Acc=0.2217 | Test Loss=1.7769, Test Acc=0.2138
Epoch 14: Train Loss=1.7724, Train Acc=0.2260 | Test Loss=1.7752, Test Acc=0.2153
Epoch 15: Train Loss=1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== 比例 0.1 (約 1023 筆資料) ===
Epoch 1: Train Loss=1.7937, Train Acc=0.1916 | Test Loss=1.7899, Test Acc=0.1973
Epoch 2: Train Loss=1.7909, Train Acc=0.1906 | Test Loss=1.7884, Test Acc=0.1981
Epoch 3: Train Loss=1.7892, Train Acc=0.1926 | Test Loss=1.7869, Test Acc=0.2009
Epoch 4: Train Loss=1.7860, Train Acc=0.2004 | Test Loss=1.7854, Test Acc=0.1997
Epoch 5: Train Loss=1.7830, Train Acc=0.2111 | Test Loss=1.7840, Test Acc=0.1989
Epoch 6: Train Loss=1.7793, Train Acc=0.2297 | Test Loss=1.7826, Test Acc=0.1993
Epoch 7: Train Loss=1.7795, Train Acc=0.2170 | Test Loss=1.7813, Test Acc=0.1985
Epoch 8: Train Loss=1.7766, Train Acc=0.2209 | Test Loss=1.7800, Test Acc=0.1989
Epoch 9: Train Loss=1.7742, Train Acc=0.2072 | Test Loss=1.7787, Test Acc=0.1970
Epoch 10: Train Loss=1.7730, Train Acc=0.2082 | Test Loss=1.7774, Test Acc=0.2001
Epoch 11: Train Loss=1.7717, Train Acc=0.2170 | Test Loss=1.7762, Test Acc=0.1977
Epoch 12: Train Loss=1.7711, Train Acc=0.2268 | Test Loss=1.7750, Test Acc=0.1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
