In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# transformers
from transformers import BertTokenizer, BertModel


# Seed PyTorch first and NumPy second so repeated runs draw the same random numbers.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both splits are tab-separated files; the training file is read before the test file.
df_train_full, df_test_split = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train_full.tsv', 'test_split.tsv')
)

# Quick sanity check: shape of each split and the first rows of the training data.
print("完整訓練集形狀:", df_train_full.shape)
print("測試集形狀:", df_test_split.shape)
print(df_train_full.head())


完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)
          id        label                                          statement  \
0  7193.json         true  Northern Virginia is the most heavily traffick...   
1  1608.json    half-true  The Democratic health care bill will "collect ...   
2  3844.json  mostly-true  Says for the first time ever, Texas lawmakers ...   
3  1104.json    half-true  Only 15 percent of drug users are African-Amer...   
4  3875.json  barely-true  Part of his ride was to warn the British that ...   

                                subjects             speaker  \
0                         transportation       bob-mcdonnell   
1                      health-care,taxes         todd-tiahrt   
2  education,state-budget,state-finances         wendy-davis   
3   crime,legal-issues,marijuana,pundits  arianna-huffington   
4                        history,pundits         sarah-palin   

                      job_title     state       party  barely_true  false  \
0                 

In [None]:
# Learn the label -> integer-id mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder()
le.fit(df_train_full['label'])
y_train_full = le.transform(df_train_full['label'])
y_test = le.transform(df_test_split['label'])
num_classes = len(le.classes_)

# Tokenizer settings shared by every dataset built later in the notebook.
pretrained_model_name = "bert-base-uncased"
max_length = 128  # maximum sequence length per sample
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one statement per lookup.

    Args:
        df: Data frame with a ``statement`` column (text) and a ``label``
            column (class name). Its index is reset, so any index is accepted.
        label_encoder: Object whose ``transform`` turns the ``label`` column
            into integer class ids.
        tokenizer: Callable invoked with the keyword arguments used in
            ``__getitem__`` and returning a mapping that contains
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, label_encoder, tokenizer, max_length=128):
        frame = df.reset_index(drop=True)
        self.df = frame
        self.texts = frame['statement'].tolist()
        self.labels = label_encoder.transform(frame['label'])
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying data frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``."""
        # Tokenization happens lazily, once per requested sample.
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis of size 1
        # (return_tensors='pt'); drop it so the DataLoader can stack samples.
        sample = {
            field: encoded[field].squeeze(0).long()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def create_dataloader(df, label_encoder, tokenizer, max_length, batch_size, shuffle=True):
    """Wrap ``df`` in a ``BERTDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Data frame handed to ``BERTDataset``.
        label_encoder: Encoder handed to ``BERTDataset``.
        tokenizer: Tokenizer handed to ``BERTDataset``.
        max_length: Sequence length handed to ``BERTDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles samples (default ``True``).

    Returns:
        A ``DataLoader`` yielding batches from the dataset.
    """
    return DataLoader(
        BERTDataset(df, label_encoder, tokenizer, max_length),
        batch_size=batch_size,
        shuffle=shuffle,
    )


class BertDNN(nn.Module):
    """Pretrained BERT encoder followed by a two-layer feed-forward classifier.

    Args:
        pretrained_model_name: Name passed to ``BertModel.from_pretrained``.
        hidden_dim: Width of the classifier's hidden layer.
        num_classes: Number of output logits.
        freeze_bert: When true, every encoder parameter has ``requires_grad``
            switched off so only the classifier head can be trained.
    """

    def __init__(self, pretrained_model_name, hidden_dim, num_classes, freeze_bert=False):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)

        # Optionally freeze the encoder weights.
        if freeze_bert:
            self.bert.requires_grad_(False)

        # The head's input width follows the encoder's configured hidden size.
        encoder_width = self.bert.config.hidden_size

        # Classifier head: Linear -> ReLU -> Dropout(0.5) -> Linear.
        head_layers = [
            nn.Linear(encoder_width, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch of token ids and masks."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Only the encoder's `pooler_output` is fed to the classification head.
        return self.classifier(encoder_out.pooler_output)

In [None]:
def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch. Without one the model is put in eval mode and gradients are
    disabled. Both metrics are averaged per sample; if the loader yields no
    samples, both are returned as NaN instead of dividing by zero.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)
            labels = batch['label'].to(run_device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            loss_sum += loss.item() * input_ids.size(0)
            _, predicted = torch.max(outputs, 1)
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

    if n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_seen, n_correct / n_seen


def train_and_evaluate(model, train_loader, test_loader, num_epochs, lr=1e-4):
    """Train ``model`` with Adam and cross-entropy, evaluating after each epoch.

    Batches are moved to the device the model's parameters live on (CPU if the
    model has no parameters), so the function does not rely on a notebook-level
    ``device`` variable.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits of shape (batch, num_classes).
        train_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        test_loader: Iterable with the same batch layout, used for evaluation.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        ``(epoch_list, train_losses, test_losses, train_accuracies,
        test_accuracies)``, each a list with one entry per epoch. Training
        metrics are accumulated during the training pass itself (train mode).
        A loader that yields no samples produces NaN metrics for that epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    epoch_list = []

    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, run_device, optimizer)
        test_loss, test_acc = _run_epoch(model, test_loader, criterion, run_device)

        epoch_list.append(epoch)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f"Epoch {epoch}: "
              f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
              f"Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")

    return epoch_list, train_losses, test_losses, train_accuracies, test_accuracies


In [None]:
fractions = [1.0, 0.5, 0.25, 0.1]

# Hyperparameters
hidden_dim = 128
num_epochs = 3
batch_size = 64
learning_rate = 5e-5


def _plot_training_curves(epoch_list, train_losses, test_losses,
                          train_accuracies, test_accuracies, subset_percentage):
    """Save a two-panel loss/accuracy figure for one training-subset run.

    The figure is written to ``training_progress_{subset_percentage}_bert_dnn.png``
    and then closed so figures do not accumulate across fractions.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Loss panel
    loss_ax.plot(epoch_list, train_losses, label='Train Loss')
    loss_ax.plot(epoch_list, test_losses, label='Test Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title(f'Loss vs Epoch (Train Subset {subset_percentage}%)')
    loss_ax.legend()

    # Accuracy panel
    acc_ax.plot(epoch_list, train_accuracies, label='Train Acc')
    acc_ax.plot(epoch_list, test_accuracies, label='Test Acc')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title(f'Accuracy vs Epoch (Train Subset {subset_percentage}%)')
    acc_ax.legend()

    fig.tight_layout()
    fig.savefig(f'training_progress_{subset_percentage}_bert_dnn.png')
    plt.close(fig)


def _collect_predictions(model, loader):
    """Return ``(true_labels, predicted_labels)`` over ``loader`` in eval mode."""
    true_labels = []
    predicted_labels = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, 1)
            predicted_labels.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return true_labels, predicted_labels


# The test split is identical for every fraction, so its loader is built once.
test_loader = create_dataloader(df_test_split, le, tokenizer, max_length, batch_size, shuffle=False)

model = None
for frac in fractions:
    num_train_samples = int(df_train_full.shape[0] * frac)
    # The subset is the first `num_train_samples` rows, not a random sample.
    df_train_subset = df_train_full.iloc[:num_train_samples].reset_index(drop=True)

    # round() guards against float error (e.g. 0.29 * 100 == 28.999...).
    subset_percentage = int(round(frac * 100))
    df_train_subset.to_csv(f'train_subset_{subset_percentage}.tsv', sep='\t', index=False)

    print(f"\n=== 比例 {frac} (約 {num_train_samples} 筆資料) ===")

    # Build the training DataLoader for this subset.
    train_loader = create_dataloader(df_train_subset, le, tokenizer, max_length, batch_size, shuffle=True)

    # Release the previous fraction's model before allocating a fresh one so
    # two full BERT copies are never referenced at the same time.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Initialise BERT + DNN from the pretrained weights for every fraction.
    model = BertDNN(pretrained_model_name, hidden_dim, num_classes, freeze_bert=False).to(device)

    # Train and evaluate.
    epoch_list, train_losses, test_losses, train_accuracies, test_accuracies = \
        train_and_evaluate(model, train_loader, test_loader, num_epochs, lr=learning_rate)

    # Plot and save the learning curves.
    _plot_training_curves(epoch_list, train_losses, test_losses,
                          train_accuracies, test_accuracies, subset_percentage)

    # Classification report for the final model.
    all_labels, all_preds = _collect_predictions(model, test_loader)

    print(f"分類報告 (Train Subset {subset_percentage}%):")
    # `labels` pins the report to every known class so `target_names` always
    # lines up, and `zero_division=0` reports 0 for a never-predicted class
    # without emitting UndefinedMetricWarning.
    print(classification_report(
        all_labels,
        all_preds,
        labels=np.arange(num_classes),
        target_names=le.classes_,
        zero_division=0,
    ))


  from .autonotebook import tqdm as notebook_tqdm


完整訓練集形狀: (10232, 14)
測試集形狀: (2559, 14)

=== 比例 1.0 (約 10232 筆資料) ===
Epoch 1: Train Loss=1.7628, Train Acc=0.2040 | Test Loss=1.7230, Test Acc=0.2352
Epoch 2: Train Loss=1.7181, Train Acc=0.2341 | Test Loss=1.6936, Test Acc=0.2567
Epoch 3: Train Loss=1.6284, Train Acc=0.2937 | Test Loss=1.6922, Test Acc=0.2712
分類報告 (Train Subset 100%):
              precision    recall  f1-score   support

 barely-true       0.26      0.22      0.24       421
       false       0.32      0.33      0.32       501
   half-true       0.24      0.25      0.25       526
 mostly-true       0.28      0.40      0.33       491
  pants-fire       0.27      0.15      0.20       209
        true       0.25      0.18      0.21       411

    accuracy                           0.27      2559
   macro avg       0.27      0.26      0.26      2559
weighted avg       0.27      0.27      0.27      2559


=== 比例 0.5 (約 5116 筆資料) ===
Epoch 1: Train Loss=1.7614, Train Acc=0.2113 | Test Loss=1.7229, Test Acc=0.2450
Epoch 2: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
