In [12]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): none of the versions are pinned, so a fresh install may pull
# releases that differ from the ones this notebook was originally run with.
%pip install weave wandb
%pip install torch transformers pandas scikit-learn datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [13]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import time
import wandb
import numpy as np

In [14]:
# --- Configuration ---
# Single source of truth for every tunable of the run. The same mapping is
# passed to W&B as the run config, so keep the values plain and serialisable.
config = dict(
    model_name='roberta-base',      # checkpoint name for tokenizer and model
    max_len=256,                    # truncation length used when tokenizing
    batch_size=16,                  # batch size of both dataloaders
    epochs=3,                       # number of passes over the training split
    learning_rate=2e-5,             # optimizer learning rate
    test_set_size=0.2,              # fraction of rows held out for evaluation
    random_state=42,                # seed for the train/test split
    num_labels=2,                   # number of target classes
    wandb_project="fake-news-roberta-classification-v2",
    wandb_entity=None,
    csv_filename="train.csv",       # CSV file the dataset is loaded from
)

In [15]:
import os
from getpass import getpass

# Credentials must never be committed to the notebook. Use the key already
# exported in the environment; only when it is missing, ask for it
# interactively (getpass does not echo the input, so it cannot end up in the
# saved cell output).
# NOTE(review): the key that used to be hardcoded in this cell has been
# exposed to everyone with access to the notebook and should be revoked and
# replaced in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Enter your W&B API key: ")

In [16]:
import wandb

# Start the W&B run. The project name comes from the dedicated
# `wandb_project` config entry (previously the model name was passed here,
# which filed the run under a project called "roberta-base" and left
# `wandb_project` unused). The whole config dict is attached to the run so
# the hyperparameters are recorded alongside the metrics.
wandb.init(
    project=config["wandb_project"],
    entity=config["wandb_entity"],
    config=config,
)
print("W&B Initialized successfully.")

W&B Initialized successfully.


In [17]:
# Show the kernel's working directory; the relative CSV filename in `config`
# is resolved against it when the dataset is loaded.
%pwd

'/data/yaohli/workspace/matheval_projects/datasets/nlpclass/NLPGroup12'

In [22]:
from datasets import load_dataset, ClassLabel
from transformers import RobertaTokenizerFast

# Read the whole CSV as a single split; the train/test split is made later.
raw_dataset = load_dataset(
    'csv',
    data_files=config['csv_filename'],
    split='train',
)

def combine_text_features(examples):
    """Add a ``full_text`` column that joins each row's title and text.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists of values. Missing (``None``) titles/texts become empty
    strings and every other value is converted with ``str``. The batch is
    updated in place and returned.

    NOTE(review): the literal " [SEP] " is used as the separator; RoBERTa's
    own separator token is presumably ``</s>``, so this string is likely
    tokenized as ordinary text - confirm this is intended.
    """
    def _as_text(value):
        return "" if value is None else str(value)

    merged = []
    for raw_title, raw_body in zip(examples['title'], examples['text']):
        merged.append(_as_text(raw_title) + " [SEP] " + _as_text(raw_body))

    examples['full_text'] = merged
    return examples

# Build the combined `full_text` column batch-wise and drop the two source
# columns it was derived from.
raw_dataset = raw_dataset.map(
    combine_text_features,
    batched=True,
    remove_columns=['title', 'text'],
)


In [23]:
print("Tokenizing data...")
tokenizer = RobertaTokenizerFast.from_pretrained(config['model_name'])


def tokenize_function(examples):
    """Tokenize a batch of ``full_text`` strings.

    Sequences are truncated to ``config['max_len']`` tokens and left
    unpadded; padding is applied later, per batch, by the data collator.
    """
    return tokenizer(
        examples['full_text'],
        truncation=True,
        padding=False,
        max_length=config['max_len'],
    )


# The raw text column is dropped once it has been turned into token ids.
tokenized_dataset = raw_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['full_text'],
)


Tokenizing data...


In [24]:
# The model's forward pass is called with `labels=...`, so make sure the
# target column carries that name (rename only when it is still "label").
if 'labels' not in tokenized_dataset.column_names and 'label' in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Cast the targets to a ClassLabel feature before the split below, which
# stratifies on that column.
label_feature = ClassLabel(
    num_classes=config['num_labels'],
    names=['Fake (0)', 'Real (1)'],
)
tokenized_dataset = tokenized_dataset.cast_column("labels", label_feature)

print("Splitting dataset...")
split_dataset = tokenized_dataset.train_test_split(
    test_size=config["test_set_size"],
    seed=config["random_state"],
    stratify_by_column="labels",
)

# Only the columns the model consumes are exposed, as torch tensors.
split_dataset.set_format(
    type="torch",
    columns=['input_ids', 'attention_mask', 'labels'],
)

Casting the dataset: 100%|██████████| 57106/57106 [00:00<00:00, 88244.66 examples/s] 

Splitting dataset...





In [25]:
# Display names for class ids 0 and 1, used in the final classification report.
label_names = ['Fake (0)', 'Real (1)']

# Convenience handles for the two halves of the split.
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

print(f"{len(train_dataset)} train / {len(test_dataset)} test samples")

45684 train / 11422 test samples


In [26]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    RobertaForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [27]:
# Seed torch's global RNG before anything random happens, so the freshly
# initialised classification head, dropout and the training shuffle order
# are repeatable as far as torch's RNG is concerned.
torch.manual_seed(config['random_state'])

# Pretrained encoder with a new classification head of `num_labels` outputs.
model = RobertaForSequenceClassification.from_pretrained(
    config['model_name'],
    num_labels=config['num_labels']
)
model.to(device)

# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is
# deprecated in favour of it. `eps` and `weight_decay` are set explicitly to
# the defaults of the transformers implementation so the optimisation
# behaviour stays the same (torch's own defaults are 1e-8 and 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    eps=1e-6,
    weight_decay=0.0,
)

# Examples were tokenized without padding; the collator pads each batch to
# the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Creating dataloaders...")
train_loader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    collate_fn=data_collator
)
# Evaluation data is iterated in its stored order (no shuffling).
test_loader = DataLoader(
    test_dataset,
    batch_size=config['batch_size'],
    collate_fn=data_collator
)

# Linear decay of the learning rate to zero over every optimizer step of the
# run, with no warm-up phase.
total_steps = len(train_loader) * config['epochs']
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Creating dataloaders...




In [28]:
import torch
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import time
import wandb
import numpy as np
import os # Import os for directory handling

# Assume 'config', 'model', 'train_loader', 'test_loader', 'optimizer', 'scheduler', 'device', 'label_names', 'tokenizer' are defined

# --- Model Saving Setup ---
# The best checkpoint (model + tokenizer) is written to a directory named
# after the base model, next to the notebook.
model_save_path = f"./{config['model_name']}_checkpoints"
os.makedirs(model_save_path, exist_ok=True)  # Create directory if it doesn't exist
print(f"Model checkpoints will be saved to: {model_save_path}")
# --- End Model Saving Setup ---


print(f"Starting training for {config['epochs']} epochs...")
overall_start_time = time.time()

# Highest evaluation accuracy seen so far; a checkpoint is saved whenever it
# is beaten.
best_val_accuracy = 0.0
# Results of the last epoch, kept for the final report in the next cell.
final_predictions = []
final_true_labels = []
final_avg_eval_loss = 0.0
final_accuracy = 0.0


for epoch in range(config['epochs']):
    epoch_start_time = time.time()
    print(f"\n--- Epoch {epoch + 1}/{config['epochs']} ---")

    # --- Training ---
    model.train()
    total_train_loss = 0
    for batch_num, batch in enumerate(train_loader):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Passing `labels` makes the model return the loss with the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        if loss is None or torch.isnan(loss) or torch.isinf(loss):
            # Skip the whole update for an unusable loss. Note that the
            # scheduler is not stepped for a skipped batch either.
            print(f"Warning: Invalid loss in train batch {batch_num}. Skipping.")
            continue

        # Read the scalar once; it is needed for both the running total and
        # the per-step log.
        loss_value = loss.item()
        total_train_loss += loss_value
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if wandb.run:
            wandb.log({"train_loss_step": loss_value, "learning_rate_step": optimizer.param_groups[0]['lr']})

        # Print less frequently
        if (batch_num + 1) % 100 == 0:
            print(f"  Train Batch {batch_num + 1}/{len(train_loader)}")

    # Averaged over all batches of the loader (skipped batches contribute 0).
    avg_train_loss = total_train_loss / len(train_loader) if len(train_loader) > 0 else 0.0

    # --- Evaluation ---
    print(f"--- Evaluating Epoch {epoch + 1} ---")
    model.eval()
    epoch_predictions = []
    epoch_true_labels = []
    total_eval_loss = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            if loss is not None:
                total_eval_loss += loss.item()

            # Predicted class = index of the largest logit per example.
            batch_preds = torch.argmax(logits, dim=1)
            epoch_predictions.extend(batch_preds.cpu().tolist())
            epoch_true_labels.extend(labels.cpu().tolist())

    avg_eval_loss = total_eval_loss / len(test_loader) if len(test_loader) > 0 else 0.0

    # --- Calculate Metrics ---
    if not epoch_true_labels or not epoch_predictions:
        print("Warning: No evaluation predictions made this epoch.")
        accuracy, precision_weighted, recall_weighted, f1_weighted = 0.0, 0.0, 0.0, 0.0
        precision_macro, recall_macro, f1_macro = 0.0, 0.0, 0.0
    else:
        accuracy = accuracy_score(epoch_true_labels, epoch_predictions)
        precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
            epoch_true_labels, epoch_predictions, average='weighted', zero_division=0
        )
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            epoch_true_labels, epoch_predictions, average='macro', zero_division=0
        )

    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_eval_loss:.4f} | Val Acc: {accuracy:.4f} | Time: {epoch_time:.2f}s")

    # --- Log epoch metrics to W&B ---
    # No explicit `step` here: the per-batch logs above advance W&B's
    # automatic step counter far beyond `epoch + 1`, and W&B ignores data
    # logged to a step lower than the current one. The epoch number is still
    # recorded as the "epoch" metric and can be chosen as the x-axis.
    if wandb.run:
        wandb.log({
            "epoch": epoch + 1,
            "avg_train_loss": avg_train_loss,
            "avg_val_loss": avg_eval_loss,
            "val_accuracy": accuracy,
            "val_precision_weighted": precision_weighted,
            "val_recall_weighted": recall_weighted,
            "val_f1_weighted": f1_weighted,
            "val_precision_macro": precision_macro,
            "val_recall_macro": recall_macro,
            "val_f1_macro": f1_macro,
            "epoch_duration_sec": epoch_time,
        })

    # --- Check for improvement and Save Model ---
    if accuracy > best_val_accuracy:
        print(f"  New best val accuracy: {accuracy:.4f}. Saving model to {model_save_path}")
        best_val_accuracy = accuracy
        # Save the model and tokenizer (overwrites the previous best)
        model.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        # --- End Model Saving ---

    # Store results from the last epoch for final report outside the loop.
    # These describe the last epoch, which is not necessarily the epoch
    # whose checkpoint was saved as best.
    if epoch == config['epochs'] - 1:
        final_predictions = epoch_predictions
        final_true_labels = epoch_true_labels
        final_avg_eval_loss = avg_eval_loss
        final_accuracy = accuracy


# --- End of Training Loop ---
total_training_time = time.time() - overall_start_time
print(f"\nTotal Training Time: {total_training_time:.2f}s")



Model checkpoints will be saved to: ./roberta-base_checkpoints
Starting training for 3 epochs...

--- Epoch 1/3 ---
  Train Batch 100/2856
  Train Batch 200/2856
  Train Batch 300/2856
  Train Batch 400/2856
  Train Batch 500/2856
  Train Batch 600/2856
  Train Batch 700/2856
  Train Batch 800/2856
  Train Batch 900/2856
  Train Batch 1000/2856
  Train Batch 1100/2856
  Train Batch 1200/2856
  Train Batch 1300/2856
  Train Batch 1400/2856
  Train Batch 1500/2856
  Train Batch 1600/2856
  Train Batch 1700/2856
  Train Batch 1800/2856
  Train Batch 1900/2856
  Train Batch 2000/2856
  Train Batch 2100/2856
  Train Batch 2200/2856
  Train Batch 2300/2856
  Train Batch 2400/2856
  Train Batch 2500/2856
  Train Batch 2600/2856
  Train Batch 2700/2856
  Train Batch 2800/2856
--- Evaluating Epoch 1 ---
Epoch 1 | Train Loss: 0.1617 | Val Loss: 0.1398 | Val Acc: 0.9610 | Time: 992.83s
  New best val accuracy: 0.9610. Saving model to ./roberta-base_checkpoints

--- Epoch 2/3 ---
  Train Batch 100



In [29]:
from sklearn.metrics import classification_report
import wandb
import numpy as np

# Per-class report for the predictions kept from the last training epoch.
# When either list is empty (nothing was evaluated), print a placeholder
# instead.
if final_true_labels and final_predictions:
    final_report_string = classification_report(
        final_true_labels,
        final_predictions,
        target_names=label_names,
        zero_division=0,
        digits=4,
    )
else:
    final_report_string = "N/A - No predictions available."

print(final_report_string)

              precision    recall  f1-score   support

    Fake (0)     0.9987    0.9308    0.9636      6000
    Real (1)     0.9288    0.9987    0.9625      5422

    accuracy                         0.9631     11422
   macro avg     0.9638    0.9648    0.9630     11422
weighted avg     0.9656    0.9631    0.9631     11422

