# Extended Subproject - Fine Tuning

In [1]:
import pandas as pd
import torch
from torch.optim import AdamW  # Import PyTorch's AdamW

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForTokenClassification , AdamW, BertTokenizerFast
#from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW

from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Load the datasets
train_data = pd.read_csv("cnndm/train_data_ext.csv", sep=';')
valid_data = pd.read_csv("cnndm/valid_data_ext.csv", sep=';')
test_data = pd.read_csv("cnndm/test_data_ext.csv", sep=';')

In [3]:
print(train_data.isna().sum())
missing_rows = train_data[train_data.isna().any(axis=1)]
train_data.head()

Unnamed: 0    0
article       0
highlights    0
label         0
dtype: int64


Unnamed: 0.1,Unnamed: 0,article,highlights,label
0,1,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,0
1,2,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,0
2,3,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",0
3,4,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",0
4,5,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",0


In [4]:
train_data.tail()

Unnamed: 0.1,Unnamed: 0,article,highlights,label
13789,69951,"ATLANTA, Georgia (CNN) -- Shoes tell a lot abo...",Some people get [B-hallucinated] rare diseases...,1
13790,69971,"ATLANTA, Georgia (CNN) -- An advisory panel ...",Panel recommends health care workers wear [B-h...,1
13791,69981,Washington (CNN) -- President Obama toasted a ...,NEW: [B-hallucinated]Obama unexpectedly breaks...,1
13792,69991,Editor's note: Tananarive Due is an American B...,Tananarive Due: [B-hallucinated]admits her fam...,1
13793,70001,"(CNN) -- Health officials say the H1N1 virus, ...",CDC reported higher levels of flu activity tha...,1


## Model and Tokenizer Loading

In [5]:
# Load the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("prajjwal1/bert-tiny")
model = BertForTokenClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare Data for Fine-Tuning

In [6]:
class TokenLevelDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []
        self.skipped_count = 0  # Counter for rows skipped due to document length
        self.skipped_bc_chunk = 0  # Counter for rows skipped due to chunking issues

        self._create_examples()

    def _create_examples(self):
        for _, row in self.data.iterrows():
            doc = row['article']
            summ = row['highlights']

            # Replace [B-hallucinated] and [E-hallucinated] with markers
            summ = summ.replace("[B-hallucinated]", " B_hall ").replace("[E-hallucinated]", " E_hall ")

            # Tokenize the document with offsets
            doc_tokenized = self.tokenizer(doc, padding=False, truncation=False)
            doc_tokens = doc_tokenized["input_ids"]

            # Tokenize the summary with offsets
            summ_tokenized = self.tokenizer(
                summ, padding=False, truncation=False, return_offsets_mapping=True
            )
            summ_tokens = summ_tokenized["input_ids"]
            summ_offsets = summ_tokenized["offset_mapping"]

            if len(doc_tokens) + 3 > self.max_length:  # Check if document alone exceeds the max length
                self.skipped_count += 1
                continue

            # Helper function to check if a chunk fits within max_length
            def chunk_fits(tokens_chunk):
                return len(doc_tokens) + len(tokens_chunk) + 3 <= self.max_length

            # Case 1: Full summary fits
            if chunk_fits(summ_tokens):
                self._add_example(doc_tokens, summ_tokens, summ_offsets)
            else:
                # Case 2: Try splitting into halves
                mid = len(summ_tokens) // 2
                if chunk_fits(summ_tokens[:mid]) and chunk_fits(summ_tokens[mid:]):
                    for chunk, offsets in zip([summ_tokens[:mid], summ_tokens[mid:]],
                                            [summ_offsets[:mid], summ_offsets[mid:]]):
                        self._add_example(doc_tokens, chunk, offsets)
                else:
                    # Case 3: Try splitting into thirds
                    third = len(summ_tokens) // 3
                    chunks = [summ_tokens[:third], summ_tokens[third:2 * third], summ_tokens[2 * third:]]
                    offsets_chunks = [
                        summ_offsets[:third],
                        summ_offsets[third:2 * third],
                        summ_offsets[2 * third:]
                    ]
                    if all(chunk_fits(chunk) for chunk in chunks):
                        for chunk, offsets in zip(chunks, offsets_chunks):
                            self._add_example(doc_tokens, chunk, offsets)
                    else:
                        # Case 4: Skip if none of the strategies work
                        self.skipped_bc_chunk += 1

    def _add_example(self, doc_tokens, summ_tokens, summ_offsets):
        input_ids = [self.tokenizer.cls_token_id] + doc_tokens + \
                    [self.tokenizer.sep_token_id] + summ_tokens + \
                    [self.tokenizer.sep_token_id]

        attention_mask = [1] * len(input_ids)

        # Generate token-level labels
        token_labels = [0] * len(input_ids)  # Initialize all labels as 0
        for idx, (start, end) in enumerate(summ_offsets):
            if start == 0 and end == 0:  # Skip special tokens
                continue
            token_text = self.tokenizer.convert_ids_to_tokens([summ_tokens[idx]])[0]
            if token_text in ["B_hall", "E_hall"]:
                token_labels[idx + len(doc_tokens) + 2] = 1  # Adjust index for document and special tokens

        # Pad if necessary
        if len(input_ids) < self.max_length:
            pad_length = self.max_length - len(input_ids)
            input_ids += [self.tokenizer.pad_token_id] * pad_length
            attention_mask += [0] * pad_length
            token_labels += [-100] * pad_length  # Ignore padded positions in loss computation

        self.examples.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_labels": token_labels
        })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        return {
            "input_ids": torch.tensor(example["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(example["attention_mask"], dtype=torch.long),
            "token_labels": torch.tensor(example["token_labels"], dtype=torch.long)
        }


In [7]:
# Parameters
MAX_LEN = 512 # 2048 # das setzen wir als balance zwischen wie viele padding brauchen und wie viele rausfallen

# Prepare datasets
train_dataset = TokenLevelDataset(train_data, tokenizer, max_length=MAX_LEN)
valid_dataset = TokenLevelDataset(valid_data, tokenizer, max_length=MAX_LEN)
test_dataset = TokenLevelDataset(test_data, tokenizer, max_length=MAX_LEN)


In [8]:
print(f'Train: {train_dataset.skipped_count}/{train_data.shape[0]} ~ {train_dataset.skipped_count/train_data.shape[0] * 100} %')
print(f'Test: {test_dataset.skipped_count}/{test_data.shape[0]} ~ {test_dataset.skipped_count/test_data.shape[0] * 100} %')
print(f'Valid: {valid_dataset.skipped_count}/{valid_data.shape[0]} ~ {valid_dataset.skipped_count/valid_data.shape[0] * 100} %')

Train: 10082/13794 ~ 73.08974916630419 %
Test: 1350/1974 ~ 68.38905775075987 %
Valid: 1412/1966 ~ 71.82095625635809 %


In [9]:
print(f'Train: {train_dataset.skipped_bc_chunk}/{train_data.shape[0]} ~ {train_dataset.skipped_bc_chunk/train_data.shape[0] * 100} %')
print(f'Test: {test_dataset.skipped_bc_chunk}/{test_data.shape[0]} ~ {test_dataset.skipped_bc_chunk/test_data.shape[0] * 100} %')
print(f'Valid: {valid_dataset.skipped_bc_chunk}/{valid_data.shape[0]} ~ {valid_dataset.skipped_bc_chunk/valid_data.shape[0] * 100} %')

Train: 383/13794 ~ 2.7765695229810063 %
Test: 42/1974 ~ 2.127659574468085 %
Valid: 40/1966 ~ 2.034587995930824 %


In [10]:
BATCH_SIZE = 16

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

# Loss function
criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)

cuda




In [15]:
def train_epoch(model, data_loader, optimizer, criterion, device):
    model.train()
    losses = []
    correct_predictions = 0
    total_tokens = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_labels = batch['token_labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, seq_len, num_labels)

        # Flatten logits and labels for loss computation
        logits = logits.view(-1, logits.shape[-1])  # Shape: (batch_size * seq_len, num_labels)
        labels = token_labels.view(-1)  # Shape: (batch_size * seq_len)

        # Debugging: print shapes
        #print(f"Logits shape: {logits.shape}, Labels shape: {labels.shape}")

        # Compute loss
        loss = criterion(logits, labels)
        losses.append(loss.item())

        # Predictions
        preds = torch.argmax(logits, dim=1)  # Shape: (batch_size * seq_len)
        mask = labels != -100  # Ignore padded labels
        correct_predictions += torch.sum((preds == labels) & mask)
        total_tokens += torch.sum(mask)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Avoid division by zero
    if total_tokens == 0:
        raise ValueError("Total tokens processed is zero. Check your token labels or dataset.")

    accuracy = correct_predictions.double() / total_tokens
    return accuracy, sum(losses) / len(losses)


In [16]:
def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct_predictions = 0
    total_tokens = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_labels = batch['token_labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Shape: (batch_size, seq_len, num_labels)

            # Flatten logits and labels for loss computation
            logits = logits.view(-1, logits.shape[-1])  # Shape: (batch_size * seq_len, num_labels)
            labels = token_labels.view(-1)  # Shape: (batch_size * seq_len)

            # Compute loss
            loss = criterion(logits, labels)
            losses.append(loss.item())

            # Predictions
            preds = torch.argmax(logits, dim=1)  # Shape: (batch_size * seq_len)
            mask = labels != -100  # Ignore padded labels
            correct_predictions += torch.sum((preds == labels) & mask)
            total_tokens += torch.sum(mask)

            # Collect predictions and labels for evaluation metrics
            all_preds.extend(preds[mask].cpu().numpy())
            all_labels.extend(labels[mask].cpu().numpy())

    accuracy = correct_predictions.double() / total_tokens

    return accuracy, sum(losses) / len(losses)


In [17]:
torch.cuda.empty_cache()

In [18]:
EPOCHS = 20  # Maximum number of epochs
PATIENCE = 5  # Number of epochs to wait for improvement
best_val_loss = float('inf')  # Initialize with a large value
early_stopping_counter = 0  # Tracks epochs without improvement

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Train for one epoch
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Train Token Accuracy: {train_acc:.4f}")

    # Evaluate on validation set
    val_acc, val_loss = eval_model(model, valid_loader, criterion, device)
    print(f"Validation Loss: {val_loss:.4f}, Validation Token Accuracy: {val_acc:.4f}")

    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0  # Reset counter if performance improves
        torch.save(model.state_dict(), 'models/best_model_state_token.bin')  # Save the best model
        print("Validation performance improved. Model saved.")
    else:
        early_stopping_counter += 1  # Increment counter if no improvement
        print(f"No improvement. Early stopping counter: {early_stopping_counter}/{PATIENCE}")

    # Stop training if early stopping criteria are met
    if early_stopping_counter >= PATIENCE:
        print("Early stopping triggered. Training stopped.")
        break

Epoch 1/20
----------
Train Loss: 0.0057, Train Token Accuracy: 1.0000
Validation Loss: 0.0038, Validation Token Accuracy: 1.0000
Validation performance improved. Model saved.
Epoch 2/20
----------
Train Loss: 0.0043, Train Token Accuracy: 1.0000
Validation Loss: 0.0029, Validation Token Accuracy: 1.0000
Validation performance improved. Model saved.
Epoch 3/20
----------
Train Loss: 0.0034, Train Token Accuracy: 1.0000
Validation Loss: 0.0023, Validation Token Accuracy: 1.0000
Validation performance improved. Model saved.
Epoch 4/20
----------
Train Loss: 0.0026, Train Token Accuracy: 1.0000
Validation Loss: 0.0018, Validation Token Accuracy: 1.0000
Validation performance improved. Model saved.
Epoch 5/20
----------
Train Loss: 0.0021, Train Token Accuracy: 1.0000
Validation Loss: 0.0014, Validation Token Accuracy: 1.0000
Validation performance improved. Model saved.
Epoch 6/20
----------
Train Loss: 0.0017, Train Token Accuracy: 1.0000
Validation Loss: 0.0012, Validation Token Accurac

In [22]:
# Load the best model for evaluation on the test set
model.load_state_dict(torch.load('models/best_model_state_token.bin'))
# Load the best model for evaluation on the test set
model.load_state_dict(torch.load('models/best_model_state_token.bin'))

test_acc, test_loss = eval_model(model, test_loader, criterion, device)
print(f'Test loss: {test_loss} | Test accuracy: {test_acc}')

# Initialize lists for predictions and true labels
all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        # Move inputs and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_labels = batch['token_labels'].to(device)  # Token-level labels

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, seq_len, num_labels)

        # Get predictions and flatten them
        preds = torch.argmax(logits, dim=-1)  # Shape: (batch_size, seq_len)
        flattened_preds = preds.view(-1).cpu().numpy()

        # Flatten labels and apply the attention mask
        flattened_labels = token_labels.view(-1).cpu().numpy()
        attention_mask_flat = attention_mask.view(-1).cpu().numpy()

        # Mask predictions and labels to exclude padding (-100)
        valid_preds = flattened_preds[flattened_labels != -100]
        valid_labels = flattened_labels[flattened_labels != -100]

        # Append valid predictions and labels for metrics
        all_preds.extend(valid_preds)
        all_labels.extend(valid_labels)

# Generate a classification report
print(classification_report(all_labels, all_preds, target_names=["Non-Hallucinated", "Hallucinated"]))

  model.load_state_dict(torch.load('models/best_model_state_token.bin'))
  model.load_state_dict(torch.load('models/best_model_state_token.bin'))


Test loss: 0.00010145309158057595 | Test accuracy: 1.0


ValueError: Number of classes, 1, does not match size of target_names, 2. Try specifying the labels parameter