In [83]:
import pandas as pd
import torch
from torch.optim import AdamW  # Import PyTorch's AdamW

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

In [84]:
# Colab-only helper used below to mount Google Drive into the runtime.
from google.colab import drive
import os

# Mount Google Drive at /content/drive so its files are reachable by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [85]:
# Make the project folder on Drive the working directory; the relative data
# paths used when loading the CSV files resolve against it.
os.chdir('/content/drive/MyDrive/ADL-Hallucination-Detection')

In [86]:
# Load the datasets (';'-separated CSV files, one per split).
train_data = pd.read_csv("cnndm/train_data_base.csv", sep=';')
# Validation and test data must come from their own files. Reading the training
# file for all three splits makes the reported validation/test metrics, and the
# best-checkpoint selection based on them, a measure of training-set fit only.
valid_data = pd.read_csv("cnndm/valid_data_base.csv", sep=';')
test_data = pd.read_csv("cnndm/test_data_base.csv", sep=';')


In [87]:
# Per-column count of missing values, then the rows that contain at least one.
na_mask = train_data.isna()
print(na_mask.sum())
missing_rows = train_data[na_mask.any(axis=1)]


Unnamed: 0    0
article       0
highlights    0
label         0
dtype: int64


In [75]:
#missing_rows

Unnamed: 0.1,Unnamed: 0,article,highlights,label


In [88]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0.1,Unnamed: 0,article,highlights,label
0,1,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,0
1,2,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,0
2,3,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",0
3,4,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",0
4,5,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",0


In [77]:
# Load the tokenizer and the 2-label classifier from the same checkpoint
MODEL_NAME = "prajjwal1/bert-tiny"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [111]:
class CustomDataset(Dataset):
    """Document/summary pairs encoded as fixed-length BERT inputs.

    Every row of ``data`` must provide the columns ``article``, ``highlights``
    and ``label``. Each row is encoded as::

        [CLS] article tokens [SEP] summary tokens [SEP] [PAD]...

    The article is never truncated. A summary that does not fit in the space
    left after the article is split into consecutive chunks, and each chunk
    becomes its own example carrying the row's label. Rows whose article leaves
    no room for even one summary token are skipped and counted in
    ``skipped_count``. Rows whose summary tokenizes to nothing produce no
    example and are not counted.

    Args:
        data: pandas DataFrame with ``article``, ``highlights``, ``label``.
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` attributes.
        max_length: exact length of every encoded sequence.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []
        self.skipped_count = 0

        self._create_examples()

    def _create_examples(self):
        """Tokenize every row and (re)build ``self.examples``."""
        # Start from a clean slate so a second call rebuilds instead of appending.
        self.examples = []
        self.skipped_count = 0

        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        pad_id = self.tokenizer.pad_token_id
        to_ids = self.tokenizer.convert_tokens_to_ids

        # Iterating the three columns directly avoids building a Series per row.
        rows = zip(self.data['article'], self.data['highlights'], self.data['label'])
        for doc, summ, label in rows:
            # Tokenize the document and summary
            doc_tokens = self.tokenizer.tokenize(doc)
            summ_tokens = self.tokenizer.tokenize(summ)

            # Space left for summary tokens: [CLS] doc_tokens [SEP] summ_tokens [SEP]
            available_for_summary = self.max_length - len(doc_tokens) - 3
            if available_for_summary <= 0:
                # Skip examples where the document alone exceeds max_length
                self.skipped_count += 1
                continue

            # The document part is identical for every chunk of this row, so it
            # is converted once instead of once per chunk.
            prefix_ids = [cls_id] + to_ids(doc_tokens) + [sep_id]

            for start in range(0, len(summ_tokens), available_for_summary):
                chunk = summ_tokens[start:start + available_for_summary]

                input_ids = prefix_ids + to_ids(chunk) + [sep_id]
                # A chunk never exceeds the available space, so this is >= 0.
                pad_length = self.max_length - len(input_ids)

                # Real tokens are attended to (1); padding is masked out (0).
                attention_mask = [1] * len(input_ids) + [0] * pad_length
                input_ids = input_ids + [pad_id] * pad_length

                self.examples.append({
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "label": label
                })

    def __len__(self):
        """Number of encoded examples (summary chunks), not of source rows."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``torch.long`` tensors."""
        example = self.examples[idx]
        return {
            "input_ids": torch.tensor(example["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(example["attention_mask"], dtype=torch.long),
            "label": torch.tensor(example["label"], dtype=torch.long)
        }


In [112]:
# Parameters
MAX_LEN = 512
BATCH_SIZE = 16

# Prepare datasets: every split shares the same tokenizer and sequence length
dataset_kwargs = dict(tokenizer=tokenizer, max_length=MAX_LEN)
train_dataset = CustomDataset(train_data, **dataset_kwargs)
valid_dataset = CustomDataset(valid_data, **dataset_kwargs)
test_dataset = CustomDataset(test_data, **dataset_kwargs)

# Create DataLoaders: only the training split is shuffled
loader_kwargs = dict(batch_size=BATCH_SIZE)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


In [113]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model first so the optimizer is built over the parameters on their
# final device.
model.to(device)
print(device)

# Optimizer
# Name torch.optim.AdamW explicitly: the import cell binds the bare name `AdamW`
# twice (torch.optim, then transformers) and the later binding wins, so the bare
# name does not refer to PyTorch's implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Loss function
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_epoch(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors; must expose ``.dataset``
            (its length is the accuracy denominator).
        optimizer: optimizer over the model's parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``len(data_loader.dataset)``). ``mean_loss`` is
        the Python-float mean of the per-batch losses.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.train()
    losses = []
    correct_predictions = 0

    # Clear gradients that an interrupted earlier run may have left behind
    # (backward done, zero_grad not reached); otherwise they would be added
    # into the first update of this epoch.
    optimizer.zero_grad()

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        losses.append(loss.item())

        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        correct_predictions += torch.sum(preds == labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if not losses:
        # Without this, the return line fails with an opaque AttributeError
        # because `correct_predictions` is still the int 0.
        raise ValueError("data_loader yielded no batches; cannot compute accuracy or loss")

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(losses)




In [114]:
def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors; must expose ``.dataset``
            (its length is the accuracy denominator).
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``len(data_loader.dataset)``). ``mean_loss`` is
        the Python-float mean of the per-batch losses.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()
    losses = []
    correct_predictions = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            losses.append(loss.item())

            # Predicted class = index of the largest logit in each row.
            preds = logits.argmax(dim=1)
            correct_predictions += torch.sum(preds == labels)

    if not losses:
        # Without this, the return line fails with an opaque AttributeError
        # because `correct_predictions` is still the int 0.
        raise ValueError("data_loader yielded no batches; cannot compute accuracy or loss")

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(losses)


In [115]:
EPOCHS = 3
best_accuracy = 0

for epoch in range(EPOCHS):
    # Epoch banner
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training loader
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Train loss: {train_loss} | Train accuracy: {train_acc}')

    # Score the updated model on the validation loader
    val_acc, val_loss = eval_model(model, valid_loader, criterion, device)
    print(f'Validation loss: {val_loss} | Validation accuracy: {val_acc}')

    # Checkpoint only when validation accuracy strictly improves
    if not val_acc > best_accuracy:
        continue
    best_accuracy = val_acc
    torch.save(model.state_dict(), 'best_model_state.bin')


Epoch 1/3
----------
Train loss: 0.6129013518534744 | Train accuracy: 0.6867244829886591


KeyboardInterrupt: 

In [None]:
# Restore the best checkpoint written by the training loop. map_location remaps
# the saved tensors onto the current device, so a checkpoint saved from a GPU
# session still loads when this session only has a CPU.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))

test_acc, test_loss = eval_model(model, test_loader, criterion, device)
print(f'Test loss: {test_loss} | Test accuracy: {test_acc}')

# Classification report: collect per-example predictions and labels
all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().tolist())
        # Labels are only compared on the host, so they are not moved to the device.
        all_labels.extend(batch['label'].tolist())

print(classification_report(all_labels, all_preds))
