In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the three semicolon-delimited splits into DataFrames
train_data = pd.read_csv("train_data_base.csv", sep=';')
valid_data = pd.read_csv("valid_data_base.csv", sep=';')
test_data = pd.read_csv("test_data_base.csv", sep=';')


def _source_summary_text(row):
    """Render one DataFrame row as the two-line classifier input.

    Reads the row's 'src' and 'tgt' fields and joins them into a single
    string so that source and summary are classified together.
    """
    return f"Source: {row['src']}\nSummary: {row['tgt']}"


# Build one text per row (document-level classification) plus its label
train_texts = train_data.apply(_source_summary_text, axis=1).tolist()
train_labels = train_data['label'].tolist()

valid_texts = valid_data.apply(_source_summary_text, axis=1).tolist()
valid_labels = valid_data['label'].tolist()

# Tokenizer and two-class classification model come from the same checkpoint
checkpoint = "prajjwal1/bert-tiny"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Define a custom dataset class
class SummaryDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding=..., truncation=..., return_tensors=...)`` whose result
            is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is driven by the texts, not the labels
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'label'`` (a ``torch.long`` tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        # Tokenization happens lazily, on every access
        encoded = self.tokenizer(
            sample_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )

        # squeeze(0) removes the leading size-one dimension (presumably the
        # batch axis added by return_tensors="pt") so that the DataLoader
        # can stack samples itself.
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Wrap both splits in datasets; tokenization happens lazily per sample
train_dataset = SummaryDataset(train_texts, train_labels, tokenizer)
valid_dataset = SummaryDataset(valid_texts, valid_labels, tokenizer)

# Only the training batches are shuffled; validation is iterated in dataset order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

# Use the GPU when one is available, and move the model there before the
# optimizer is created from its parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the defaults of the transformers class (1e-6 and 0.0) so the
# update rule stays as close as possible to the previous behaviour; PyTorch's
# own defaults would otherwise be 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Fine-tune for a fixed number of passes, validating after each pass
epochs = 3
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move the batch to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # With `labels` supplied, the output carries the loss next to the logits
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss / len(train_loader):.4f}")

    # ---- validation pass ----
    # The accumulators are reset every epoch, so after the loop they hold
    # the results of the final epoch only.
    model.eval()
    valid_loss = 0
    valid_preds, valid_labels_true = [], []
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            valid_loss += outputs.loss.item()

            # Predicted class = index of the largest logit in each row
            valid_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            valid_labels_true.extend(labels.cpu().numpy())

    accuracy = accuracy_score(valid_labels_true, valid_preds)
    print(f"Validation Loss: {valid_loss / len(valid_loader):.4f}, Accuracy: {accuracy:.4f}")

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained("bert_tiny_finetuned")
model.save_pretrained("bert_tiny_finetuned")
tokenizer.save_pretrained("bert_tiny_finetuned")

# Evaluate with classification report.
# valid_labels_true / valid_preds hold the results of the last validation pass.
# `labels=[0, 1]` pins class id 0 to 'Real' and 1 to 'Hallucinated'; without
# it scikit-learn infers the classes from the data and raises a ValueError
# when only one class shows up across the labels and predictions, because
# the number of classes no longer matches the two target names.
print(
    classification_report(
        valid_labels_true,
        valid_preds,
        labels=[0, 1],
        target_names=['Real', 'Hallucinated'],
    )
)
