In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json



In [None]:
def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the resulting object."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load the dataset.
# NOTE(review): downstream code indexes each entry with ['statement'], so this
# is presumably a list of dicts with a 'statement' key -- confirm against the file.
data = _read_json('filtered_utterances_ft_data.json')

# Load the lists of pragmatic markers (each is later converted to a set of
# strings and compared against individual tokenizer tokens).
hedging_markers = _read_json('hedging_markers.json')
authority_markers = _read_json('authority_markers.json')

# Define label mapping: tag name -> class id, plus the inverse id -> tag name.
label_dict = {'O': 0, 'B-authority': 1, 'I-authority': 2, 'B-hedge': 3, 'I-hedge': 4}
label_dict_inv = {v: k for k, v in label_dict.items()}

In [None]:
# Prepare the dataset for token classification
class UtteranceTokenDataset(Dataset):
    """Token-classification dataset that tags hedging / authority markers.

    Each item is the tokenizer encoding of ``entry['statement']`` (padded or
    truncated to ``max_length``) with an added ``'labels'`` list of the same
    length as ``'input_ids'``.

    Labelling is done per tokenizer token by exact membership in the marker
    sets: a token found in ``hedging_markers`` gets ``'B-hedge'``, otherwise a
    token found in ``authority_markers`` gets ``'B-authority'``, otherwise
    ``'O'``. Special tokens and padding get ``IGNORE_INDEX``.

    NOTE(review): because matching is per token, markers that tokenize into
    more than one token never match, and the ``'I-*'`` tags are never emitted.
    Markers are compared verbatim; with an uncased tokenizer they presumably
    need to be lower-case already -- confirm against the marker files.

    Args:
        entries: Indexable collection of mappings with a ``'statement'`` string.
        hedging_markers: Iterable of hedging marker strings.
        authority_markers: Iterable of authority marker strings.
        tokenizer: Optional fast tokenizer to use. Defaults to
            ``BertTokenizerFast.from_pretrained('bert-base-uncased')``.
        label_map: Optional mapping from tag name to class id. Defaults to the
            module-level ``label_dict``.
        max_length: Length every encoding is padded / truncated to.
    """

    # Label value assigned to positions that must not contribute to the loss
    # (matches the default ``ignore_index`` of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, entries, hedging_markers, authority_markers,
                 tokenizer=None, label_map=None, max_length=512):
        self.entries = entries
        self.hedging_markers = set(hedging_markers)
        self.authority_markers = set(authority_markers)
        if tokenizer is None:
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.label_map = label_dict if label_map is None else label_map
        self.max_length = max_length

    def __len__(self):
        """Return the number of utterances in the dataset."""
        return len(self.entries)

    def _tag_for(self, token):
        """Return the tag name for a single tokenizer token (hedge wins ties)."""
        if token in self.hedging_markers:
            return 'B-hedge'
        if token in self.authority_markers:
            return 'B-authority'
        return 'O'

    def __getitem__(self, idx):
        """Encode entry ``idx`` and attach one label id per encoded position.

        Returns:
            The tokenizer's encoding with an extra ``'labels'`` entry whose
            length equals that of ``'input_ids'``.
        """
        entry = self.entries[idx]

        # Tokenize exactly once, from the raw string, so that the labels are
        # derived from the very same token sequence the model will see
        # (including special tokens, truncation and padding).
        encoding = self.tokenizer(
            entry['statement'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        tokens = self.tokenizer.convert_ids_to_tokens(encoding['input_ids'])

        # word_ids() is None for positions that do not come from the input
        # text (special tokens and padding); those are masked out of the loss.
        label_ids = [
            self.IGNORE_INDEX if word_id is None else self.label_map[self._tag_for(token)]
            for token, word_id in zip(tokens, encoding.word_ids())
        ]

        encoding['labels'] = label_ids
        return encoding

In [None]:
# Single seed shared by the train/test split, the classifier-head
# initialisation and the Trainer, so Restart & Run All is reproducible.
SEED = 42

# Convert the data into a PyTorch Dataset
dataset = UtteranceTokenDataset(data, hedging_markers, authority_markers)

# 80/20 train/test split. A dedicated, seeded generator keeps the partition
# identical across runs instead of depending on the global RNG state.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Load BERT for token classification. The token-classification head is newly
# initialised, so seed torch first; the label maps are stored in the model
# config so predictions can be decoded back to tag names.
torch.manual_seed(SEED)
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    id2label=label_dict_inv,
    label2id=label_dict,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    seed=SEED,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model on the held-out split; as the last expression of the
# cell, the returned metrics are displayed by the notebook.
trainer.evaluate()