In [42]:
import json
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import random

In [43]:
label_mapping = {'False': 0, 'True': 1}
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        text = item['Utterance']
        label = item['Instruction']
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label_mapping[label], dtype=torch.long)
        }

def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

train_data = load_data('../data/clean_dataset/train_set.json')
dev_data = load_data('../data/clean_dataset/dev_set.json')
test_data = load_data('../data/clean_dataset/test_set.json')

tokenizer = BertTokenizer.from_pretrained('gaunernst/bert-tiny-uncased', do_lower_case=True)
train_dataset = CustomDataset(train_data, tokenizer, max_length=128)
dev_dataset = CustomDataset(dev_data, tokenizer, max_length=128)
test_dataset = CustomDataset(test_data, tokenizer, max_length=128)


In [45]:
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [46]:
model = BertForSequenceClassification.from_pretrained('gaunernst/bert-tiny-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gaunernst/bert-tiny-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()


    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [48]:
def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [49]:
#optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-8)
total_steps = len(train_loader) * 4  # 3 epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

best_accuracy = 0

for epoch in range(4):
    print(f'Epoch {epoch + 1}/{4}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        dev_loader,
        device
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_state.bin')




Epoch 1/3
----------
Train loss 0.09100570865493363 accuracy 0.9795460137795275
Val   loss 0.012798317657315216 accuracy 0.9977546751968503
Epoch 2/3
----------
Train loss 0.010139924208917765 accuracy 0.9980622539370079
Val   loss 0.0069673043521837055 accuracy 0.99876968503937
Epoch 3/3
----------
Train loss 0.0050993144080803895 accuracy 0.9988927165354331
Val   loss 0.003338882358612742 accuracy 0.9993848425196851
Epoch 4/3
----------
Train loss 0.0035769417067287547 accuracy 0.9993848425196851
Val   loss 0.0027863462205925146 accuracy 0.9994771161417323


In [50]:
model.load_state_dict(torch.load('best_model_state.bin'))
test_acc, _ = eval_model(model, test_loader, device)
print(f'Test Accuracy: {test_acc}')

  model.load_state_dict(torch.load('best_model_state.bin'))


Test Accuracy: 0.9994771161417323


In [51]:
def predict(text, model, tokenizer, device):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output.logits, dim=1)

    return prediction.item()


In [55]:
# Example prediction
example_text = "set a timer at 13:00"
print(f'Text: {example_text}')
print(f'Prediction: {"Instruction" if predict(example_text, model, tokenizer, device) == 1 else "Not Instruction"}')

Text: set a timer at 13:00
Prediction: Not Instruction
