In [4]:
import json
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import random

In [20]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        text = item['Utterance']
        label = int(item['Instruction'])
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(file_path):
    with open(file_path, 'r') asº f:
        data = json.load(f)
    return data

train_data = load_data('../data/clean_dataset/train_set.json')
dev_data = load_data('../data/clean_dataset/dev_set.json')
test_data = load_data('../data/clean_dataset/test_set.json')

tokenizer = BertTokenizer.from_pretrained('gaunernst/bert-tiny-uncased', do_lower_case=True)
train_dataset = CustomDataset(train_data, tokenizer, max_length=128)
dev_dataset = CustomDataset(dev_data, tokenizer, max_length=128)
test_dataset = CustomDataset(test_data, tokenizer, max_length=128)


In [23]:
train_dataset[0]

{'text': 'which is the most common use of opt-in e-mail marketing',
 'input_ids': tensor([  101,  2029,  2003,  1996,  2087,  2691,  2224,  1997, 23569,  1011,
          1999,  1041,  1011,  5653,  5821,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,

In [3]:
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [7]:
model = BertForSequenceClassification.from_pretrained('gaunernst/bert-tiny-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gaunernst/bert-tiny-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()


    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [9]:
def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [10]:
#optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-5)
total_steps = len(train_loader) * 3  # 3 epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

best_accuracy = 0

for epoch in range(3):
    print(f'Epoch {epoch + 1}/{3}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        dev_loader,
        device
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_state.bin')




Epoch 1/3
----------
Train loss 0.03390181297149902 accuracy 0.9923412893700787
Val   loss 0.30487623300786665 accuracy 0.943795326349718
Epoch 2/3
----------
Train loss 0.006629052983244467 accuracy 0.9986466535433071
Val   loss 0.4129971194695277 accuracy 0.9329170024174053
Epoch 3/3
----------
Train loss 0.0041403455769201585 accuracy 0.9992310531496063
Val   loss 0.5052371510052589 accuracy 0.9214343271555197


In [11]:
model.load_state_dict(torch.load('best_model_state.bin'))
test_acc, _ = eval_model(model, test_loader, device)
print(f'Test Accuracy: {test_acc}')

  model.load_state_dict(torch.load('best_model_state.bin'))


Test Accuracy: 0.9507260176148536


In [12]:
def predict(text, model, tokenizer, device):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output.logits, dim=1)

    return prediction.item()


In [19]:
# Example prediction
example_text = "instruction"
print(f'Text: {example_text}')
print(f'Prediction: {"Instruction" if predict(example_text, model, tokenizer, device) else "Not Instruction"}')

Text: instruction
Prediction: Not Instruction
