In [1]:
import json
import os

# Canonical mapping from intent label name to integer class id.
label_to_id_dict = {
    "general_information": 0,
    "account_help": 1,
    "troubleshoot_product": 2,
    "lookup_report": 3,
    "lookup_person": 4,
}

# Inverse mapping (class id -> label name). It is derived from the forward
# table so the two can never drift apart when a label is added or renumbered.
id_to_label_dict = {class_id: name for name, class_id in label_to_id_dict.items()}

def load_dataset(file, label_map=None):
    """Read a JSON-Lines file of labelled sentences into a list of examples.

    Every non-blank line must be a JSON object with a "text" entry and a
    "label" entry; the label is translated to its class id via ``label_map``.

    Args:
        file: Path of the JSONL file to read.
        label_map: Optional mapping from label value to class id. When omitted,
            the notebook-level ``label_to_id_dict`` is used.

    Returns:
        A list of ``{"text": ..., "label": <class id>}`` dicts in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "text"/"label" or its label is not in
            the mapping.
    """
    if label_map is None:
        label_map = label_to_id_dict

    dataset = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            sent = json.loads(line)
            dataset.append(
                {
                    "text": sent["text"],
                    "label": label_map[sent["label"]],
                }
            )
    return dataset

In [2]:
# Load the training split first, then the evaluation split.
train_data, eval_data = (
    load_dataset(path)
    for path in (
        "../../data/7k_sentences_train.jsonl",
        "../../data/7k_sentences_eval.jsonl",
    )
)

In [3]:
from transformers import AutoTokenizer

# Tokenizer for the same "distilbert/distilbert-base-uncased" checkpoint that the
# model class in this notebook loads its encoder from.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [4]:
# Importing the libraries needed
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one labelled sentence per item.

    Args:
        dataset: Indexable, sized collection of records that each provide a
            "text" entry and a "label" entry.
        tokenizer: Callable tokenizer whose result exposes 'input_ids' and
            'attention_mask' (e.g. a Hugging Face tokenizer).
        max_len: Length that every example is padded / truncated to.
    """

    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Tokenize record ``index`` and return it as long tensors.

        Returns:
            A dict with 'ids' (token ids), 'mask' (attention mask) and
            'targets' (the record's label).
        """
        record = self.dataset[index]
        # Call the tokenizer directly: ``encode_plus`` is the legacy entry
        # point for the same operation. Token type ids are no longer requested
        # because nothing downstream reads them.
        encoded = self.tokenizer(
            record["text"],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(record["label"], dtype=torch.long),
        }

    def __len__(self):
        return len(self.dataset)

In [5]:
# Every example is padded / truncated to this many tokens by the dataset class.
MAX_LEN = 512

training_set = TextClassificationDataset(
    dataset=train_data, tokenizer=tokenizer, max_len=MAX_LEN
)
testing_set = TextClassificationDataset(
    dataset=eval_data, tokenizer=tokenizer, max_len=MAX_LEN
)

In [6]:
# Mini-batch sizes for the training and evaluation loaders.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2

# Training batches are shuffled; evaluation keeps the dataset order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [7]:
# Custom model: a DistilBERT encoder followed by a dense layer, ReLU, dropout, and a final linear layer that produces the class logits.

class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder with a small classification head.

    The head is Linear -> ReLU -> Dropout -> Linear and is applied to the
    encoder's hidden state at sequence position 0.

    Args:
        num_labels: Number of output classes (default 5).
        dropout: Dropout probability used inside the head (default 0.3).
        pretrained_name: Checkpoint handed to ``DistilBertModel.from_pretrained``.

    Submodule names (``l1``, ``pre_classifier``, ``dropout``, ``classifier``)
    are kept stable so previously saved state dicts still load.
    """

    def __init__(self, num_labels=5, dropout=0.3,
                 pretrained_name="distilbert/distilbert-base-uncased"):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        # Take the encoder width from its config rather than hard-coding 768,
        # so a checkpoint with a different hidden size still fits the head.
        hidden_size = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output (one row of logits per input sequence)."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_out[0]
        # Use the hidden state of the first token as the sequence summary.
        first_token = hidden_state[:, 0]
        # Functional ReLU avoids building a new module on every forward pass.
        features = torch.relu(self.pre_classifier(first_token))
        features = self.dropout(features)
        return self.classifier(features)

In [8]:
from torch import cuda
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Build the classifier and move it to the chosen device; the bare name on the
# last line keeps the module summary as the cell's displayed output.
model = DistillBERTClass().to(device)
model

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [9]:
# Step size handed to Adam.
LEARNING_RATE = 1e-5

# Cross-entropy on the model's raw outputs; Adam updates every model parameter.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [10]:
# Helper for the accuracy computation: counts the correct predictions in a batch

def calcuate_accu(big_idx, targets):
    """Count the positions where ``big_idx`` equals ``targets``.

    Returns the number of matches as a plain Python number (a count, not a
    percentage); callers divide by the number of examples themselves.
    """
    matches = big_idx == targets
    return matches.sum().item()

In [11]:
def validation(model, testing_loader, criterion=None, run_device=None, log_every=5000):
    """Evaluate ``model`` on ``testing_loader`` and return the accuracy in percent.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Module called as ``model(ids, mask)`` that returns one row of
            class scores per example.
        testing_loader: Iterable of batches; each batch is a dict with 'ids',
            'mask' and 'targets' tensors.
        criterion: Loss callable ``criterion(outputs, targets)``. Defaults to
            the notebook-level ``loss_function``.
        run_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.
        log_every: Print running loss/accuracy every this many batches
            (must be positive).

    Returns:
        Accuracy over all examples, as a percentage in [0, 100].

    Raises:
        ValueError: If ``testing_loader`` yields no examples (previously this
            surfaced as an unexplained ZeroDivisionError).
    """
    if criterion is None:
        criterion = loss_function
    if run_device is None:
        run_device = device

    model.eval()
    total_loss = 0.0
    n_correct = 0
    n_steps = 0
    n_examples = 0
    with torch.no_grad():
        for step, data in enumerate(testing_loader):
            ids = data['ids'].to(run_device, dtype=torch.long)
            mask = data['mask'].to(run_device, dtype=torch.long)
            targets = data['targets'].to(run_device, dtype=torch.long)

            outputs = model(ids, mask)
            total_loss += criterion(outputs, targets).item()
            # Predicted class = index of the largest score in each row.
            predictions = outputs.argmax(dim=1)
            n_correct += (predictions == targets).sum().item()

            n_steps += 1
            n_examples += targets.size(0)

            if step % log_every == 0:
                # The label reports the real step count; the old text claimed
                # "per 100 steps" although the interval was 5000.
                print(f"Validation Loss after {n_steps} steps: {total_loss / n_steps}")
                print(f"Validation Accuracy after {n_steps} steps: {(n_correct * 100) / n_examples}")

    if n_examples == 0:
        raise ValueError("validation() received a data loader with no examples")

    epoch_loss = total_loss / n_steps
    epoch_accu = (n_correct * 100) / n_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [12]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
from tqdm.notebook import tqdm

def train(epoch, log_every=5000):
    """Train the notebook-level ``model`` for one pass over ``training_loader``.

    Uses the notebook globals ``model``, ``training_loader``, ``optimizer``,
    ``loss_function`` and ``device``. After the pass, the model is evaluated
    with ``validation(model, testing_loader)``.

    Args:
        epoch: Epoch number, used only in the printed summary.
        log_every: Print running loss/accuracy every this many batches
            (must be positive).

    Returns:
        ``(model, acc)`` where ``acc`` is the value returned by ``validation``.

    Raises:
        ValueError: If ``training_loader`` yields no examples (previously this
            surfaced as an unexplained ZeroDivisionError).
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest score in each row.
        big_idx = outputs.argmax(dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        if step % log_every == 0:
            # These are training metrics; the old labels said "Validation ...
            # per 100 steps" although the interval was 5000.
            print(f"Training Loss after {nb_tr_steps} steps: {tr_loss / nb_tr_steps}")
            print(f"Training Accuracy after {nb_tr_steps} steps: {(n_correct * 100) / nb_tr_examples}")

        # Clear stale gradients, backpropagate this batch, then update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_examples == 0:
        raise ValueError("train() received a data loader with no examples")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    acc = validation(model, testing_loader)
    return model, acc

In [None]:
EPOCHS = 10

# torch.save does not create missing directories; make sure the target exists
# before training so a finished epoch is never lost to a failed save.
os.makedirs("./checkpoints", exist_ok=True)

for epoch in range(EPOCHS):
    model, acc = train(epoch)
    # One checkpoint per epoch, tagged with the validation accuracy (2 decimals).
    torch.save(model.state_dict(), f"./checkpoints/distilbert-7k-epoch-{epoch}-val-acc-{acc:0.2f}")

## Eval

In [14]:
# Evaluate the current model on the eval loader; validation() returns the
# accuracy already expressed in percent, hence the literal "%%" in the format.
acc = validation(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Validation Loss per 100 steps: 2.032283067703247
Validation Accuracy per 100 steps: 0.0
Validation Loss Epoch: 1.5748355927921476
Validation Accuracy Epoch: 34.523809523809526
Accuracy on test data = 34.52%


In [None]:
from transformers import AutoTokenizer

# Choose the device at run time instead of hard-coding "cuda", so this cell
# also works on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = DistillBERTClass()
# NOTE(review): the training loop writes the accuracy with two decimals
# ("...-val-acc-88.10"); confirm this file name matches what is on disk.
# map_location lets a checkpoint saved from a GPU run load without a GPU.
model.load_state_dict(
    torch.load(
        "checkpoints/distilbert-7k-epoch-6-val-acc-88.0952",
        map_location=device,
        weights_only=True,
    )
)
model.to(device)
model.eval()

In [18]:
def predict(text):
    """Return the predicted class id for a single sentence.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: One input sentence.

    Returns:
        The index of the largest model output, as a Python int.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length instead of passing an over-long sequence to the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Send the inputs to whatever device the model lives on. The previous
    # hard-coded "cpu" mismatched a model that had been moved to CUDA.
    model_device = next(model.parameters()).device
    inputs = inputs.to(model_device)
    with torch.no_grad():
        logits = model(**inputs)
    predicted_class_id = logits.argmax(dim=-1).item()
    return predicted_class_id

In [19]:
# Walk the eval split, print every misclassified sentence and tally the misses.
wrong = 0
for data in eval_data:
    text = data["text"]
    label = id_to_label_dict[data["label"]]
    prediction = id_to_label_dict[predict(text)]
    if prediction == label:
        continue

    wrong += 1
    print(f"Sentence: {text}")
    print("Predict:", prediction, "| Ground truth:", label)
    print("-"*80)

print("="*80)
print(f"Total wrong predictions: {wrong}/{len(eval_data)}")

Sentence: Show me reports about Hikvision.
Predict: lookup_person | Ground truth: lookup_report
--------------------------------------------------------------------------------
Sentence: Find the article on Axis cameras.
Predict: lookup_person | Ground truth: lookup_report
--------------------------------------------------------------------------------
Sentence: Get me the latest post on cybersecurity.
Predict: lookup_person | Ground truth: lookup_report
--------------------------------------------------------------------------------
Sentence: Give me reports on cloud computing.
Predict: lookup_person | Ground truth: lookup_report
--------------------------------------------------------------------------------
Sentence: Where is the report about AI trends?
Predict: lookup_person | Ground truth: lookup_report
--------------------------------------------------------------------------------
Sentence: Fetch the post on video surveillance.
Predict: lookup_person | Ground truth: lookup_repor

In [20]:
# Error rate: share of eval sentences whose prediction differed from the label.
# NOTE(review): the recorded output (~0.85) is far worse than the validation
# accuracy in the checkpoint's file name — confirm the intended weights were
# actually loaded before trusting this number.
wrong / len(eval_data)

0.8452380952380952