In [1]:
import json

label_to_id_dict = {
    "general_information": 0,
    "account_help": 1,
    "troubleshoot_product": 2,
    "lookup_report": 3,
    "lookup_person": 4,
}

id_to_label_dict = {
    0: "general_information",
    1: "account_help",
    2: "troubleshoot_product",
    3: "lookup_report",
    4: "lookup_person",
}

def load_dataset(file):
    data, dataset = [], []
    with open(file, "r") as f:
        for line in f:
            data.append(json.loads(line))
    for sent in data:
        dataset.append(
            {
                "text": sent["text"], 
                "label": label_to_id_dict[sent["label"]],
            }
        )
    return dataset

In [2]:
train_data = load_dataset("../eda/7k_sentences_train.jsonl")
eval_data = load_dataset("../eda/7k_sentences_eval.jsonl")

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [4]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# import numpy as np

# def compute_accuracy(predictions, labels):
#     acc_preds = 0
#     for pred, label in zip(predictions, labels):
#         if pred == label:
#             acc_preds += 1
#     return round(acc_preds / len(predictions), 4)

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return compute_accuracy(predictions=predictions, references=labels)

In [24]:
# Importing the libraries needed
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

class TextClassificationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        data = self.dataset[index]
        text = data["text"]
        label = data["label"]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        } 
    
    def __len__(self):
        return len(self.dataset)

In [25]:
MAX_LEN=512
training_set = TextClassificationDataset(train_data, tokenizer, MAX_LEN)
testing_set = TextClassificationDataset(eval_data, tokenizer, MAX_LEN)

In [26]:
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [18]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [19]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [20]:
LEARNING_RATE = 1e-05

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [21]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [28]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
from tqdm.notebook import tqdm

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    i = 0
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if i%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    i += 1

    return 

In [None]:
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 0/1925 [00:00<?, ?it/s]

Training Loss per 5000 steps: 0.3291453719139099
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.3398304432630539
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.5230689148108164
Training Accuracy per 5000 steps: 91.66666666666667
Training Loss per 5000 steps: 0.6216069832444191
Training Accuracy per 5000 steps: 87.5
Training Loss per 5000 steps: 0.5631775975227356
Training Accuracy per 5000 steps: 90.0
Training Loss per 5000 steps: 0.5144612739483515
Training Accuracy per 5000 steps: 91.66666666666667
Training Loss per 5000 steps: 0.48559769136565073
Training Accuracy per 5000 steps: 92.85714285714286
Training Loss per 5000 steps: 0.5266474820673466
Training Accuracy per 5000 steps: 90.625
Training Loss per 5000 steps: 0.6487840579615699
Training Accuracy per 5000 steps: 86.11111111111111
Training Loss per 5000 steps: 0.6088391542434692
Training Accuracy per 5000 steps: 87.5
Training Loss per 5000 steps: 0.6307698054747148
Training Accur

## Eval

In [None]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [None]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)