In [47]:
from transformers import RobertaTokenizer
from transformers import RobertaTokenizer, RobertaModel
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import time
import math

from sklearn.preprocessing import normalize
import pandas as pd
from tqdm import tqdm
from torch import nn
import random as rnd
import torch
import ast
import numpy as np
import re

In [48]:

def extract_sentences():

    # Extract src data
    source_data = []
    top_data = []
    top_decoupled_data = []

    # Read source.txt
    with open('../dataset/source.txt', 'r') as file:
        source_data = ast.literal_eval(file.read())

    # Read labels.txt
    with open("../dataset/input_labels.txt", 'r') as file:
        labels = file.read().splitlines()

    return source_data, labels

In [49]:
def read_unique_labels(file_path):
    labels = []
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

In [36]:
def read_dev_dataset(file_path):

    with open(file_path, 'r') as file:
        text = file.read()
        
    pattern_src = r'(?<="dev\.SRC": ").+(?=", "dev\.EXR")'
    pattern_top = r'(?<="dev\.TOP": ").+(?=", "dev\.PCFG_ERR")'
    test_src = re.finditer(pattern_src, text)
    test_top = re.finditer(pattern_top, text)
    test_src_arr = []
    test_top_decoupled_arr = []

    for match in test_src:
        test_src_arr.append(match.group())
    
    pattern_top_decoupled = r'(?<=\))[\w ]*(?= \()|(?<=ORDER)[\w ]*(?= \()|(?<=PIZZAORDER)[\w ]*(?= \()|(?<=DRINKORDER)[\w ]*(?= \()'
    for match in test_top:
        temp = re.sub(pattern_top_decoupled,'',match.group())
        test_top_decoupled_arr.append(temp)

    return test_src_arr, test_top_decoupled_arr

#read_dev_dataset("../dataset/PIZZA_dev.json")

In [68]:
def parse_tc(train_SRC,train_TOP):
    
    def parse_sexp(s):
        s = s.replace('(', ' ( ').replace(')', ' ) ')
        tokens = s.split()
        def helper(tokens):
            token = tokens.pop(0)
            if token == '(':
                L = []
                while tokens[0] != ')':
                    L.append(helper(tokens))
                tokens.pop(0)
                return L
            else:
                return token
        return helper(tokens.copy())

    tree = parse_sexp(train_TOP)

    entities = []

    def extract_entities(tree, current_label=None, text_accumulator=[]):
        if isinstance(tree, list):
            label = tree[0]
            content = tree[1:]
            text = []
            for item in content:
                extract_entities(item, label, text)
            entity_text = ' '.join(text)
            #if label in ['ORDER', 'PIZZAORDER', 'NOT'] or label not in ['NUMBER']:
            match = re.search(re.escape(entity_text), train_SRC)
            if match:
                if label == "NOT":
                    temp_entity = entities.pop()
                    entities.append({
                    'label': label+"-"+temp_entity['label'],
                    'word': match.group(),
                    })

                else:
                    entities.append({
                        'label': label,
                        'word': match.group(),
                    })
            text_accumulator.extend(text)
        else:
            text_accumulator.append(tree)

    extract_entities(tree)

    result = {
        'sentence': train_SRC,
        'entities': entities
    }
    #print(result)
    return result

def generate_bio_tags(sentence, entities):
    
    words = sentence.split()
    bio_tags = ["0"] * len(words)  
    
    
    for entity in entities:
        label = entity['label'] 
        entity_words = entity['word'].split()  
        
        
        if label in ['PIZZAORDER', 'ORDER', 'DRINKORDER','COMPLEX_TOPPING']:
            continue
        
        for i in range(len(words)):
            if words[i:i+len(entity_words)] == entity_words:
                bio_tags[i] = f"B-{label}"  
                for j in range(1, len(entity_words)):
                    bio_tags[i+j] = f"I-{label}"  
                break  
    
    return list(zip(words, bio_tags))

In [None]:
def create_test_labels_input():
    longest_sentence = 0
    unique_words = set()
    result = []
    tags = []
    ut_labels = read_unique_labels('./unique_labels.txt')
    t_labels = {}
    t_labels['0'] = 0
    for i in range(len(ut_labels)):
        t_labels[ut_labels[i]] = i+1
    
    test_SRC, test_TOP_DECOUPLED = read_dev_dataset("../dataset/PIZZA_dev.json")
    test_SRC_size = len(test_SRC)
    print(test_SRC[0])
    print(len(test_SRC))
    print(test_TOP_DECOUPLED[0])
    print(len(test_TOP_DECOUPLED))

    with open('../dataset/test_input_labels.txt', 'w') as f:
        for i in range(test_SRC_size):
            test_SRC_item = test_SRC[i]
            test_TOP_DECOUPLED_item = test_TOP_DECOUPLED[i]
            longest_sentence = max(len(test_SRC_item.split()), longest_sentence)    
            unique_words.update(test_SRC_item.split())            

            result.append(parse_tc(test_SRC_item,test_TOP_DECOUPLED_item))

            tags.append(generate_bio_tags(result[i]['sentence'], result[i]['entities']))

            test_SRC_labels_list = []
            for word, tag in tags[i]:
                test_SRC_labels_list.append(tag)
                f.write(f"{tag} ")
            f.write("\n")
    print("longest sentence:",longest_sentence)

create_test_labels_input()


i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage
348
(ORDER (PIZZAORDER (NUMBER two ) (SIZE medium ) (TOPPING sausage ) (TOPPING black olives ) ) (PIZZAORDER (NUMBER two ) (SIZE medium ) (TOPPING pepperoni ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) (PIZZAORDER (NUMBER three ) (SIZE large ) (TOPPING pepperoni ) (TOPPING sausage ) ) )
348
longest sentence: 33


KeyError: 

In [46]:
def extract_word2vec_embeddings(data, device='cuda'):
    """
    Extract Word2Vec embeddings
    """
    sentences = [sentence.split() for sentence in data]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    
    # Convert to GPU tensor if possible
    embeddings = []
    for sentence in sentences:
        # Get word vectors, use zeros if word not in vocabulary
        sent_embedding = [model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)]
        embedding = torch.tensor(np.mean(sent_embedding, axis=0), device=device)
        embeddings.append(embedding)
    
    return torch.stack(embeddings)

def extract_contextual_embeddings(data, device='cuda'):
    """
    Extract RoBERTa contextual embeddings
    """
    # Move model to GPU if available
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base').to(device)
    
    embeddings = []
    for sentence in data:
        # Tokenize and move to GPU
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling and move back to CPU if needed
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        embeddings.append(embedding.cpu().numpy())
    
    return np.array(embeddings)

def extract_lda_features(data, n_topics=2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_features = lda.fit_transform(tfidf_matrix)
    return lda_features

In [51]:
# def extract_word2vec_embeddings(data):
#     sentences = [sentence.split() for sentence in data]
#     model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#     embeddings = [np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in sentences]
#     return np.array(embeddings)


# def extract_contextual_embeddings(data):
#     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#     model = RobertaModel.from_pretrained('roberta-base')
#     embeddings = []
#     for sentence in data:
#         inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
#     return np.array(embeddings)

# def extract_lda_features(data, n_topics=2):
#     """Extract LDA features."""
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(data)
#     lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
#     lda_features = lda.fit_transform(tfidf_matrix)
#     return lda_features


In [52]:
def lstm_embedding(data):
    word2vec_embeddings = extract_word2vec_embeddings(data)
    lda_features = extract_lda_features(data)
    contextual_embeddings = extract_contextual_embeddings(data)

    word2vec_embeddings = word2vec_embeddings.cpu().numpy()
    combined_features = np.hstack((word2vec_embeddings, lda_features, contextual_embeddings))
    return combined_features

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################

def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [53]:
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f"{hours}h {minutes}m {seconds}s"

In [54]:


class SimulatedAnnealingLRScheduler:
    def __init__(self, 
                 initial_lr=0.01, 
                 min_lr=1e-5, 
                 max_lr=1e-3, 
                 batch_num=100, 
                 temperature_init=1.0, 
                 cooling_rate=0.95):
        """
        Simulated Annealing Learning Rate Scheduler
        
        Parameters:
        - initial_lr: Starting learning rate
        - min_lr: Minimum learning rate
        - max_lr: Maximum learning rate
        - total_epochs: Total training epochs
        - temperature_init: Initial temperature
        - cooling_rate: Rate of temperature decrease
        """
        self.initial_lr = initial_lr
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.batch_num = batch_num
        self.temperature_init = temperature_init
        self.cooling_rate = cooling_rate
        
        # Dynamic learning rate tracking
        self.current_lr = initial_lr
    
    def get_lr(self, batch_num):
        """
        Compute learning rate based on simulated annealing
        
        Parameters:
        - batch_num: Current batch number
        
        Returns:
        - Adjusted learning rate
        """
        # Calculate current temperature
        current_temp = self.temperature_init * (self.cooling_rate ** batch_num)
        
        # Simulated annealing learning rate calculation
        # Smoothly transitions from exploration to exploitation
        new_lr = (
            self.max_lr * (1-math.exp(-current_temp)) + 
            self.min_lr * math.exp(-current_temp)
        )
        
        # Ensure learning rate stays within bounds
        new_lr = max(self.min_lr, min(new_lr, self.max_lr))
        
        self.current_lr = new_lr
        return new_lr


In [55]:
#LSTM model
class NER(nn.Module):
    def __init__(self, output_dim, hidden_size=512, embedding_dim=768,num_layers=1,dropout=0.1,bidirectional=True,batch_first=True,use_attention=False):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - output_dim: output dimension
        - hidden_size: the hidden size of the LSTM layer
        - num_layers: the number of LSTM layers
        - dropout: the dropout rate
        - bidirectional: whether to use bidirectional LSTM
        """
        super(NER, self).__init__()
        
        ## Word embedding layer        
        if bidirectional:
            hidden_size = hidden_size // 2
        # LSTM layer with combined embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size,num_layers=num_layers, 
                            batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
        self.use_attention = use_attention
        if use_attention:
            if bidirectional:
                hidden_size *= 2
            self.attention = nn.ReLU()
        # Linear layer
        self.output_layer = nn.Linear(hidden_size, output_dim)

    def forward(self, embeddings):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        # LSTM and linear layers
        if embeddings.dim() == 2:
            embeddings = embeddings.unsqueeze(1)
        lstm_out, _ = self.lstm(embeddings)
        # Apply attention if needed

        if self.use_attention:
            # Compute attention weights
            attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
            # Apply weighted sum
            context = torch.sum(lstm_out * attention_weights, dim=1)
        else:
            # Use the last output if no attention
            context = lstm_out[:, -1, :]

        final_output = self.output_layer(context)
        
        return final_output
  
def train(model, train_dataset,norm_clip = 0.1, batch_size=512, epochs=5, learning_rate=0.00005 , save_path="models_lstm/model_epoch_{}.pth"):
    """
    This function implements the training logic
    Inputs:
    - model: the model ot be trained
    - train_dataset: the training set of type NERDataset
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    ############################## TODO: replace the Nones in the following code ##################################
    
    # (1) create the dataloader of the training set (make the shuffle=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = nn.CrossEntropyLoss()

    sa_scheduler = SimulatedAnnealingLRScheduler(
        initial_lr=0.1, 
        min_lr=1e-5, 
        max_lr=5e-2, 
        batch_num=len(train_dataloader) * epochs,
    )
    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=sa_scheduler.initial_lr)
    epoch_times = []
    batch_times = []

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print("CUDA is not available. Training on CPU ...")
    
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        start_time = time.time()
        batches_prev = epoch_num * len(train_dataloader)
        batch_idx = 0
        for train_input, train_label in tqdm(train_dataloader):
            current_lr = sa_scheduler.get_lr(batches_prev + batch_idx)
        
                # Update optimizer learning rate
            for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            batch_start_time = time.time()

            # (4) move the train input to the device
            embeddings = lstm_embedding(train_input)
            embeddings_tensor = torch.tensor(embeddings).float()
            embeddings_tensor = embeddings_tensor.to(device)

            train_label = train_label.float()
            train_label = train_label.to(device)
            
            # (5) move the train label to the device

            # (6) do the forward pass
            output = model(embeddings_tensor)
            # output = output.permute(0, 2, 1) 

            batch_loss = criterion(output.reshape(-1), train_label.view(-1))

            # (8) append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # (9) calculate the batch accuracy (just add the number of correct predictions)

            train_label = train_label.permute(1, 0) 
            acc = torch.sum(torch.argmax(output, dim=-1) == train_label)
            total_acc_train += acc

            # (10) zero your gradients
            optimizer.zero_grad()

            # (11) do the backward pass
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), norm_clip) 
            # (12) update the weights with your optimizer
            optimizer.step()
            # Calculate batch time
            print(
            f'Epochs: {epoch_num + 1} | Train Loss: {batch_loss.item()} \
            | Train Accuracy: {acc/(train_dataset.y_tensor.size(1)*batch_size)}\n')
            print(f"Current Learning Rate: {current_lr:.6f}")

            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            # Calculate remaining time
            avg_batch_time = sum(batch_times) / len(batch_times)
            remaining_batches = (epochs - epoch_num - 1) * len(train_dataloader) + (len(train_dataloader) - batch_idx - 1)
            remaining_time = avg_batch_time * remaining_batches

            # Print batch metrics and remaining time
            print(f"Epoch {epoch_num+1}/{epochs}, Batch {batch_idx+1}/{len(train_dataloader)}")#, Loss: {loss.item()}, Accuracy: {(predicted == train_label).sum().item() / train_label.numel()}")
            print(f"Time for batch {batch_idx+1}: {batch_time:.2f} seconds")
            print(f"Estimated remaining time: {format_time(remaining_time)} seconds")
            batch_idx += 1
        ##############################################################################################################    
        # epoch loss
        epoch_loss = total_loss_train / len(train_dataset)
        epoch_acc = total_acc_train / (len(train_dataset) * train_dataset.y_tensor.size(1))
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
            | Train Accuracy: {epoch_acc}\n')
        print(f"Current Learning Rate: {current_lr:.6f}")

        # (13) calculate the accuracy
        # Calculate epoch time
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)

        # Calculate remaining time
        avg_epoch_time = sum(epoch_times) / len(epoch_times)
        remaining_time = avg_epoch_time * (epochs - (epoch_num + 1))

        # Print epoch metrics and remaining time
        print(f"Epoch {epoch_num+1}/{epochs}, Loss: {total_loss_train/len(train_dataloader)}")#, Accuracy: {total_acc_train/len(train_dataloader.dataset)}")
        print(f"Time for epoch {epoch_num+1}: {epoch_time:.2f} seconds")
        print(f"Estimated remaining time: {format_time(remaining_time)} seconds")

        save_model(model, epoch_num, save_path)


In [56]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [57]:
def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [58]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [59]:
def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [None]:
train_SRC,output_labels = extract_sentences()
train_SRC = train_SRC[:100000]
output_labels = output_labels[:10000]
print("train_SRC read ",train_SRC[0])
print("output_labels read ",output_labels[0])
print("train_SRC length ",len(train_SRC))
print("output_labels length ",len(output_labels))
print("checkpoint 0")

ut_labels = read_unique_labels('./unique_labels.txt')

t_labels = {}
t_labels['0'] = 0
for i in range(len(ut_labels)):
    t_labels[ut_labels[i]] = i+1

train_SRC_size = len(train_SRC)

longest_sentence = 25
print("checkpoint 1")
#tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]
for i in range(len(output_labels)):
    output_labels[i] = output_labels[i].split()
tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]

model = NER(longest_sentence, hidden_size=64, embedding_dim=870,num_layers=1,
            dropout=0,bidirectional=True,batch_first=True,use_attention=True)
train_dataset = NERDataset(train_SRC, tag_indices, longest_sentence)    
print("model created")
print("training starting")
print("checkpoint 2")
print("---------------------------------------------")
train(model, train_dataset, batch_size=1000, epochs=2, learning_rate=0.01,norm_clip=0.1)


train_SRC read  can i have a large bbq pulled pork
output_labels read  0 0 0 B-NUMBER B-SIZE B-TOPPING I-TOPPING I-TOPPING 
train_SRC length  10000
output_labels length  10000
checkpoint 0
checkpoint 1
model created
training starting
checkpoint 2
---------------------------------------------


  0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 1/10 [00:18<02:50, 18.95s/it]

Epochs: 1 | Train Loss: 655380.0             | Train Accuracy: 0.0013199999229982495

Current Learning Rate: 0.031610
Epoch 1/2, Batch 1/10
Time for batch 1: 18.94 seconds
Estimated remaining time: 0h 5m 59s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 2/10 [00:34<02:16, 17.04s/it]

Epochs: 1 | Train Loss: 622968.375             | Train Accuracy: 0.0

Current Learning Rate: 0.030667
Epoch 1/2, Batch 2/10
Time for batch 2: 15.70 seconds
Estimated remaining time: 0h 5m 11s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 30%|███       | 3/10 [00:50<01:55, 16.46s/it]

Epochs: 1 | Train Loss: 620950.6875             | Train Accuracy: 0.0

Current Learning Rate: 0.029726
Epoch 1/2, Batch 3/10
Time for batch 3: 15.76 seconds
Estimated remaining time: 0h 4m 45s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 40%|████      | 4/10 [01:06<01:37, 16.20s/it]

Epochs: 1 | Train Loss: 613708.6875             | Train Accuracy: 0.012279999442398548

Current Learning Rate: 0.028791
Epoch 1/2, Batch 4/10
Time for batch 4: 15.80 seconds
Estimated remaining time: 0h 4m 24s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 5/10 [01:21<01:18, 15.76s/it]

Epochs: 1 | Train Loss: 621105.6875             | Train Accuracy: 0.0139999995008111

Current Learning Rate: 0.027862
Epoch 1/2, Batch 5/10
Time for batch 5: 14.96 seconds
Estimated remaining time: 0h 4m 3s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 60%|██████    | 6/10 [01:36<01:02, 15.51s/it]

Epochs: 1 | Train Loss: 598466.875             | Train Accuracy: 0.012480000033974648

Current Learning Rate: 0.026941
Epoch 1/2, Batch 6/10
Time for batch 6: 15.03 seconds
Estimated remaining time: 0h 3m 44s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 70%|███████   | 7/10 [01:51<00:46, 15.47s/it]

Epochs: 1 | Train Loss: 594046.6875             | Train Accuracy: 0.012639999389648438

Current Learning Rate: 0.026032
Epoch 1/2, Batch 7/10
Time for batch 7: 15.39 seconds
Estimated remaining time: 0h 3m 27s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 80%|████████  | 8/10 [02:07<00:30, 15.50s/it]

Epochs: 1 | Train Loss: 593947.1875             | Train Accuracy: 0.01107999961823225

Current Learning Rate: 0.025134
Epoch 1/2, Batch 8/10
Time for batch 8: 15.54 seconds
Estimated remaining time: 0h 3m 10s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 90%|█████████ | 9/10 [02:22<00:15, 15.52s/it]

Epochs: 1 | Train Loss: 602694.625             | Train Accuracy: 0.006120000034570694

Current Learning Rate: 0.024251
Epoch 1/2, Batch 9/10
Time for batch 9: 15.56 seconds
Estimated remaining time: 0h 2m 54s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 10/10 [02:38<00:00, 15.80s/it]


Epochs: 1 | Train Loss: 603589.0             | Train Accuracy: 0.006679999642074108

Current Learning Rate: 0.023382
Epoch 1/2, Batch 10/10
Time for batch 10: 15.31 seconds
Estimated remaining time: 0h 2m 37s seconds
Epochs: 1 | Train Loss: 612.68578125             | Train Accuracy: 0.007660000119358301

Current Learning Rate: 0.023382
Epoch 1/2, Loss: 612685.78125
Time for epoch 1: 158.05 seconds
Estimated remaining time: 0h 2m 38s seconds
Model saved to models_lstm/model_epoch_0.pth


  0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 1/10 [00:15<02:21, 15.68s/it]

Epochs: 2 | Train Loss: 597155.125             | Train Accuracy: 0.012120000086724758

Current Learning Rate: 0.022530
Epoch 2/2, Batch 1/10
Time for batch 1: 15.67 seconds
Estimated remaining time: 0h 2m 22s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 2/10 [00:31<02:07, 15.99s/it]

Epochs: 2 | Train Loss: 618217.125             | Train Accuracy: 0.0116799995303154

Current Learning Rate: 0.021695
Epoch 2/2, Batch 2/10
Time for batch 2: 16.20 seconds
Estimated remaining time: 0h 2m 6s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 30%|███       | 3/10 [00:47<01:51, 15.92s/it]

Epochs: 2 | Train Loss: 591697.5625             | Train Accuracy: 0.010999999940395355

Current Learning Rate: 0.020879
Epoch 2/2, Batch 3/10
Time for batch 3: 15.82 seconds
Estimated remaining time: 0h 1m 50s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 40%|████      | 4/10 [01:04<01:36, 16.11s/it]

Epochs: 2 | Train Loss: 602491.4375             | Train Accuracy: 0.011039999313652515

Current Learning Rate: 0.020081
Epoch 2/2, Batch 4/10
Time for batch 4: 16.40 seconds
Estimated remaining time: 0h 1m 35s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 5/10 [01:22<01:24, 16.86s/it]

Epochs: 2 | Train Loss: 595035.0625             | Train Accuracy: 0.01152000017464161

Current Learning Rate: 0.019304
Epoch 2/2, Batch 5/10
Time for batch 5: 18.19 seconds
Estimated remaining time: 0h 1m 20s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 60%|██████    | 6/10 [01:37<01:05, 16.45s/it]

Epochs: 2 | Train Loss: 596258.625             | Train Accuracy: 0.013719999231398106

Current Learning Rate: 0.018546
Epoch 2/2, Batch 6/10
Time for batch 6: 15.64 seconds
Estimated remaining time: 0h 1m 3s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 70%|███████   | 7/10 [01:53<00:48, 16.31s/it]

Epochs: 2 | Train Loss: 587473.125             | Train Accuracy: 0.011719999834895134

Current Learning Rate: 0.017809
Epoch 2/2, Batch 7/10
Time for batch 7: 16.03 seconds
Estimated remaining time: 0h 0m 47s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
def evaluate(model, test_dataset,batch_size=512, epochs=5):
    """
    This function takes a NER model and evaluates its performance (accuracy) on a test data
    Inputs:
    - model: a NER model
    - test_dataset: dataset of type NERDataset
    """    
    # (1) create the dataloader of the training set (make the shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    epoch_times = []
    batch_times = []

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print("CUDA is not available. Training on CPU ...")
    
    total_acc_test = 0
  
    # (2) disable gradients
    with torch.no_grad():

        for test_input, test_label in tqdm(test_dataloader):
            # (3) move the test input to the device
            test_label = test_label.to(device)

            # (4) move the test label to the device
            test_input = test_input.to(device)

            # (5) do the forward pass
            output = model(test_input)

            # accuracy calculation (just add the correct predicted items to total_acc_test)
            acc =  torch.sum(torch.argmax(output, dim=-1) == test_label).item()

            total_acc_test += acc
            
            # (6) calculate the over all accuracy
            
            total_acc_test /= (len(test_dataset) * test_dataset.y_tensor.size(1))

    print(f'\nTest Accuracy: {total_acc_test}')

In [None]:
test_dataset = NERDataset()
evaluate(model, test_dataset)