In [127]:
from transformers import RobertaTokenizer
from transformers import RobertaTokenizer, RobertaModel
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import time

from sklearn.preprocessing import normalize
import pandas as pd
from tqdm import tqdm
from torch import nn
import random as rnd
import torch
import ast
import numpy as np

In [128]:

def extract_sentences():

    # Extract src data
    source_data = []
    top_data = []
    top_decoupled_data = []

    # Read source.txt
    with open('../dataset/source.txt', 'r') as file:
        source_data = ast.literal_eval(file.read())

    # Read labels.txt
    with open("../dataset/input_labels.txt", 'r') as file:
        labels = file.read().splitlines()

    return source_data, labels

In [129]:
def read_unique_labels(file_path):
    labels = []
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

In [130]:
def extract_word2vec_embeddings(data, device='cuda'):
    """
    Extract Word2Vec embeddings
    """
    sentences = [sentence.split() for sentence in data]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    
    # Convert to GPU tensor if possible
    embeddings = []
    for sentence in sentences:
        # Get word vectors, use zeros if word not in vocabulary
        sent_embedding = [model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)]
        embedding = torch.tensor(np.mean(sent_embedding, axis=0), device=device)
        embeddings.append(embedding)
    
    return torch.stack(embeddings)

def extract_contextual_embeddings(data, device='cuda'):
    """
    Extract RoBERTa contextual embeddings
    """
    # Move model to GPU if available
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base').to(device)
    
    embeddings = []
    for sentence in data:
        # Tokenize and move to GPU
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling and move back to CPU if needed
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        embeddings.append(embedding.cpu().numpy())
    
    return np.array(embeddings)

def extract_lda_features(data, n_topics=2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_features = lda.fit_transform(tfidf_matrix)
    return lda_features

In [131]:
# def extract_word2vec_embeddings(data):
#     sentences = [sentence.split() for sentence in data]
#     model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#     embeddings = [np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in sentences]
#     return np.array(embeddings)


# def extract_contextual_embeddings(data):
#     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#     model = RobertaModel.from_pretrained('roberta-base')
#     embeddings = []
#     for sentence in data:
#         inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
#     return np.array(embeddings)

# def extract_lda_features(data, n_topics=2):
#     """Extract LDA features."""
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(data)
#     lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
#     lda_features = lda.fit_transform(tfidf_matrix)
#     return lda_features


In [132]:
def lstm_embedding(data):
    word2vec_embeddings = extract_word2vec_embeddings(data)
    lda_features = extract_lda_features(data)
    contextual_embeddings = extract_contextual_embeddings(data)

    word2vec_embeddings = word2vec_embeddings.cpu().numpy()
    combined_features = np.hstack((word2vec_embeddings, lda_features, contextual_embeddings))
    return combined_features

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################

def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [133]:
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f"{hours}h {minutes}m {seconds}s"

In [134]:
#LSTM model
class NER(nn.Module):
    def __init__(self, output_dim, hidden_size=512, embedding_dim=768,num_layers=1,dropout=0.1,bidirectional=True,batch_first=True,use_attention=False):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - output_dim: output dimension
        - hidden_size: the hidden size of the LSTM layer
        - num_layers: the number of LSTM layers
        - dropout: the dropout rate
        - bidirectional: whether to use bidirectional LSTM
        """
        super(NER, self).__init__()
        ## Word embedding layer        
        if bidirectional:
            hidden_size = hidden_size // 2
        # LSTM layer with combined embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size,num_layers=num_layers, 
                            batch_first=True, dropout=dropout, bidirectional=bidirectional)
        self.use_attention = use_attention
        if use_attention:
            if bidirectional:
                hidden_size *= 2
            self.attention = nn.ReLU()
        # Linear layer
        self.output_layer = nn.Linear(hidden_size, output_dim)

    def forward(self, embeddings):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        # LSTM and linear layers
        if embeddings.dim() == 2:
            embeddings = embeddings.unsqueeze(1)
        lstm_out, _ = self.lstm(embeddings)
        # Apply attention if needed

        if self.use_attention:
            # Compute attention weights
            attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
            # Apply weighted sum
            context = torch.sum(lstm_out * attention_weights, dim=1)
        else:
            # Use the last output if no attention
            context = lstm_out[:, -1, :]

        final_output = self.output_layer(context)
        
        return final_output
  
def train(model, train_dataset,norm_clip = 0.1, batch_size=512, epochs=5, learning_rate=0.00005 , save_path="models_lstm/model_epoch_{}.pth"):
    """
    This function implements the training logic
    Inputs:
    - model: the model ot be trained
    - train_dataset: the training set of type NERDataset
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    ############################## TODO: replace the Nones in the following code ##################################
    
    # (1) create the dataloader of the training set (make the shuffle=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epoch_times = []
    batch_times = []

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print("CUDA is not available. Training on CPU ...")
    
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        start_time = time.time()
        batch_idx = 0
        for train_input, train_label in tqdm(train_dataloader):
            batch_start_time = time.time()

            # (4) move the train input to the device
            embeddings = lstm_embedding(train_input)
            embeddings_tensor = torch.tensor(embeddings).float()
            embeddings_tensor = embeddings_tensor.to(device)

            train_label = train_label.float()
            train_label = train_label.to(device)
            
            # (5) move the train label to the device

            # (6) do the forward pass
            output = model(embeddings_tensor)
            # output = output.permute(0, 2, 1) 

            batch_loss = criterion(output.reshape(-1), train_label.view(-1))

            # (8) append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # (9) calculate the batch accuracy (just add the number of correct predictions)

            train_label = train_label.permute(1, 0) 
            acc = torch.sum(torch.argmax(output, dim=-1) == train_label)
            total_acc_train += acc

            # (10) zero your gradients
            optimizer.zero_grad()

            # (11) do the backward pass
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), norm_clip) 
            # (12) update the weights with your optimizer
            optimizer.step()
            # Calculate batch time
        ##############################################################################################################    
            # epoch loss
            epoch_loss = total_loss_train / len(train_dataset)
            epoch_acc = total_acc_train / (len(train_dataset) * train_dataset.y_tensor.size(1))
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
                | Train Accuracy: {epoch_acc}\n')
            
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            # Calculate remaining time
            avg_batch_time = sum(batch_times) / len(batch_times)
            remaining_batches = (epochs - epoch_num - 1) * len(train_dataloader) + (len(train_dataloader) - batch_idx - 1)
            remaining_time = avg_batch_time * remaining_batches

            # Print batch metrics and remaining time
            print(f"Epoch {epoch_num+1}/{epochs}, Batch {batch_idx+1}/{len(train_dataloader)}")#, Loss: {loss.item()}, Accuracy: {(predicted == train_label).sum().item() / train_label.numel()}")
            print(f"Time for batch {batch_idx+1}: {batch_time:.2f} seconds")
            print(f"Estimated remaining time: {format_time(remaining_time)} seconds")
            batch_idx += 1

        # (13) calculate the accuracy
        # Calculate epoch time
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)

        # Calculate remaining time
        avg_epoch_time = sum(epoch_times) / len(epoch_times)
        remaining_time = avg_epoch_time * (epochs - (epoch_num + 1))

        # Print epoch metrics and remaining time
        print(f"Epoch {epoch_num+1}/{epochs}, Loss: {total_loss_train/len(train_dataloader)}")#, Accuracy: {total_acc_train/len(train_dataloader.dataset)}")
        print(f"Time for epoch {epoch_num+1}: {epoch_time:.2f} seconds")
        print(f"Estimated remaining time: {format_time(remaining_time)} seconds")

        save_model(model, epoch_num, save_path)


In [135]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [136]:
def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [137]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [138]:
def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [139]:
train_SRC,output_labels = extract_sentences()
train_SRC = train_SRC[:10000]
output_labels = output_labels[:10000]
print("train_SRC read ",train_SRC[0])
print("output_labels read ",output_labels[0])
print("train_SRC length ",len(train_SRC))
print("output_labels length ",len(output_labels))
print("checkpoint 0")

ut_labels = read_unique_labels('./unique_labels.txt')

t_labels = {}
t_labels['0'] = 0
for i in range(len(ut_labels)):
    t_labels[ut_labels[i]] = i+1

train_SRC_size = len(train_SRC)

longest_sentence = 25
print("checkpoint 1")
#tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]
for i in range(len(output_labels)):
    output_labels[i] = output_labels[i].split()
tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]

model = NER(longest_sentence, hidden_size=512, embedding_dim=870,num_layers=1,
            dropout=0,bidirectional=True,batch_first=True,use_attention=True)
train_dataset = NERDataset(train_SRC, tag_indices, longest_sentence)    
print("model created")
print("training starting")
print("checkpoint 2")
print("---------------------------------------------")
train(model, train_dataset, batch_size=1000, epochs=2, learning_rate=0.0001,norm_clip=0.1)


train_SRC read  can i have a large bbq pulled pork
output_labels read  0 0 0 B-NUMBER B-SIZE B-TOPPING I-TOPPING I-TOPPING 
train_SRC length  10000
output_labels length  10000
checkpoint 0
checkpoint 1
model created
training starting
checkpoint 2
---------------------------------------------


  0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 1/10 [00:14<02:14, 14.97s/it]

Epochs: 1 | Train Loss: 65.0488125                 | Train Accuracy: 0.0030640000477433205

Epoch 1/2, Batch 1/10
Time for batch 1: 14.96 seconds
Estimated remaining time: 0h 4m 44s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 2/10 [00:30<02:02, 15.37s/it]

Epochs: 1 | Train Loss: 130.4740125                 | Train Accuracy: 0.006060000043362379

Epoch 1/2, Batch 2/10
Time for batch 2: 15.65 seconds
Estimated remaining time: 0h 4m 35s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 30%|███       | 3/10 [00:45<01:46, 15.18s/it]

Epochs: 1 | Train Loss: 195.1178875                 | Train Accuracy: 0.00910400040447712

Epoch 1/2, Batch 3/10
Time for batch 3: 14.94 seconds
Estimated remaining time: 0h 4m 18s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 40%|████      | 4/10 [01:01<01:32, 15.48s/it]

Epochs: 1 | Train Loss: 259.05925625                 | Train Accuracy: 0.011788000352680683

Epoch 1/2, Batch 4/10
Time for batch 4: 15.94 seconds
Estimated remaining time: 0h 4m 5s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 5/10 [01:16<01:16, 15.31s/it]

Epochs: 1 | Train Loss: 324.6958                 | Train Accuracy: 0.013240000233054161

Epoch 1/2, Batch 5/10
Time for batch 5: 15.02 seconds
Estimated remaining time: 0h 3m 49s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 60%|██████    | 6/10 [01:31<01:00, 15.17s/it]

Epochs: 1 | Train Loss: 389.23009375                 | Train Accuracy: 0.01450399961322546

Epoch 1/2, Batch 6/10
Time for batch 6: 14.89 seconds
Estimated remaining time: 0h 3m 33s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 70%|███████   | 7/10 [01:46<00:45, 15.13s/it]

Epochs: 1 | Train Loss: 454.115875                 | Train Accuracy: 0.01582000032067299

Epoch 1/2, Batch 7/10
Time for batch 7: 15.04 seconds
Estimated remaining time: 0h 3m 17s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 80%|████████  | 8/10 [02:01<00:30, 15.26s/it]

Epochs: 1 | Train Loss: 518.07229375                 | Train Accuracy: 0.017155999317765236

Epoch 1/2, Batch 8/10
Time for batch 8: 15.52 seconds
Estimated remaining time: 0h 3m 2s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 90%|█████████ | 9/10 [02:17<00:15, 15.26s/it]

Epochs: 1 | Train Loss: 582.36615625                 | Train Accuracy: 0.01852400042116642

Epoch 1/2, Batch 9/10
Time for batch 9: 15.25 seconds
Estimated remaining time: 0h 2m 47s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 10/10 [02:31<00:00, 15.19s/it]


Epochs: 1 | Train Loss: 646.77773125                 | Train Accuracy: 0.0197759997099638

Epoch 1/2, Batch 10/10
Time for batch 10: 14.60 seconds
Estimated remaining time: 0h 2m 31s seconds
Epoch 1/2, Loss: 646777.73125
Time for epoch 1: 151.86 seconds
Estimated remaining time: 0h 2m 31s seconds
Model saved to models_lstm/model_epoch_0.pth


  0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 1/10 [00:14<02:08, 14.32s/it]

Epochs: 2 | Train Loss: 65.5512875                 | Train Accuracy: 0.0012440000427886844

Epoch 2/2, Batch 1/10
Time for batch 1: 14.31 seconds
Estimated remaining time: 0h 2m 15s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 2/10 [00:28<01:56, 14.51s/it]

Epochs: 2 | Train Loss: 129.5726625                 | Train Accuracy: 0.0025239998940378428

Epoch 2/2, Batch 2/10
Time for batch 2: 14.63 seconds
Estimated remaining time: 0h 2m 0s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 30%|███       | 3/10 [00:43<01:41, 14.50s/it]

Epochs: 2 | Train Loss: 193.57470625                 | Train Accuracy: 0.003768000053241849

Epoch 2/2, Batch 3/10
Time for batch 3: 14.48 seconds
Estimated remaining time: 0h 1m 45s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 40%|████      | 4/10 [00:58<01:27, 14.55s/it]

Epochs: 2 | Train Loss: 257.71905                 | Train Accuracy: 0.004999999888241291

Epoch 2/2, Batch 4/10
Time for batch 4: 14.64 seconds
Estimated remaining time: 0h 1m 29s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 5/10 [01:12<01:13, 14.62s/it]

Epochs: 2 | Train Loss: 320.4324875                 | Train Accuracy: 0.006335999816656113

Epoch 2/2, Batch 5/10
Time for batch 5: 14.74 seconds
Estimated remaining time: 0h 1m 14s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 60%|██████    | 6/10 [01:27<00:58, 14.60s/it]

Epochs: 2 | Train Loss: 384.85209375                 | Train Accuracy: 0.007672000210732222

Epoch 2/2, Batch 6/10
Time for batch 6: 14.56 seconds
Estimated remaining time: 0h 0m 59s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 70%|███████   | 7/10 [01:42<00:43, 14.65s/it]

Epochs: 2 | Train Loss: 448.540425                 | Train Accuracy: 0.00897200033068657

Epoch 2/2, Batch 7/10
Time for batch 7: 14.73 seconds
Estimated remaining time: 0h 0m 44s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 80%|████████  | 8/10 [01:56<00:29, 14.54s/it]

Epochs: 2 | Train Loss: 513.4440625                 | Train Accuracy: 0.010255999863147736

Epoch 2/2, Batch 8/10
Time for batch 8: 14.30 seconds
Estimated remaining time: 0h 0m 29s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 90%|█████████ | 9/10 [02:11<00:14, 14.61s/it]

Epochs: 2 | Train Loss: 577.63285                 | Train Accuracy: 0.011603999882936478

Epoch 2/2, Batch 9/10
Time for batch 9: 14.76 seconds
Estimated remaining time: 0h 0m 14s seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 10/10 [02:25<00:00, 14.57s/it]

Epochs: 2 | Train Loss: 641.121425                 | Train Accuracy: 0.012864000163972378

Epoch 2/2, Batch 10/10
Time for batch 10: 14.48 seconds
Estimated remaining time: 0h 0m 0s seconds
Epoch 2/2, Loss: 641121.425
Time for epoch 2: 145.69 seconds
Estimated remaining time: 0h 0m 0s seconds
Model saved to models_lstm/model_epoch_1.pth



