In [9]:
from transformers import RobertaTokenizer
from transformers import RobertaTokenizer, RobertaModel
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import normalize
import pandas as pd
from tqdm import tqdm
from torch import nn
import random as rnd
import torch
import ast
import numpy as np

In [10]:

def extract_sentences():

    # Extract src data
    source_data = []
    top_data = []
    top_decoupled_data = []

    # Read source.txt
    with open('../dataset/source.txt', 'r') as file:
        source_data = ast.literal_eval(file.read())

    # Read labels.txt
    with open("../dataset/input_labels.txt", 'r') as file:
        labels = file.read().splitlines()

    return source_data, labels

In [11]:
def read_unique_labels(file_path):
    labels = []
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

In [12]:
def extract_word2vec_embeddings(data, device='cuda'):
    """
    Extract Word2Vec embeddings
    """
    sentences = [sentence.split() for sentence in data]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    
    # Convert to GPU tensor if possible
    embeddings = []
    for sentence in sentences:
        # Get word vectors, use zeros if word not in vocabulary
        sent_embedding = [model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)]
        embedding = torch.tensor(np.mean(sent_embedding, axis=0), device=device)
        embeddings.append(embedding)
    
    return torch.stack(embeddings)

def extract_contextual_embeddings(data, device='cuda'):
    """
    Extract RoBERTa contextual embeddings
    """
    # Move model to GPU if available
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base').to(device)
    
    embeddings = []
    for sentence in data:
        # Tokenize and move to GPU
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling and move back to CPU if needed
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        embeddings.append(embedding.cpu().numpy())
    
    return np.array(embeddings)

def extract_lda_features(data, n_topics=2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_features = lda.fit_transform(tfidf_matrix)
    return lda_features

In [13]:
# def extract_word2vec_embeddings(data):
#     sentences = [sentence.split() for sentence in data]
#     model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#     embeddings = [np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in sentences]
#     return np.array(embeddings)


# def extract_contextual_embeddings(data):
#     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#     model = RobertaModel.from_pretrained('roberta-base')
#     embeddings = []
#     for sentence in data:
#         inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
#     return np.array(embeddings)

# def extract_lda_features(data, n_topics=2):
#     """Extract LDA features."""
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(data)
#     lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
#     lda_features = lda.fit_transform(tfidf_matrix)
#     return lda_features


In [22]:
def lstm_embedding(data):
    word2vec_embeddings = extract_word2vec_embeddings(data)
    lda_features = extract_lda_features(data)
    contextual_embeddings = extract_contextual_embeddings(data)
    print(word2vec_embeddings.shape, lda_features.shape, contextual_embeddings.shape)
    print(type(word2vec_embeddings), type(lda_features), type(contextual_embeddings))
    word2vec_embeddings = word2vec_embeddings.cpu().numpy()
    combined_features = np.hstack((word2vec_embeddings, lda_features, contextual_embeddings))
    return combined_features


In [15]:
#LSTM model
class NER(nn.Module):
    def __init__(self, n_classes, hidden_size=50, embedding_dim=768):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (tags)
        """
        super(NER, self).__init__()
        ## Word embedding layer
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Combine word and contextual embeddings
        #combined_embedding_dim = embedding_dim + contextual_embedding_dim
        
        # LSTM layer with combined embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        
        # Linear layer
        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, embeddings):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        # Word embeddings
        #word_embedded = self.embedding(sentences)

        # Ensure contextual embeddings have the same dimensions as word embeddings
        #contextual_embeddings = contextual_embeddings[:, :word_embedded.size(1), :]

        # Concatenate word and contextual embeddings
        #combined_embeddings = torch.cat([word_embedded, contextual_embeddings], dim=-1)
        
        # LSTM and linear layers
        lstm_out, _ = self.lstm(embeddings)
        final_output = self.linear(lstm_out)
        
        return final_output
  
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.00005):
    """
    This function implements the training logic
    Inputs:
    - model: the model ot be trained
    - train_dataset: the training set of type NERDataset
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    ############################## TODO: replace the Nones in the following code ##################################
    
    # (1) create the dataloader of the training set (make the shuffle=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print("CUDA is not available. Training on CPU ...")
    
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            # (4) move the train input to the device
            embeddings = lstm_embedding(train_input)
            embeddings_tensor = torch.tensor(embeddings).float()
            embeddings_tensor = embeddings_tensor.to(device)

            train_label = train_label.float()
            train_label = train_label.to(device)
            
            # (5) move the train label to the device

            # (6) do the forward pass
            output = model(embeddings_tensor)
            # output = output.permute(0, 2, 1) 

            batch_loss = criterion(output.reshape(-1), train_label.view(-1))

            # (8) append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # (9) calculate the batch accuracy (just add the number of correct predictions)

            train_label = train_label.permute(1, 0) 
            acc = torch.sum(torch.argmax(output, dim=-1) == train_label)
            total_acc_train += acc

            # (10) zero your gradients
            optimizer.zero_grad()

            # (11) do the backward pass
            batch_loss.backward()

            # (12) update the weights with your optimizer
            optimizer.step()
        

        ##############################################################################################################    
            # epoch loss
            epoch_loss = total_loss_train / len(train_dataset)

            # (13) calculate the accuracy
            epoch_acc = total_acc_train / (len(train_dataset) * train_dataset.y_tensor.size(1))

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
                | Train Accuracy: {epoch_acc}\n')


In [16]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [23]:
train_SRC,output_labels = extract_sentences()
train_SRC = train_SRC[:52800]
output_labels = output_labels[:52800]
print("train_SRC read ",train_SRC[0])
print("output_labels read ",output_labels[0])
print("train_SRC length ",len(train_SRC))
print("output_labels length ",len(output_labels))
print("checkpoint 0")

ut_labels = read_unique_labels('./unique_labels.txt')

t_labels = {}
t_labels['0'] = 0
for i in range(len(ut_labels)):
    t_labels[ut_labels[i]] = i+1

train_SRC_size = len(train_SRC)

longest_sentence = 25
print("checkpoint 1")
#tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]
for i in range(len(output_labels)):
    output_labels[i] = output_labels[i].split()
tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]

model = NER(longest_sentence, hidden_size=50, embedding_dim=870)
train_dataset = NERDataset(train_SRC, tag_indices, longest_sentence)    
print("model created")
print("training starting")
print("checkpoint 2")
print("---------------------------------------------")
train(model, train_dataset, batch_size=256, epochs=10, learning_rate=0.00005)


train_SRC read  can i have a large bbq pulled pork
output_labels read  0 0 0 B-NUMBER B-SIZE B-TOPPING I-TOPPING I-TOPPING 
train_SRC length  52800
output_labels length  52800
checkpoint 0
checkpoint 1
model created
training starting
checkpoint 2
---------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/207 [00:04<16:30,  4.81s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 2.660028409090909                 | Train Accuracy: 3.7878789953538217e-06



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 2/207 [00:09<15:39,  4.58s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 5.387160866477273                 | Train Accuracy: 1.060606064129388e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|▏         | 3/207 [00:13<15:27,  4.55s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 8.125716145833334                 | Train Accuracy: 1.6666666851961054e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 4/207 [00:18<15:22,  4.54s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 10.958578361742424                 | Train Accuracy: 2.7272728402749635e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 5/207 [00:22<15:16,  4.54s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 13.747787642045454                 | Train Accuracy: 4.242424256517552e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 6/207 [00:27<15:18,  4.57s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 16.58297881155303                 | Train Accuracy: 5.075757871964015e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 7/207 [00:31<15:05,  4.53s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 19.340328776041666                 | Train Accuracy: 6.515151471830904e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▍         | 8/207 [00:36<15:02,  4.54s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 22.045736860795454                 | Train Accuracy: 7.803030166542158e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▍         | 9/207 [00:41<15:12,  4.61s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 24.743497277462122                 | Train Accuracy: 9.393939399160445e-05



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 10/207 [00:45<14:56,  4.55s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 27.543950639204546                 | Train Accuracy: 0.000115909097075928



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 11/207 [00:50<14:55,  4.57s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 30.278639618844696                 | Train Accuracy: 0.00013712121290154755



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▌         | 12/207 [00:54<15:01,  4.62s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 32.98505326704545                 | Train Accuracy: 0.00016363637405447662



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▋         | 13/207 [00:59<14:54,  4.61s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 35.78092151988636                 | Train Accuracy: 0.00020000000949949026



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  7%|▋         | 14/207 [01:04<15:16,  4.75s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 38.38720821496212                 | Train Accuracy: 0.00024772726465016603



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  7%|▋         | 15/207 [01:09<15:21,  4.80s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 41.14061937736742                 | Train Accuracy: 0.0002749999985098839



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  8%|▊         | 16/207 [01:14<15:23,  4.84s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 43.877684067234846                 | Train Accuracy: 0.00031287880847230554



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  8%|▊         | 17/207 [01:19<15:02,  4.75s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 46.571731178977274                 | Train Accuracy: 0.0003545454528648406



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  9%|▊         | 18/207 [01:23<15:07,  4.80s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 49.2872431344697                 | Train Accuracy: 0.0003863636520691216



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  9%|▉         | 19/207 [01:28<15:08,  4.83s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 52.076506865530305                 | Train Accuracy: 0.00042803032556548715



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|▉         | 20/207 [01:33<14:53,  4.78s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 54.808784327651516                 | Train Accuracy: 0.0004583333502523601



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 21/207 [01:37<14:29,  4.67s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 57.65274917140152                 | Train Accuracy: 0.0004962121602147818



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 11%|█         | 22/207 [01:42<14:29,  4.70s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 60.443660925662876                 | Train Accuracy: 0.0005318181938491762



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 11%|█         | 23/207 [01:47<14:30,  4.73s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 63.21889411695076                 | Train Accuracy: 0.0005696970038115978



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▏        | 24/207 [01:52<14:23,  4.72s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 65.90099076704546                 | Train Accuracy: 0.0006242424133233726



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▏        | 25/207 [01:56<14:14,  4.69s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 68.68206676136364                 | Train Accuracy: 0.0006621212232857943



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█▎        | 26/207 [02:01<14:15,  4.73s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 71.36861357717802                 | Train Accuracy: 0.0007007576059550047



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█▎        | 27/207 [02:06<14:05,  4.70s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 74.15041489109848                 | Train Accuracy: 0.000733333348762244



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▎        | 28/207 [02:10<13:42,  4.59s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 76.90238636363637                 | Train Accuracy: 0.0007750000222586095



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▍        | 29/207 [02:15<13:39,  4.60s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 79.57091382575757                 | Train Accuracy: 0.0008159091230481863



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▍        | 30/207 [02:19<13:23,  4.54s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 82.43123046875                 | Train Accuracy: 0.0008500000112690032



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 15%|█▍        | 31/207 [02:24<13:30,  4.61s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 85.11967507102273                 | Train Accuracy: 0.0008977272664196789



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 15%|█▌        | 32/207 [02:29<13:37,  4.67s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 87.7943388967803                 | Train Accuracy: 0.000940909085329622



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 16%|█▌        | 33/207 [02:33<13:35,  4.69s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 90.50483723958334                 | Train Accuracy: 0.000981060671620071



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 16%|█▋        | 34/207 [02:39<13:59,  4.85s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 93.14                 | Train Accuracy: 0.0010136363562196493



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 35/207 [02:43<13:43,  4.79s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 95.88441702178031                 | Train Accuracy: 0.0010522727388888597



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 36/207 [02:48<13:48,  4.85s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 98.6340234375                 | Train Accuracy: 0.0010954545577988029



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 18%|█▊        | 37/207 [02:53<13:38,  4.82s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 101.40762961647727                 | Train Accuracy: 0.0011257575824856758



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 18%|█▊        | 38/207 [02:58<13:23,  4.75s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 104.14966382575757                 | Train Accuracy: 0.0011696970323100686



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|█▉        | 39/207 [03:03<13:32,  4.84s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 106.81324041193182                 | Train Accuracy: 0.0012159091420471668



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|█▉        | 40/207 [03:08<13:33,  4.87s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 109.42995738636364                 | Train Accuracy: 0.0012560606701299548



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|█▉        | 41/207 [03:13<13:46,  4.98s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 112.032626953125                 | Train Accuracy: 0.0013007576344534755



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 42/207 [03:17<13:25,  4.88s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 114.74984404592803                 | Train Accuracy: 0.0013348484644666314



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 21%|██        | 43/207 [03:22<13:16,  4.86s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 117.35826319839015                 | Train Accuracy: 0.0013840909814462066



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 21%|██▏       | 44/207 [03:27<12:56,  4.76s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 120.15972153172349                 | Train Accuracy: 0.0014287879457697272



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 22%|██▏       | 45/207 [03:32<12:59,  4.81s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 122.83512517755682                 | Train Accuracy: 0.001466666697524488



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 22%|██▏       | 46/207 [03:36<12:47,  4.77s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 125.4859120501894                 | Train Accuracy: 0.001496212207712233



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 23%|██▎       | 47/207 [03:41<12:46,  4.79s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 128.19873490767046                 | Train Accuracy: 0.0015363636193796992



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 23%|██▎       | 48/207 [03:46<12:53,  4.87s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 130.94038026751895                 | Train Accuracy: 0.0015795454382896423



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 24%|██▎       | 49/207 [03:51<12:56,  4.91s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 133.67438624526514                 | Train Accuracy: 0.0016159091610461473



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 24%|██▍       | 50/207 [03:56<12:54,  4.94s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 136.46289654356062                 | Train Accuracy: 0.001653787912800908



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|██▍       | 51/207 [04:01<12:42,  4.89s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 139.20079752604167                 | Train Accuracy: 0.0016924242954701185



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|██▌       | 52/207 [04:06<12:37,  4.89s/it]

torch.Size([256, 100]) (256, 2) (256, 768)
<class 'torch.Tensor'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Epochs: 1 | Train Loss: 141.85047378077653                 | Train Accuracy: 0.0017325758235529065



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|██▌       | 52/207 [04:10<12:27,  4.82s/it]


KeyboardInterrupt: 