In [1]:
from transformers import RobertaTokenizer
from transformers import RobertaTokenizer, RobertaModel
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import normalize
import pandas as pd
from tqdm import tqdm
from torch import nn
import random as rnd
import torch
import ast
import numpy as np

In [2]:

def extract_sentences():

    # Extract src data
    source_data = []
    top_data = []
    top_decoupled_data = []

    # Read source.txt
    with open('../dataset/source.txt', 'r') as file:
        source_data = ast.literal_eval(file.read())

    # Read labels.txt
    with open("../dataset/input_labels.txt", 'r') as file:
        labels = file.read().splitlines()

    return source_data, labels

In [3]:
def read_unique_labels(file_path):
    labels = []
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

In [4]:
def extract_word2vec_embeddings(data, device='cuda'):
    """
    Extract Word2Vec embeddings
    """
    sentences = [sentence.split() for sentence in data]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    
    # Convert to GPU tensor if possible
    embeddings = []
    for sentence in sentences:
        # Get word vectors, use zeros if word not in vocabulary
        sent_embedding = [model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)]
        embedding = torch.tensor(np.mean(sent_embedding, axis=0), device=device)
        embeddings.append(embedding)
    
    return torch.stack(embeddings)

def extract_contextual_embeddings(data, device='cuda'):
    """
    Extract RoBERTa contextual embeddings
    """
    # Move model to GPU if available
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base').to(device)
    
    embeddings = []
    for sentence in data:
        # Tokenize and move to GPU
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling and move back to CPU if needed
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        embeddings.append(embedding.cpu().numpy())
    
    return np.array(embeddings)

def extract_lda_features(data, n_topics=2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_features = lda.fit_transform(tfidf_matrix)
    return lda_features

In [5]:
# def extract_word2vec_embeddings(data):
#     sentences = [sentence.split() for sentence in data]
#     model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#     embeddings = [np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in sentences]
#     return np.array(embeddings)


# def extract_contextual_embeddings(data):
#     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#     model = RobertaModel.from_pretrained('roberta-base')
#     embeddings = []
#     for sentence in data:
#         inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
#     return np.array(embeddings)

# def extract_lda_features(data, n_topics=2):
#     """Extract LDA features."""
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(data)
#     lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
#     lda_features = lda.fit_transform(tfidf_matrix)
#     return lda_features


In [6]:
def lstm_embedding(data):
    word2vec_embeddings = extract_word2vec_embeddings(data)
    lda_features = extract_lda_features(data)
    contextual_embeddings = extract_contextual_embeddings(data)

    word2vec_embeddings = word2vec_embeddings.cpu().numpy()
    combined_features = np.hstack((word2vec_embeddings, lda_features, contextual_embeddings))
    return combined_features


In [7]:
#LSTM model
class NER(nn.Module):
    def __init__(self, n_classes, hidden_size=50, embedding_dim=768):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (tags)
        """
        super(NER, self).__init__()
        ## Word embedding layer
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Combine word and contextual embeddings
        #combined_embedding_dim = embedding_dim + contextual_embedding_dim
        
        # LSTM layer with combined embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        
        # Linear layer
        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, embeddings):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        # Word embeddings
        #word_embedded = self.embedding(sentences)

        # Ensure contextual embeddings have the same dimensions as word embeddings
        #contextual_embeddings = contextual_embeddings[:, :word_embedded.size(1), :]

        # Concatenate word and contextual embeddings
        #combined_embeddings = torch.cat([word_embedded, contextual_embeddings], dim=-1)
        
        # LSTM and linear layers
        lstm_out, _ = self.lstm(embeddings)
        final_output = self.linear(lstm_out)
        
        return final_output
  
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.00005 , save_path="models_lstm/model_epoch_{}.pth"):
    """
    This function implements the training logic
    Inputs:
    - model: the model ot be trained
    - train_dataset: the training set of type NERDataset
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    ############################## TODO: replace the Nones in the following code ##################################
    
    # (1) create the dataloader of the training set (make the shuffle=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print("CUDA is not available. Training on CPU ...")
    
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            # (4) move the train input to the device
            embeddings = lstm_embedding(train_input)
            embeddings_tensor = torch.tensor(embeddings).float()
            embeddings_tensor = embeddings_tensor.to(device)

            train_label = train_label.float()
            train_label = train_label.to(device)
            
            # (5) move the train label to the device

            # (6) do the forward pass
            output = model(embeddings_tensor)
            # output = output.permute(0, 2, 1) 

            batch_loss = criterion(output.reshape(-1), train_label.view(-1))

            # (8) append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # (9) calculate the batch accuracy (just add the number of correct predictions)

            train_label = train_label.permute(1, 0) 
            acc = torch.sum(torch.argmax(output, dim=-1) == train_label)
            total_acc_train += acc

            # (10) zero your gradients
            optimizer.zero_grad()

            # (11) do the backward pass
            batch_loss.backward()

            # (12) update the weights with your optimizer
            optimizer.step()
        

        ##############################################################################################################    
            # epoch loss
            epoch_loss = total_loss_train / len(train_dataset)

            # (13) calculate the accuracy
            epoch_acc = total_acc_train / (len(train_dataset) * train_dataset.y_tensor.size(1))

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
                | Train Accuracy: {epoch_acc}\n')
            save_model(model, epoch_num, save_path)


In [8]:

class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # i guess x should be extended to have the same length as y
    self.x_tensor = x
    self.y_tensor = torch.tensor([seq + [0] * (max_len - len(seq)) for seq in y], dtype=torch.long)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################

    return len(self.x_tensor)
  
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x_tensor[idx], self.y_tensor[idx]
    ##########################################################################################


In [9]:
def save_model(model, epoch, path="model_epoch_{}.pth"):
    torch.save(model.state_dict(), path.format(epoch))
    print(f"Model saved to {path.format(epoch)}")

In [None]:
train_SRC,output_labels = extract_sentences()
train_SRC = train_SRC[:1024]
output_labels = output_labels[:1024]
print("train_SRC read ",train_SRC[0])
print("output_labels read ",output_labels[0])
print("train_SRC length ",len(train_SRC))
print("output_labels length ",len(output_labels))
print("checkpoint 0")

ut_labels = read_unique_labels('./unique_labels.txt')

t_labels = {}
t_labels['0'] = 0
for i in range(len(ut_labels)):
    t_labels[ut_labels[i]] = i+1

train_SRC_size = len(train_SRC)

longest_sentence = 25
print("checkpoint 1")
#tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]
for i in range(len(output_labels)):
    output_labels[i] = output_labels[i].split()
tag_indices = [[t_labels[tag] for tag in sentence_tags] for sentence_tags in output_labels]

model = NER(longest_sentence, hidden_size=50, embedding_dim=870)
train_dataset = NERDataset(train_SRC, tag_indices, longest_sentence)    
print("model created")
print("training starting")
print("checkpoint 2")
print("---------------------------------------------")
train(model, train_dataset, batch_size=1024, epochs=10, learning_rate=0.005)


train_SRC read  can i have a large bbq pulled pork
output_labels read  0 0 0 B-NUMBER B-SIZE B-TOPPING I-TOPPING I-TOPPING 
train_SRC length  1024
output_labels length  1024
checkpoint 0
checkpoint 1
model created
training starting
checkpoint 2
---------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:15<00:00, 15.25s/it]


Epochs: 1 | Train Loss: 658.3058471679688                 | Train Accuracy: 0.0

Model saved to models_lstm/model_epoch_0.pth


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:15<00:00, 15.51s/it]


Epochs: 2 | Train Loss: 646.1591186523438                 | Train Accuracy: 0.008242187090218067

Model saved to models_lstm/model_epoch_1.pth


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:15<00:00, 15.22s/it]


Epochs: 3 | Train Loss: 637.9321899414062                 | Train Accuracy: 0.014140624552965164

Model saved to models_lstm/model_epoch_2.pth


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:15<00:00, 15.39s/it]


Epochs: 4 | Train Loss: 632.2236938476562                 | Train Accuracy: 0.014140624552965164

Model saved to models_lstm/model_epoch_3.pth


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:16<00:00, 16.30s/it]


Epochs: 5 | Train Loss: 628.084716796875                 | Train Accuracy: 0.014140624552965164

Model saved to models_lstm/model_epoch_4.pth


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:16<00:00, 16.37s/it]


Epochs: 6 | Train Loss: 625.2139892578125                 | Train Accuracy: 0.014140624552965164

Model saved to models_lstm/model_epoch_5.pth


  0%|          | 0/1 [00:00<?, ?it/s]