In [1]:
import ast
import math
import os
import random as rnd
import time

import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec
from IPython.display import clear_output
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from torch import nn
from tqdm import tqdm
from transformers import RobertaTokenizer
from transformers import RobertaTokenizer, RobertaModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def extract_sentences():
    """
    Load the training sentences and their label lines from the dataset folder.

    Both file locations are fixed and relative to the current working directory:
    - '../../dataset/source.txt' is parsed as one Python literal with
      ast.literal_eval (only literals are accepted, no code is executed).
    - '../../dataset/input_labels.txt' is split into one string per line.

    Returns:
    - source_data: the object parsed from source.txt
    - labels: a list holding one string per line of input_labels.txt
    """
    # Read source.txt
    with open('../../dataset/source.txt', 'r') as file:
        source_data = ast.literal_eval(file.read())

    # Read input_labels.txt
    with open("../../dataset/input_labels.txt", 'r') as file:
        labels = file.read().splitlines()

    return source_data, labels

In [3]:
def read_unique_labels(file_path):
    """
    Read a text file and return its lines as a list of strings.

    Inputs:
    - file_path: path of the file to read

    Returns:
    - a list with one entry per line, line endings removed
    """
    with open(file_path, 'r') as handle:
        content = handle.read()
    return content.splitlines()

In [4]:
def extract_word2vec_embeddings(data, device='cuda', w2v_model=None):
    """
    Build one mean-pooled Word2Vec vector per sentence.

    Inputs:
    - data: iterable of sentences (strings); each is tokenised with str.split()
    - device: requested device for the result; CPU is used when CUDA is not
      available (same rule as extract_contextual_embeddings)
    - w2v_model: optional already-trained model exposing `.wv` (a gensim
      Word2Vec-like object). When None, a new model with 100-dimensional
      vectors is trained on `data` itself.

    Returns:
    - float32 tensor of shape (number of sentences, vector size). A sentence
      with no known word is represented by an all-zero row.

    Note: with w2v_model=None every call trains its own model, so vectors
    returned by separate calls come from separately trained models.
    """
    default_vector_size = 100
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    sentences = [sentence.split() for sentence in data]

    if w2v_model is None:
        if not any(sentences):
            # No tokens at all: there is nothing to train on, so return the
            # zero fallback for every (possibly zero) sentence.
            return torch.zeros((len(sentences), default_vector_size),
                               dtype=torch.float32, device=device)
        w2v_model = Word2Vec(sentences, vector_size=default_vector_size,
                             window=5, min_count=1, workers=4)

    word_vectors = w2v_model.wv
    vector_size = word_vectors.vector_size

    # Pool on the host in one float32 buffer, then do a single device transfer.
    # Rows stay zero for sentences without any word in the vocabulary.
    pooled = np.zeros((len(sentences), vector_size), dtype=np.float32)
    for row, tokens in enumerate(sentences):
        known = [word_vectors[word] for word in tokens if word in word_vectors]
        if known:
            pooled[row] = np.mean(known, axis=0)

    return torch.from_numpy(pooled).to(device)

def extract_contextual_embeddings(data, device='cuda'):
    """
    Encode every sentence with 'roberta-base' and mean-pool its token states.

    Inputs:
    - data: iterable of sentences; each one is tokenised and encoded on its own
    - device: requested device; CPU is used when CUDA is not available

    Returns:
    - numpy array built from one pooled vector per sentence

    The tokenizer and the model are loaded on every call.
    """
    target = torch.device(device if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    encoder = RobertaModel.from_pretrained('roberta-base').to(target)

    def _encode(sentence):
        # Tokenise a single sentence and place the tensors next to the model
        batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(target)
        with torch.no_grad():
            hidden_states = encoder(**batch).last_hidden_state
        # Average over the token axis, drop singleton axes, hand back a host array
        pooled = hidden_states.mean(dim=1).squeeze()
        return pooled.cpu().numpy()

    return np.array([_encode(sentence) for sentence in data])

def extract_lda_features(data, n_topics=2):
    """
    Fit a TF-IDF vectoriser and an LDA topic model on `data` and return the
    per-document topic features.

    Inputs:
    - data: iterable of raw text documents
    - n_topics: number of LDA components

    Returns:
    - the array produced by LatentDirichletAllocation.fit_transform on the
      TF-IDF matrix (random_state is fixed to 0)
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(data)
    topic_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    return topic_model.fit_transform(tfidf_matrix)

In [5]:
def lstm_embedding(data):
    """
    Produce the input features for the LSTM model.

    Only the Word2Vec sentence vectors from extract_word2vec_embeddings are
    used. Their shape is printed on every call.

    Inputs:
    - data: the sentences to embed

    Returns:
    - whatever extract_word2vec_embeddings returns for `data`
    """
    features = extract_word2vec_embeddings(data)
    print(features.shape)
    return features

class NERDataset(torch.utils.data.Dataset):
  """
  Pairs every sentence with its label ids, both brought to exactly `max_len`
  tokens by truncating long items and padding short ones.
  """

  def __init__(self, x, y, max_len):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of sentences (strings); tokens are separated by whitespace
    - y: a list of lists where each list contains the label id of each token in the sentence
    - max_len: the number of tokens every sentence and label row is padded or truncated to

    Sentences are padded with the token "<pad>" and label rows with 0.
    """
    # Lengths are measured in tokens (not characters) so that the padded
    # sentence and its label row stay aligned.
    self.x_tensor = [self._pad_sentence(sentence, max_len) for sentence in x]
    # Truncating before padding keeps every row at max_len, which torch.tensor
    # needs in order to build a rectangular tensor.
    self.y_tensor = torch.tensor(
        [list(seq[:max_len]) + [0] * (max_len - len(seq)) for seq in y],
        dtype=torch.long)

  @staticmethod
  def _pad_sentence(sentence, max_len):
    """
    Return `sentence` cut or padded with "<pad>" tokens to exactly max_len
    whitespace-separated tokens.
    """
    tokens = sentence.split()[:max_len]
    tokens += ["<pad>"] * (max_len - len(tokens))
    return " ".join(tokens)

  def __len__(self):
    """
    This function returns the length of the dataset (the number of sentences)
    """
    return len(self.x_tensor)

  def __getitem__(self, idx):
    """
    This function returns the (sentence, label tensor) pair stored at `idx`
    """
    return self.x_tensor[idx], self.y_tensor[idx]

def save_model(model, epoch, path="model_epoch_{}.pth"):
    """
    Save the model's state_dict to `path` formatted with `epoch`.

    Inputs:
    - model: object exposing state_dict()
    - epoch: value substituted into the `{}` placeholder of `path`
    - path: file name template; may include directories

    The parent directory is created when it does not exist yet, so a missing
    checkpoint folder does not make the save fail.
    """
    target = path.format(epoch)
    directory = os.path.dirname(target)
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save(model.state_dict(), target)
    print(f"Model saved to {target}")

In [6]:
def format_time(seconds):
    """
    Render a duration given in seconds as "<hours>h <minutes>m <seconds>s".

    Each component is truncated to an integer.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover = seconds % 60
    return f"{int(whole_hours)}h {int(whole_minutes)}m {int(leftover)}s"

In [7]:
class SimulatedAnnealingLRScheduler:
    """
    Learning-rate schedule driven by an exponentially cooling temperature.

    The temperature for batch b is temperature_init * cooling_rate ** b and the
    learning rate is a blend of max_lr and min_lr weighted by exp(-temperature),
    clamped to [min_lr, max_lr].
    """

    def __init__(self,
                 initial_lr=0.01,
                 min_lr=1e-5,
                 max_lr=1e-3,
                 batch_num=100,
                 temperature_init=1.0,
                 cooling_rate=0.95):
        """
        Parameters:
        - initial_lr: value reported as current_lr until get_lr is first called
        - min_lr: lower bound of the learning rate
        - max_lr: upper bound of the learning rate
        - batch_num: stored on the instance; get_lr does not read it
        - temperature_init: temperature at batch 0
        - cooling_rate: factor the temperature is multiplied by per batch
        """
        self.initial_lr = initial_lr
        self.current_lr = initial_lr  # last value handed out

        self.min_lr, self.max_lr = min_lr, max_lr
        self.batch_num = batch_num

        self.temperature_init = temperature_init
        self.cooling_rate = cooling_rate

    def get_lr(self, batch_num):
        """
        Return the learning rate for the given batch and remember it in
        current_lr.

        Parameters:
        - batch_num: index of the current batch (exponent of the cooling rate)

        Returns:
        - the learning rate, within [min_lr, max_lr]
        """
        temperature = self.temperature_init * (self.cooling_rate ** batch_num)

        # exp(-temperature) grows towards 1 as the temperature cools, moving
        # the blend from max_lr towards min_lr.
        cold_weight = math.exp(-temperature)
        blended = self.max_lr * (1-cold_weight) + self.min_lr * cold_weight

        self.current_lr = max(self.min_lr, min(blended, self.max_lr))
        return self.current_lr


In [8]:
#LSTM model
class NER(nn.Module):
    def __init__(self, output_dim, hidden_size=512, embedding_dim=768,num_layers=1,dropout=0.1,bidirectional=True,batch_first=True,use_attention=False):
        """
        The constructor of our NER model
        Inputs:
        - output_dim: size of the vector produced for every input sequence
        - hidden_size: total width of the LSTM output; when bidirectional it is
          split evenly between the two directions
        - embedding_dim: size of each input embedding
        - num_layers: the number of LSTM layers
        - dropout: the dropout rate between LSTM layers (only applies when
          num_layers > 1)
        - bidirectional: whether to use bidirectional LSTM
        - batch_first: whether inputs are (batch, seq, feature) instead of
          (seq, batch, feature)
        - use_attention: pool the LSTM outputs with softmax weights instead of
          taking the last time step
        """
        super(NER, self).__init__()

        num_directions = 2 if bidirectional else 1
        per_direction = hidden_size // num_directions
        # Width of what the LSTM actually emits per time step. The output layer
        # must match it in every configuration, with or without attention.
        lstm_output_dim = per_direction * num_directions

        self.batch_first = batch_first
        # Inter-layer dropout has nothing to act on with a single layer, so it
        # is only forwarded when there are several layers.
        self.lstm = nn.LSTM(embedding_dim, per_direction, num_layers=num_layers,
                            batch_first=batch_first,
                            dropout=dropout if num_layers > 1 else 0.0,
                            bidirectional=bidirectional)
        self.use_attention = use_attention
        if use_attention:
            # Parameter-free scoring: ReLU of the LSTM outputs, see forward()
            self.attention = nn.ReLU()
        # Linear layer
        self.output_layer = nn.Linear(lstm_output_dim, output_dim)

    def forward(self, embeddings):
        """
        This function does the forward pass of our model
        Inputs:
        - embeddings: tensor of shape (batch, seq, embedding_dim) when
          batch_first, otherwise (seq, batch, embedding_dim). A 2-D tensor
          (batch, embedding_dim) is treated as sequences of length 1.

        Returns:
        - final_output: tensor of shape (batch, output_dim)
        """
        seq_dim = 1 if self.batch_first else 0

        if embeddings.dim() == 2:
            embeddings = embeddings.unsqueeze(seq_dim)
        lstm_out, _ = self.lstm(embeddings)

        if self.use_attention:
            # Element-wise weights: ReLU, then softmax along the sequence axis
            attention_weights = torch.softmax(self.attention(lstm_out), dim=seq_dim)
            # Weighted sum over the sequence axis
            context = torch.sum(lstm_out * attention_weights, dim=seq_dim)
        else:
            # Use the output of the last time step if no attention
            context = lstm_out.select(seq_dim, -1)

        final_output = self.output_layer(context)

        return final_output
  
def train(model, train_dataset,norm_clip = 0.1, batch_size=512, epochs=5, learning_rate=0.00005 , save_path="models_lstm2/model_epoch_{}.pth", log_file="logs_lstm.txt"):
    """
    This function implements the training logic
    Inputs:
    - model: the model to be trained
    - train_dataset: the training set of type NERDataset
    - norm_clip: maximum gradient norm used for gradient clipping
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: currently not used; the learning rate of every step is set
      by the SimulatedAnnealingLRScheduler created below
    - save_path: file name template for the per-epoch checkpoints
    - log_file: text file the per-epoch metrics are appended to

    Side effects: prints progress, clears the cell output after every batch,
    appends to `log_file` and saves a checkpoint after every epoch.
    """
    # (1) create the dataloader of the training set
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(train_dataloader)
    labels_per_sentence = train_dataset.y_tensor.size(1)

    # (2) make the criterion cross entropy loss
    criterion = nn.CrossEntropyLoss()

    sa_scheduler = SimulatedAnnealingLRScheduler(
        initial_lr=0.5,
        min_lr=5e-4,
        max_lr=5e-2,
        batch_num=num_batches * epochs,
    )
    # (3) create the optimizer (Adam); its lr is overwritten before every step
    optimizer = torch.optim.Adam(model.parameters(), lr=sa_scheduler.initial_lr)
    epoch_times = []
    batch_times = []

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    criterion = criterion.to(device)
    if not use_cuda:
        print("CUDA is not available. Training on CPU ...")

    for epoch_num in range(epochs):
        model.train()
        total_acc_train = 0
        total_loss_train = 0
        start_time = time.time()
        batches_prev = epoch_num * num_batches
        for batch_idx, (train_input, train_label) in enumerate(tqdm(train_dataloader)):
            current_lr = sa_scheduler.get_lr(batches_prev + batch_idx)

            # Update optimizer learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
            batch_start_time = time.time()

            # (4) embed the sentences of this batch and move them to the device
            # NOTE(review): lstm_embedding runs per batch; as written in this
            # file it trains a Word2Vec model on the sentences it is given, so
            # every batch is embedded by a different model - confirm intended.
            embeddings_tensor = lstm_embedding(train_input).to(device)

            # (5) move the train label to the device
            train_label = train_label.float().to(device)

            # (10) zero your gradients
            optimizer.zero_grad()

            # (6) do the forward pass
            output = model(embeddings_tensor)

            # (7) loss on the flattened outputs and labels
            # NOTE(review): with a 1-D input and a floating-point target of the
            # same length, CrossEntropyLoss scores the whole flattened batch as
            # one distribution with the label ids as weights. Per-token
            # classification would need (N, C) logits and integer targets -
            # confirm which objective is wanted.
            batch_loss = criterion(output.reshape(-1), train_label.view(-1))

            # (8) append the batch loss to the total_loss_train
            batch_loss_value = batch_loss.item()
            total_loss_train += batch_loss_value

            # (9) calculate the batch accuracy (just add the number of correct predictions)
            # NOTE(review): argmax gives one index per row of `output`, which is
            # broadcast against the transposed label matrix - confirm this is
            # the comparison that is meant.
            with torch.no_grad():
                acc = torch.sum(torch.argmax(output, dim=-1) == train_label.permute(1, 0)).item()
            total_acc_train += acc
            # Normalise by the labels actually in this batch; the last batch
            # can hold fewer than batch_size sentences.
            batch_accuracy = acc / train_label.numel()

            # (11) do the backward pass
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), norm_clip)
            # (12) update the weights with your optimizer
            optimizer.step()
            clear_output(wait=True)

            print(
            f'Epochs: {epoch_num + 1} | Train Loss: {batch_loss_value} \
            | Train Accuracy: {batch_accuracy}\n')
            print(f"Current Learning Rate: {current_lr:.6f}")

            # Calculate batch time
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            # Calculate remaining time
            avg_batch_time = sum(batch_times) / len(batch_times)
            remaining_batches = (epochs - epoch_num - 1) * num_batches + (num_batches - batch_idx - 1)
            remaining_time = avg_batch_time * remaining_batches

            # Print batch metrics and remaining time
            print(f"Epoch {epoch_num+1}/{epochs}, Batch {batch_idx+1}/{num_batches}")
            print(f"Time for batch {batch_idx+1}: {batch_time:.2f} seconds")
            print(f"Estimated remaining time: {format_time(remaining_time)}")

        # (13) epoch metrics: total_loss_train sums one loss value per batch,
        # so it is averaged over the number of batches.
        epoch_loss = total_loss_train / num_batches
        epoch_acc = total_acc_train / (len(train_dataset) * labels_per_sentence)
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
            | Train Accuracy: {epoch_acc}\n')
        print(f"Current Learning Rate: {current_lr:.6f}")

        # Calculate epoch time
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)

        # Calculate remaining time
        avg_epoch_time = sum(epoch_times) / len(epoch_times)
        remaining_time = avg_epoch_time * (epochs - (epoch_num + 1))

        # Print epoch metrics and remaining time
        print(f"Epoch {epoch_num+1}/{epochs}, Loss: {epoch_loss}")
        print(f"Time for epoch {epoch_num+1}: {epoch_time:.2f} seconds")
        print(f"Estimated remaining time: {format_time(remaining_time)}")

        # Log epoch metrics to a file
        with open(log_file, "a") as f:
            f.write(f"Epoch {epoch_num+1}/{epochs}, Loss: {epoch_loss}, Accuracy: {epoch_acc}\n")
            f.write(f"Time for epoch {epoch_num+1}: {epoch_time:.2f} seconds\n")

        # Checkpoints are numbered with the zero-based epoch index
        save_model(model, epoch_num, save_path)


In [None]:
# Load the raw sentences and their label lines, then print a quick sample.
train_SRC, output_labels = extract_sentences()
# For a faster trial run, slice both lists to the same prefix here,
# e.g. train_SRC[:100000] and output_labels[:100000].
print("train_SRC read ", train_SRC[0])
print("output_labels read ", output_labels[0])
print("train_SRC length ", len(train_SRC))
print("output_labels length ", len(output_labels))
print("checkpoint 0")

ut_labels = read_unique_labels('./unique_labels.txt')

# Tag -> integer id. '0' starts at id 0; the tags read from the file get the
# ids 1..N in file order (a '0' entry in the file would replace the initial id).
t_labels = {'0': 0}
t_labels.update({tag: position for position, tag in enumerate(ut_labels, start=1)})

train_SRC_size = len(train_SRC)

longest_sentence = 25
print("checkpoint 1")
# Turn every label line into its list of tags (in place), then map tags to ids.
for i, label_line in enumerate(output_labels):
    output_labels[i] = label_line.split()
tag_indices = [
    [t_labels[tag] for tag in sentence_tags]
    for sentence_tags in output_labels
]

# NER's first argument is its output dimension, so the model emits
# longest_sentence values per input.
# NOTE(review): confirm that this is the intended output size for tagging.
model = NER(
    longest_sentence,
    hidden_size=64,
    embedding_dim=100,
    num_layers=1,
    dropout=0,
    bidirectional=True,
    batch_first=True,
    use_attention=True,
)
print("training src", train_SRC[0])
train_dataset = NERDataset(train_SRC, tag_indices, longest_sentence)
print("model created",
      "training starting",
      "checkpoint 2",
      "---------------------------------------------",
      sep="\n")
train(model, train_dataset, batch_size=4096, epochs=5, learning_rate=0.001, norm_clip=0.1)


 24%|██▍       | 143/600 [01:35<05:10,  1.47it/s]