# Language Model training & model selection

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import math
import re
from tqdm import tqdm
import torchtext
import copy

# Seed every RNG library this notebook imports (random, numpy, torch) so that
# a Restart Kernel -> Run All starts from the same random state each time.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


### Load train, validation and test datasets

In [2]:
# Folder holding the pre-split CSV files; each one is read into its own frame.
data_folder = 'datasets/LM-training-datasets'
train_df, validation_df, test_df = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

### Setting global features and hyperparameters

In [3]:
# ---- Global features -------------------------------------------------------
n_epochs = 200        # maximum number of epochs per training run
MAX_DEPTHS = [5, 10]  # candidate values for the max depth of each path
MIN_FREQS = [3, 5]    # candidate minimum word frequencies for the vocabulary

# ---- Hyperparameters to test (each variable lists its candidate values) ----
embedding_size = [128, 256, 512]
n_layers = [2, 3, 4]
dropout_rate = [0.2, 0.4, 0.6]

# Full grid of (embedding size, number of layers, dropout rate) triples;
# the dropout rate varies fastest, the embedding size slowest.
hyperparameters = [
    (es, nl, dr)
    for es in embedding_size
    for nl in n_layers
    for dr in dropout_rate
]

# ---- Fixed hyperparameters (not part of the search) ------------------------
batch_size = 128
lr = 1e-3
# When True the embedding and output layers share one weight matrix, which
# reduces the parameter count and requires hidden_dim == embedding_dim.
tie_weights = True


### Data preprocessing

In [4]:
# Custom tokenizer to prepare the data

# Placeholder substituted for path segments that look like a year.
year_token = 'YEAR'


def custom_tokenizer(path, MAX_DEPTH):
    """Turn a URL path into a list of at most ``MAX_DEPTH`` segment tokens.

    Leading slashes are stripped, the remainder is split on ``'/'`` and only
    the first ``MAX_DEPTH`` segments are kept. Every kept segment that matches
    the four-digit pattern is replaced by ``year_token``.

    Args:
        path: the path string to tokenize.
        MAX_DEPTH: maximum number of segments to keep.

    Returns:
        A new list of string tokens.
    """
    segments = path.lstrip('/').split('/')[:MAX_DEPTH]
    return [
        year_token if re.match(r'^\d{4}$', segment) else segment
        for segment in segments
    ]

# Function to yield tokens from the DataFrame
def yield_tokens(data_iter, MAX_DEPTH):
    """Lazily yield the token list of every path in ``data_iter``.

    Each path is tokenized with ``custom_tokenizer(path, MAX_DEPTH)``.
    """
    yield from map(lambda path: custom_tokenizer(path, MAX_DEPTH), data_iter)



In [5]:
# Create the vocabulary
def create_vocabulary(MIN_FREQ, MAX_DEPTH):
    """Build the torchtext vocabulary from the ``Path`` column of ``train_df``.

    Args:
        MIN_FREQ: minimum frequency a token needs to enter the vocabulary.
        MAX_DEPTH: maximum path depth handed to the tokenizer.

    Returns:
        The vocabulary, with the four special tokens inserted and '<unk>' as
        the default for unknown tokens.
    """
    vocab = torchtext.vocab.build_vocab_from_iterator(
        yield_tokens(train_df['Path'], MAX_DEPTH),
        min_freq=MIN_FREQ,
    )

    # Insertions happen one after another in exactly this order.
    # NOTE(review): assuming insert_token shifts later entries like list.insert,
    # the final index of a token can differ from the position given here, so
    # always look special tokens up by name. Do not reorder: the saved
    # checkpoints depend on the resulting token -> index mapping.
    special_insertions = (('<unk>', 0), ('<eos>', 2), ('<sos>', 1), ('<pad>', 3))
    for special, position in special_insertions:
        vocab.insert_token(special, position)

    vocab.set_default_index(vocab['<unk>'])
    return vocab

In [6]:
# Dataloaders
def get_dataloader(tokens, vocab, batch_size, seq_len):
    """Encode token lists into one long stream shaped ``(batch_size, num_batches)``.

    Every token list is wrapped as ``<sos> ... <eos>``, right-padded with
    ``<pad>`` up to ``seq_len`` (longer lists are left as they are), mapped to
    indices through ``vocab`` and appended to a single flat stream. The stream
    is cut to a multiple of ``batch_size`` and reshaped into ``batch_size``
    rows.

    Args:
        tokens: iterable of lists of string tokens. The lists are not modified.
        vocab: mapping from token string to integer index (indexed with
            ``vocab[token]``); must contain '<sos>', '<eos>' and '<pad>'.
        batch_size: number of rows of the returned tensor.
        seq_len: minimum length of each encoded sequence after padding.

    Returns:
        A ``torch.long`` tensor of shape ``(batch_size, num_batches)``, where
        ``num_batches`` is the stream length floor-divided by ``batch_size``.
    """
    data = []
    for token_list in tokens:
        # Build a fresh list: appending to `token_list` itself would leak the
        # markers into the caller's data and stack them up on repeated calls.
        sequence = ['<sos>', *token_list, '<eos>']
        # A non-positive multiplier yields [], so long sequences stay untouched.
        sequence.extend(['<pad>'] * (seq_len - len(sequence)))
        data.extend(vocab[token] for token in sequence)

    data = torch.tensor(data, dtype=torch.long)
    num_batches = data.shape[0] // batch_size
    # Drop the tail that does not fill a complete column of the final matrix.
    data = data[:num_batches * batch_size]
    return data.view(batch_size, num_batches)

def get_dataloaders(MIN_FREQ, MAX_DEPTH, seq_len):
    """Build the encoded training and validation tensors.

    A vocabulary is created for ``(MIN_FREQ, MAX_DEPTH)``; the ``Path`` column
    of ``train_df`` and of ``validation_df`` is then tokenized and encoded with
    ``get_dataloader`` using the module-level ``batch_size``.

    Returns:
        ``(train_data, valid_data)``.
    """
    vocab = create_vocabulary(MIN_FREQ, MAX_DEPTH)

    def encode(frame):
        tokenized = [custom_tokenizer(url, MAX_DEPTH) for url in frame['Path']]
        return get_dataloader(tokenized, vocab, batch_size, seq_len)

    train_data = encode(train_df)
    valid_data = encode(validation_df)
    return train_data, valid_data

### Language Model definition, training and evaluation functions

In [7]:
class LSTM(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear decoder.

    Args:
        vocab_size: number of tokens; size of the embedding table and of the
            output layer.
        embedding_dim: width of the token embeddings (LSTM input size).
        hidden_dim: width of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout probability, used inside ``nn.LSTM`` and by the
            explicit dropout applied after the embedding and after the LSTM.
        tie_weights: if True the embedding and the output layer share one
            weight matrix; requires ``embedding_dim == hidden_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate,
                tie_weights):

        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.tie_weights = tie_weights

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # The `dropout` argument of nn.LSTM acts between the stacked layers;
        # batch_first=True means inputs/outputs are (batch, seq, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                    dropout=dropout_rate, batch_first=True)

        # Shared dropout module, applied to the embeddings and to the LSTM
        # output in forward().
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        """Run one window of token indices through the model.

        Args:
            src: integer tensor of token indices, laid out (batch, seq).
            hidden: ``(hidden, cell)`` state tuple passed on to the LSTM.

        Returns:
            ``(prediction, hidden)``: the output of the final linear layer for
            every position, and the updated LSTM state.
        """
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction, hidden

    def init_weights(self):
        """Re-initialise embedding, decoder and LSTM input/recurrent weights.

        The embedding is drawn from U(-0.1, 0.1); the decoder weight and the
        LSTM ``weight_ih``/``weight_hh`` matrices from U(-r, r) with
        ``r = 1/sqrt(hidden_dim)``; the decoder bias is zeroed. With tied
        weights the embedding and decoder are the same tensor, so the decoder
        range is the one that remains.
        """
        init_range_emb = 0.1
        init_range_other = 1 / math.sqrt(self.hidden_dim)
        with torch.no_grad():
            self.embedding.weight.uniform_(-init_range_emb, init_range_emb)
            self.fc.weight.uniform_(-init_range_other, init_range_other)
            self.fc.bias.zero_()
            for layer in range(self.num_layers):
                # Fill the registered parameters in place. Assigning into
                # `self.lstm.all_weights[layer][k]` only changes a temporary
                # list and leaves the real weights untouched.
                for prefix in ('weight_ih_l', 'weight_hh_l'):
                    weight = getattr(self.lstm, f'{prefix}{layer}')
                    weight.uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        """Return zero ``(hidden, cell)`` states of shape
        ``(num_layers, batch_size, hidden_dim)`` on ``device``."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        hidden = torch.zeros(state_shape, device=device)
        cell = torch.zeros(state_shape, device=device)
        return hidden, cell

    def detach_hidden(self, hidden):
        """Detach both state tensors from the autograd graph so gradients do
        not flow back into earlier windows."""
        hidden, cell = hidden
        return hidden.detach(), cell.detach()

In [8]:
# get_batch and train (one epoch) functions
def get_batch(data, seq_len, num_batches, idx):
    """Slice one input window and its next-token targets out of ``data``.

    Args:
        data: 2-D tensor; windows are taken along the second dimension.
        seq_len: width of the window.
        num_batches: unused; kept so existing callers keep working.
        idx: column at which the input window starts.

    Returns:
        ``(src, target)`` where ``src`` is ``data[:, idx:idx+seq_len]`` and
        ``target`` is the same window shifted one column to the right.
    """
    src = data[:, idx:idx+seq_len]
    target = data[:, idx+1:idx+seq_len+1]
    return src, target


def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    """Train ``model`` for one pass over ``data`` and return the epoch loss.

    ``data`` is walked along its last dimension in windows of ``seq_len``
    columns; the LSTM state is carried from window to window but detached
    before each one. Gradients are clipped to norm ``clip`` before every
    optimizer step.

    Returns:
        The sum of ``loss * seq_len`` over all windows, divided by the number
        of columns kept after trimming.
    """
    model.train()

    # Trim trailing columns so that, apart from the one extra column needed
    # for the last target, the width is a multiple of seq_len.
    total_columns = data.shape[-1]
    data = data[:, :total_columns - (total_columns - 1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    running_loss = 0

    # The final column can only serve as a target, never as a source.
    for start in range(0, num_batches - 1, seq_len):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, start)
        src = src.to(device)
        target = target.to(device)
        rows = src.shape[0]

        prediction, hidden = model(src, hidden)
        flat_prediction = prediction.reshape(rows * seq_len, -1)
        flat_target = target.reshape(-1)
        loss = criterion(flat_prediction, flat_target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item() * seq_len

    return running_loss / num_batches


In [9]:
# Evaluation function (one epoch) (used for early stopping)
def evaluate(model, data, criterion, batch_size, seq_len, device):
    """Compute the epoch loss of ``model`` on ``data`` without updating it.

    Mirrors the windowing of ``train``: ``data`` is walked along its last
    dimension in windows of ``seq_len`` columns with the LSTM state carried
    across windows, but the model is in eval mode and no gradients are
    tracked.

    Returns:
        The sum of ``loss * seq_len`` over all windows, divided by the number
        of columns kept after trimming.
    """
    model.eval()

    total_columns = data.shape[-1]
    data = data[:, :total_columns - (total_columns - 1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    running_loss = 0

    with torch.no_grad():
        for start in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)

            src, target = get_batch(data, seq_len, num_batches, start)
            src = src.to(device)
            target = target.to(device)
            rows = src.shape[0]

            prediction, hidden = model(src, hidden)
            flat_prediction = prediction.reshape(rows * seq_len, -1)
            flat_target = target.reshape(-1)

            running_loss += criterion(flat_prediction, flat_target).item() * seq_len

    return running_loss / num_batches

### Training


In [10]:
# Training function for one set of hyperparameters

def train_model(n_epochs, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights, MIN_FREQ, MAX_DEPTH, device, batch_size, lr):
    """Train one LSTM configuration with LR decay and early stopping.

    The learning rate is halved after every epoch whose validation loss does
    not improve, and training stops once 10 epochs in a row bring no
    improvement (or after ``n_epochs`` epochs).

    Returns:
        ``(best_model_state_dict, best_valid_loss)``: a deep copy of the
        weights at the lowest validation loss seen, and that loss.
    """
    seq_len = MAX_DEPTH + 2  # MAX_DEPTH path tokens plus <sos> and <eos>
    vocab = create_vocabulary(MIN_FREQ, MAX_DEPTH)
    train_data, valid_data = get_dataloaders(MIN_FREQ, MAX_DEPTH, seq_len)

    model = LSTM(len(vocab), embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
    # patience=0: halve the LR as soon as an epoch fails to improve.
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)
    clip = 0.25

    # Early-stopping bookkeeping.
    early_stopping_patience = 10
    epochs_without_improvement = 0
    best_valid_loss = float('inf')
    best_model_state_dict = copy.deepcopy(model.state_dict())

    for epoch in tqdm(range(1, n_epochs+1)):
        train(model, train_data, optimizer, criterion,
              batch_size, seq_len, clip, device)
        valid_loss = evaluate(model, valid_data, criterion, batch_size,
                              seq_len, device)

        lr_scheduler.step(valid_loss)

        if valid_loss < best_valid_loss:
            # New best: snapshot the weights and restart the patience window.
            best_valid_loss = valid_loss
            best_model_state_dict = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0
            continue

        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_patience:
            break

    return best_model_state_dict, best_valid_loss


In [None]:
# Train for every hyperparameter combination
# `results` keeps every run (including its weights); `best_models` keeps, per
# (MAX_DEPTH, MIN_FREQ) pair, only the run with the lowest validation loss.
results = []
best_models = {}
global_param_combinations = [
    (max_depth, min_freq)
    for max_depth in MAX_DEPTHS
    for min_freq in MIN_FREQS
]

for MAX_DEPTH, MIN_FREQ in global_param_combinations:
    for hyperparameter in hyperparameters:
        print(f'Hyperparameter combination: MAX_DEPT={MAX_DEPTH}, MIN_FREQ={MIN_FREQ}, {hyperparameter}')
        # NOTE: this rebinds the module-level name `dropout_rate` (a list in
        # the configuration cell) to a single float for the current run.
        embedding_dim, num_layers, dropout_rate = hyperparameter
        # hidden_dim is passed as embedding_dim, as required for tied weights.
        model_state_dict, valid_loss = train_model(
            n_epochs, embedding_dim, embedding_dim, num_layers, dropout_rate,
            tie_weights, MIN_FREQ, MAX_DEPTH, device, batch_size, lr,
        )
        results.append((MAX_DEPTH, MIN_FREQ, hyperparameter, valid_loss, model_state_dict))

        # Replace the stored entry when this is the first run for the pair or
        # it beats the stored validation loss.
        is_first_run = (MAX_DEPTH, MIN_FREQ) not in best_models
        if is_first_run or valid_loss < best_models[(MAX_DEPTH, MIN_FREQ)][1]:
            best_models[(MAX_DEPTH, MIN_FREQ)] = (hyperparameter, valid_loss, model_state_dict)


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 2, 0.2)


  5%|▌         | 10/200 [5:34:15<105:51:03, 2005.60s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 2, 0.4)


  5%|▌         | 10/200 [5:23:37<102:28:47, 1941.72s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 2, 0.6)


  6%|▌         | 11/200 [5:59:07<102:50:31, 1958.90s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 3, 0.2)


  5%|▌         | 10/200 [5:46:11<109:37:30, 2077.11s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 3, 0.4)


  5%|▌         | 10/200 [5:24:34<102:46:48, 1947.41s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 3, 0.6)


  6%|▌         | 11/200 [5:48:52<99:54:11, 1902.92s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 4, 0.2)


  5%|▌         | 10/200 [5:39:12<107:24:52, 2035.23s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 4, 0.4)


  5%|▌         | 10/200 [5:32:04<105:09:23, 1992.44s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (128, 4, 0.6)


  6%|▌         | 11/200 [6:31:14<112:02:11, 2134.03s/it]


Hyperparameter combination: MAX_DEPT=5, MIN_FREQ=3, (256, 2, 0.2)


  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Requires `best_models` to have been populated by the grid-search cell.
results_folder = 'saved_models'

# Create the folder if it doesn't exist. exist_ok=True avoids the separate
# check-then-create step, which can fail if the folder appears in between.
os.makedirs(results_folder, exist_ok=True)

# OPTION 1: SAVE ONLY BEST MODEL FOR EVERY COMBINATION OF GLOBAL PARAMS:

for (MAX_DEPTH, MIN_FREQ), (hyperparameter, valid_loss, model_state_dict) in best_models.items():
    embedding_dim, num_layers, dropout_rate = hyperparameter
    # The filename encodes every setting needed to rebuild the model later;
    # the loading cell parses these fields back out by position and prefix.
    filename = f'model_MD{MAX_DEPTH}_MF{MIN_FREQ}_es{embedding_dim}_nl{num_layers}_dr{dropout_rate}_loss{valid_loss:.6f}.pt'
    filepath = os.path.join(results_folder, filename)
    torch.save(model_state_dict, filepath)

# # OPTION 2: SAVE EVERY MODEL
# for result in results:
#     MAX_DEPTH, MIN_FREQ, hyperparameter, valid_loss, model_state_dict = result
#     embedding_dim, num_layers, dropout_rate = hyperparameter

#     # Save the model with a filename indicating its parameters
#     filename = f'model_MD{MAX_DEPTH}_MF{MIN_FREQ}_es{embedding_dim}_nl{num_layers}_dr{dropout_rate}_loss{valid_loss:.6f}.pt'
#     filepath = os.path.join(results_folder, filename)
#      torch.save(model_state_dict, filepath)

In [None]:
def generate(prompt, MAX_DEPTH, model, custom_tokenizer, vocab, device, seed=None):
    """Return the model's next-token distribution after ``prompt``.

    The prompt is tokenized with ``custom_tokenizer(prompt, MAX_DEPTH)``,
    mapped to indices through ``vocab`` and fed to ``model`` in one pass. The
    logits of the last position have '<eos>' and '<sos>' masked out before the
    softmax, so those two tokens always get probability 0.

    Args:
        prompt: path-like string to condition on.
        MAX_DEPTH: maximum number of prompt segments kept by the tokenizer.
        model: module providing ``init_hidden(batch_size, device)`` and
            returning ``(prediction, hidden)`` when called.
        custom_tokenizer: callable ``(prompt, MAX_DEPTH) -> list of tokens``.
        vocab: vocabulary supporting ``vocab[token]`` and ``get_itos()``.
        device: device on which the input and state tensors are created.
        seed: if given, torch's RNG is re-seeded with it before the forward
            pass.

    Returns:
        A list with one ``(token, probability)`` tuple per vocabulary entry,
        in vocabulary-index order.
    """
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()

    tokens = custom_tokenizer(prompt, MAX_DEPTH)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        src = torch.LongTensor([indices]).to(device)
        prediction, hidden = model(src, hidden)

        # Only the last position predicts the next token.
        last_logits = prediction[:, -1]
        # -inf logits become exactly 0 after the softmax.
        last_logits[:, vocab['<eos>']] = -float('inf')
        last_logits[:, vocab['<sos>']] = -float('inf')

        # Drop the batch dimension (size 1) to get one probability per token.
        probs = torch.softmax(last_logits, dim=-1)[0]

    # Pair every vocabulary entry with its probability. Iterating over the
    # batch dimension instead would yield a single pair.
    return list(zip(vocab.get_itos(), probs.tolist()))


In [None]:
models_folder = 'saved_models'


def _top_predictions(token_probs, k=5):
    """Return the k most probable (token, 'xx.xx%') pairs, highest first."""
    ranked = sorted(token_probs, key=lambda pair: pair[1], reverse=True)[:k]
    return [(tok, f'{prob*100:.2f}%') for tok, prob in ranked]


for file in os.listdir(models_folder):
    # Only checkpoints written by the save cell can be parsed below.
    if not file.endswith('.pt'):
        continue
    model_state_dict = torch.load(os.path.join(models_folder, file), map_location=torch.device(device))

    # Recover the hyperparameters from the filename, which has the form
    # model_MD<..>_MF<..>_es<..>_nl<..>_dr<..>_loss<..>.pt
    model_name_without_extension = file.split('.pt')[0]
    components = model_name_without_extension.split('_')

    model_prefix = components[0]              # leading 'model' tag
    MAX_DEPTH = int(components[1][2:])        # number after 'MD'
    MIN_FREQ = int(components[2][2:])         # number after 'MF'
    # Stored as embedding_dim so the `embedding_size` list defined in the
    # configuration cell is not overwritten with an int.
    embedding_dim = int(components[3][2:])    # number after 'es'
    num_layers = int(components[4][2:])       # number after 'nl'
    dropout_rate = float(components[5][2:])   # number after 'dr'
    loss = float(components[6][4:])           # number after 'loss'

    # The vocabulary must be rebuilt with the settings the model was trained on.
    vocab = create_vocabulary(MIN_FREQ, MAX_DEPTH)
    print(f'MODEL: {file}, MAX_DEPTH: {MAX_DEPTH}, MIN_FREQ: {MIN_FREQ}, embedding_size: {embedding_dim}, num_layers: {num_layers}, dropout_rate: {dropout_rate}, loss: {loss}')
    print(f'vocab_size = {len(vocab)}')
    vocab_size = len(vocab)
    tie_weights = True

    model = LSTM(vocab_size, embedding_dim, embedding_dim, num_layers, dropout_rate, tie_weights).to(device)
    model.load_state_dict(model_state_dict)
    model.eval()

    # Depth 1: most likely first segments after <sos>. generate() returns a
    # list of (token, probability) pairs, which is ranked here.
    prompts = ['<sos>']
    token_probs = generate(prompts[0], MAX_DEPTH, model, custom_tokenizer, vocab, device, 0)
    embedded_tokens_probs = _top_predictions(token_probs)
    print(f'DEPTH 1, starting token {prompts[0]}, predictions (tokens,probs): {embedded_tokens_probs}')

    # Depth 2: extend the prompt with each depth-1 candidate in turn.
    for tok, _ in embedded_tokens_probs:
        url = '<sos>/' + tok
        print("Depth 2, prompt:", url)
        token_probs_loc = generate(url, 30, model, custom_tokenizer, vocab, device, 0)
        embedded_tokens_probs_loc = _top_predictions(token_probs_loc)
        print(f'predictions (tokens,probs): {embedded_tokens_probs_loc}')
    print('\n\n')