In [None]:
from itertools import chain

import typer
from allennlp.data import Vocabulary

import advsber

In [None]:
def create_and_save_vocab(dataset_name: str, discretizer_name: str = "100_quantile") -> None:
    """Build a vocabulary from the train and validation splits and save it to disk.

    Reads ``../data/<dataset_name>/lm/train.jsonl`` and ``valid.jsonl`` with an
    ``advsber.TransactionsDatasetReader``, builds one ``Vocabulary`` over both
    splits together with a fixed set of special tokens, and writes it to
    ``./presets/<dataset_name>/vocabs/<discretizer_name>``.

    Args:
        dataset_name: Directory name of the dataset, used under both
            ``presets/`` and ``../data/`` (e.g. ``"rosbank"``).
        discretizer_name: Name of the discretizer preset handed to the reader;
            the vocabulary is saved under the same name. Defaults to
            ``"100_quantile"``, the value that was previously hard-coded.

    Returns:
        None. The vocabulary is written to disk and the target path is printed.
    """
    # Initialize the reader for the transactions dataset
    reader = advsber.TransactionsDatasetReader(
        f'presets/{dataset_name}/discretizers/{discretizer_name}'
    )

    # Read the train and validation data
    train_data = reader.read(f'../data/{dataset_name}/lm/train.jsonl')
    valid_data = reader.read(f'../data/{dataset_name}/lm/valid.jsonl')

    # Chain the two splits instead of using `+`: this works whether the reader
    # returns lists, dataset objects, or lazy generators, and avoids building
    # a combined copy in memory.
    combined_data = chain(train_data, valid_data)

    # Special tokens that must be in the vocabulary even if absent from the data
    special_tokens = {
        "transactions": ["@@MASK@@", "<START>", "<END>"],
        "amounts": ["<START>", "<END>"]
    }

    # Create the vocabulary from the combined data and the special tokens
    vocab = Vocabulary.from_instances(combined_data, tokens_to_add=special_tokens)

    # Save the vocabulary to disk
    vocab_file_path = f'./presets/{dataset_name}/vocabs/{discretizer_name}'
    vocab.save_to_files(vocab_file_path)

    print(f"Vocabulary for {dataset_name} saved to {vocab_file_path}")

**Run the cell below to create and save the vocabulary for a dataset.**

In [None]:
# Run the function for a dataset.
# The name is substituted into the presets/<name>/ and ../data/<name>/lm/ paths,
# so it must match the directory names used there.
dataset_name = "rosbank"  # Replace with the dataset name
create_and_save_vocab(dataset_name)