In [5]:
import glob
import json
import os

import torch
from transformers import AutoTokenizer


def postprocess_and_save_experiences(input_folder_path, model_id):
    """
    Decode every saved .pt file in a folder and write the text next to it as JSON.

    For each ``<name>.pt`` file found directly inside ``input_folder_path``, the
    file is loaded with ``torch.load``, decoded with the tokenizer's
    ``batch_decode`` (special tokens skipped), and the decoded result is written
    to ``<name>.json`` in the same folder. After each file a running count of
    processed files is printed.

    If the folder contains no .pt files, a message is printed and the function
    returns without loading the tokenizer.

    :param input_folder_path: The path to the folder containing the .pt files.
    :param model_id: The model id passed to ``AutoTokenizer.from_pretrained``.
    :return: None. Results are written to disk.
    """
    # Sorted so files are processed in a reproducible order
    # (glob returns matches in arbitrary order).
    file_paths = sorted(glob.glob(os.path.join(input_folder_path, '*.pt')))

    # A wrong folder path looks exactly like an empty folder, so report it
    # instead of finishing silently, and skip the tokenizer load.
    if not file_paths:
        print(f"No .pt files found in: {input_folder_path}")
        return

    # Load the tokenizer once; it is reused for every file
    model_tokenizer = AutoTokenizer.from_pretrained(model_id)

    for counter, file_path in enumerate(file_paths, start=1):
        # NOTE(review): torch.load is pickle-based and can execute code stored
        # in the file -- only point this function at trusted .pt files.
        # Presumably each file holds a batch of token-id sequences -- TODO confirm.
        tensor = torch.load(file_path)

        # Decode the token ids back to text, dropping special tokens
        decoded_tensor = model_tokenizer.batch_decode(tensor, skip_special_tokens=True)

        # Same base name as the input file, with a .json extension
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        output_file_path = os.path.join(input_folder_path, file_name + '.json')

        # Save decoded result as JSON
        with open(output_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(decoded_tensor, json_file)

        # Print the number of files processed so far
        print(counter)

In [None]:
import os
import dotenv


# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Run the decoding function on the 'outputs' sub-folder.
# os.path.join (rather than string concatenation) keeps the path correct
# whether or not TAGS_FOLDER_PATH ends with a path separator.
postprocess_and_save_experiences(
    os.path.join(os.getenv("TAGS_FOLDER_PATH"), 'outputs'),
    os.getenv("LLAMA_3_PATH")
)

In [3]:
import os
import glob
import json
import torch
from transformers import AutoTokenizer


def postprocess_and_save_experiences_full(prompt_folder_path, response_folder_path, model_id):
    """
    Decode paired prompt/response .pt files and save each pair as one JSON file.

    For every ``<name>.pt`` in ``prompt_folder_path``, a file with the same name
    is looked up in ``response_folder_path``. When it exists, both files are
    loaded with ``torch.load``, decoded with the tokenizer's ``batch_decode``
    (special tokens skipped), and written to ``<name>.json`` in
    ``response_folder_path`` as ``{"prompt": ..., "response": ...}``. When it
    does not exist, a message is printed and the prompt is skipped.

    If the prompt folder contains no .pt files, a message is printed and the
    function returns without loading the tokenizer.

    :param prompt_folder_path: The path to the folder containing the prompt .pt files.
    :param response_folder_path: The path to the folder containing the response .pt files.
    :param model_id: The model name of which tokenizer to use for decoding the files.
    :return: None. Results are written to disk.
    """
    # Sorted so files are processed in a reproducible order
    # (glob returns matches in arbitrary order).
    prompt_file_paths = sorted(glob.glob(os.path.join(prompt_folder_path, '*.pt')))

    # A wrong folder path looks exactly like an empty folder, so report it
    # instead of finishing silently, and skip the tokenizer load.
    if not prompt_file_paths:
        print(f"No prompt .pt files found in: {prompt_folder_path}")
        return

    # Load the tokenizer once; it is reused for every file
    model_tokenizer = AutoTokenizer.from_pretrained(model_id)

    for prompt_file_path in prompt_file_paths:
        # The response file shares the prompt file's base name
        file_name = os.path.splitext(os.path.basename(prompt_file_path))[0]
        response_file_path = os.path.join(response_folder_path, file_name + '.pt')

        # Check for the response first so an unmatched prompt is not
        # loaded and decoded for nothing.
        if not os.path.exists(response_file_path):
            print(f"Response file not found for prompt: {file_name}")
            continue

        # NOTE(review): torch.load is pickle-based and can execute code stored
        # in the file -- only point this function at trusted .pt files.
        # Presumably each file holds a batch of token-id sequences -- TODO confirm.
        prompt_tensor = torch.load(prompt_file_path)
        response_tensor = torch.load(response_file_path)

        # Decode the token ids back to text, dropping special tokens
        decoded_prompt = model_tokenizer.batch_decode(prompt_tensor, skip_special_tokens=True)
        decoded_response = model_tokenizer.batch_decode(response_tensor, skip_special_tokens=True)

        # Keep the prompt and its response together in one record
        experience = {
            'prompt': decoded_prompt,
            'response': decoded_response
        }

        # The JSON file is written next to the response .pt file
        output_file_path = os.path.join(response_folder_path, file_name + '.json')

        # Save the prompt and response as JSON
        with open(output_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(experience, json_file)

In [4]:
import os
import dotenv


# Read settings from whichever .env file find_dotenv() locates
dotenv.load_dotenv(dotenv.find_dotenv())

# Decode the prompt/response pairs stored under the tags folder
postprocess_and_save_experiences_full(
    prompt_folder_path=os.path.join(os.getenv("TAGS_FOLDER_PATH"), 'inputs/'),
    response_folder_path=os.path.join(os.getenv("TAGS_FOLDER_PATH"), 'outputs/'),
    model_id=os.getenv("LLAMA_3_PATH"),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
