# Responsible Prompting

## Recipe: Populate embeddings


### Imports

In [250]:
import os
import os.path
from dotenv import load_dotenv

import re
import requests
import json
import warnings
import math
import numpy as np
import pandas as pd

### Loading the Hugging Face token and API URL from the .env file

In [251]:
# Pull the variables declared in the local .env file into the process
# environment, then read the Hugging Face settings from there.
# Either value is None when the variable is not set.
load_dotenv()
HF_TOKEN, HF_URL = ( os.environ.get( name ) for name in ( 'HF_TOKEN', 'HF_URL' ) )

In [252]:
# Sanity check: show the endpoint that was read from the environment
# (the token is deliberately not printed)
print( "HF URL: {}".format( HF_URL ) )

HF URL: https://api-inference.huggingface.co/pipeline/feature-extraction/


### Sentence transformer model IDs (from Hugging Face)

In [253]:
# Hugging Face model ids to generate embeddings for.
# Each id is interpolated into the inference API request URL and also
# determines the name of the output JSON file.
# To support another model, add (or uncomment) its id below.
#
# Currently disabled:
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "ibm-granite/granite-embedding-30m-english"
model_ids = [
    "BAAI/bge-large-en-v1.5",
    "intfloat/multilingual-e5-large",
]

### Functions

In [254]:
# Converts model_id into filenames
def model_id_to_filename( model_id ):
    """Returns the lowercase file name suffix for a Hugging Face model id.

    For ids in the usual `organization/model` form the organization prefix is
    dropped (e.g. "BAAI/bge-large-en-v1.5" -> "bge-large-en-v1.5").
    Ids without an organization prefix (e.g. "gpt2") are returned lowercased
    instead of raising an IndexError.
    """
    parts = model_id.split( '/' )
    # Second segment when there is a prefix, otherwise the id itself
    name = parts[1] if len( parts ) > 1 else parts[0]
    return name.lower()

# Requests embeddings for a given sentence (or list of sentences)
def query( texts, model_id, timeout = 120 ):
    """Requests an embedding from the Hugging Face inference API.

    Parameters:
        texts: a sentence (str) or a list of sentences. It is sent unchanged
            as the `inputs` field of the request.
        model_id: Hugging Face model id, used to build the request URL.
        timeout: seconds to wait for the API before `requests` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The first embedding found in the response, as a flat list of numbers
        (nested list wrappers such as [[[embedding]]] are unpacked), or a dict
        containing an 'error' key when the API reports an error or sends an
        unexpected body. Callers test for failure with `'error' in result`.

    The bearer token is read from the module-level HF_TOKEN.
    """
    # A bare string would otherwise be iterated character by character,
    # which makes the word count check below meaningless
    sentences = [ texts ] if isinstance( texts, str ) else texts

    # Warning in case of prompts longer than 256 words
    for t in sentences :
        n_words = len( re.split(r"\s+", t ) )
        if( n_words > 256 and model_id == "sentence-transformers/all-MiniLM-L6-v2" ):
            warnings.warn( "Warning: Sentence provided is longer than 256 words. Model all-MiniLM-L6-v2 expects sentences up to 256 words." )
            warnings.warn( "Word count: {}".format( n_words ) )

    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
    print( "Request url: " + api_url )
    response = requests.post(api_url, headers=headers, json={"inputs": texts }, timeout=timeout)

    # Debugging API HTTP responses
    # print(f'Status Code: {response.status_code}')
    # print('Headers:')
    # for key, value in response.headers.items():
    #     print(f'  {key}: {value}')
    # print('Content:')
    # print(response.text)

    try:
        out = response.json()
    except ValueError:
        # Body is not JSON (e.g. an HTML error page): report it using the
        # same error convention as the API itself
        return { 'error': f"Non-JSON response from API (HTTP {response.status_code})" }

    # API errors come back as a JSON object; any other object is unexpected
    if( isinstance( out, dict ) ):
        if( 'error' not in out ):
            out = { 'error': 'Unexpected API response', 'response': out }
        return out

    # making sure that different transformers retrieve the embedding:
    # unpacking json responses in the form of [[[embedding]]] by descending
    # while the first element is itself a list. This does not depend on the
    # embedding size, so models with fewer than 384 dimensions work as well.
    while( isinstance( out, list ) and len( out ) > 0 and isinstance( out[0], list ) ):
        out = out[0]
    return out

# Returns the centroid for a given value
def get_centroid( v, dimension = 384, k = 10 ):
    """Computes the component-wise mean of the prompt embeddings of a value.

    Parameters:
        v: dict with a 'prompts' list; each prompt is a dict holding an
            'embedding' list of numbers.
        dimension: length of the returned centroid. Embeddings shorter than
            this only contribute to their leading components; an embedding
            longer than this raises IndexError.
        k: unused; kept so existing calls that pass it keep working.

    Returns:
        A list of `dimension` floats. When the value has no prompts, a vector
        of zeros is returned instead of raising ZeroDivisionError.
    """
    centroid = [0.0] * dimension
    count = 0
    for p in v['prompts']:
        for i, component in enumerate( p['embedding'] ):
            centroid[i] += component
        count += 1

    # Nothing to average: keep the zero vector
    if( count == 0 ):
        return centroid

    return [ total / count for total in centroid ]

### Populating JSON files

In [255]:
# JSON folder
json_folder = '../prompt-sentences-main/'

# INPUT FILE
# Default file with empty embeddings
json_in_file = json_folder + 'prompt_sentences.json'

# Both lists of values share the same structure and get the same treatment
value_groups = ( 'positive_values', 'negative_values' )

# Loads a JSON file, making sure the file handle is closed afterwards
def _read_json( path ):
    with open( path ) as infile:
        return json.load( infile )

# Failing early with a clear message instead of a NameError further down
if( not os.path.isfile( json_in_file ) ):
    raise FileNotFoundError( f"Input file not found: {json_in_file}" )

for model_id in model_ids:
    # Fresh copy of the input for every model. The structure is filled in
    # place below, so reusing it would carry the embeddings of the previous
    # model into the file of the next one.
    prompt_json_in = _read_json( json_in_file )

    # OUTPUT FILE
    json_out_file_suffix = model_id_to_filename( model_id )
    json_out_file = f"{json_folder}prompt_sentences-{json_out_file_suffix}.json"

    # Trying to open the files first
    if( os.path.isfile( json_out_file ) ):
        prompt_json_out = _read_json( json_out_file )
        print( 'Opening existing file: ', json_out_file )

        # API request test
        api_test_response = query( ['testing API endpoint'], model_id )
        if( 'error' in api_test_response ):
            # An error payload is not an embedding, so its length means nothing
            warnings.warn( f"API test request failed: {api_test_response}" )
        else:
            api_response_dimensions = len( api_test_response )
            print( f"Dimensions from hugging face API response: {api_response_dimensions}" )
            json_file_dimensions = len( prompt_json_out['positive_values'][0]['prompts'][0]['embedding'] )
            print( f"Dimensions from json file: {json_file_dimensions}" )
            if( api_response_dimensions != json_file_dimensions ):
                warnings.warn( f"Dimensions are different: API={api_response_dimensions} while JSON sentences file={json_file_dimensions}" )
    else:
        # Creating an empty file for new transformer
        with open( json_out_file, 'w') as outfile:
            json.dump( prompt_json_in, outfile)
        prompt_json_out = _read_json( json_out_file )
        print( 'Creating a new file: ', json_out_file )

    ############################
    # Generate a new output file using the hashmap as auxiliary table hosting old and new/changed embeddings
    ############################

    # Create a hashmap keyed by the prompt text, holding the already populated embedding
    prompts_embeddings = {}
    new_prompts = 0
    old_prompts = 0
    errors = 0
    successes = 0

    # Using the output json with the prompts and embeddings
    for group in value_groups:
        for v in prompt_json_out[group]:
            for p in v['prompts']:
                if( p['embedding'] != [] ):
                    prompts_embeddings[ p['text'] ] = p['embedding']

    # Iterate over the input json, potentially with new/changed sentences,
    # looking only for new/changed prompts that require the API request for embeddings
    for group in value_groups:
        for v in prompt_json_in[group]:
            for p in v['prompts']:
                if( p['text'] in prompts_embeddings ):
                    # Prompt found, no need to request embeddings
                    p['embedding'] = prompts_embeddings[ p['text'] ]
                    old_prompts += 1
                else:
                    # Requesting embedding for new/changed prompt.
                    # query() needs the model id, and takes a list of sentences
                    # (same form as the API request test above).
                    embedding = query( [ p['text'] ], model_id )
                    if( 'error' in embedding ):
                        errors += 1
                    else:
                        # Add the new/changed prompt to the hashmap
                        prompts_embeddings[ p['text'] ] = embedding

                        # Using the new hash
                        p['embedding'] = prompts_embeddings[ p['text'] ]
                        successes += 1
                    new_prompts += 1

    print( 'Old prompts: ', old_prompts )
    print( 'New prompts: ', new_prompts )
    print( 'Errors: ', errors )
    print( 'Successes: ', successes )

    # After all the embeddings are populated (with no errors), compute the centroids for each value
    if( errors == 0 ):
        print( 'Updating centroids.' )
        # Taking the dimension from the embeddings themselves: the value read
        # from an existing output file is not available for newly created files
        embedding_dimensions = max( ( len( e ) for e in prompts_embeddings.values() ), default = 0 )
        for group in value_groups:
            for v in prompt_json_in[group]:
                v['centroid'] = get_centroid( v, embedding_dimensions, 10 )

    # Saving the embeddings for a specific LLM
    with open( json_out_file, 'w') as outfile:
        print( 'Saving into file: ', json_out_file )
        json.dump( prompt_json_in, outfile)
        print( '\n' )
        print( '\n' )

Opening existing file:  ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json
Request url: https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5
Dimensions from hugging face API response: 1024
Dimensions from json file: 1024
Old prompts:  2194
New prompts:  0
Errors:  0
Successes:  0
Updating centroids.
Saving into file:  ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json


Opening existing file:  ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json
Request url: https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large
Dimensions from hugging face API response: 1024
Dimensions from json file: 1024
Old prompts:  2194
New prompts:  0
Errors:  0
Successes:  0
Updating centroids.
Saving into file:  ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json


