In [1]:
import pandas as pd
import ast

# Workbook holding the policy texts (sheet 0), the evaluation prompts (sheet 1)
# and the label lists used as ground truth (sheet 2).
excel_path = 'Policy.xlsx'

# Read the three sheets in positional order, one DataFrame per sheet.
policy_df, prompt_df, label_df = (
    pd.read_excel(excel_path, sheet_name=sheet_no, engine='openpyxl')
    for sheet_no in range(3)
)

# Plain Python list of the policy texts, addressed by row position.
policy_scripts = policy_df['Policy Scripts'].tolist()
def parse_labels(label_str):
    """
    Convert the textual form of a label list (e.g. "[1, 4, 7]") into a Python object.

    Args:
    label_str: Cell value holding a Python literal, normally the string form of a list.

    Returns:
    The evaluated literal, or an empty list when the value cannot be parsed.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed.
        return ast.literal_eval(label_str)
    except (ValueError, SyntaxError, TypeError):
        # ValueError: not a literal (including non-string cells such as NaN);
        # SyntaxError: malformed text such as an unclosed bracket;
        # TypeError: unsupported operand combinations inside the literal.
        return []

# Parse every entry of the 'label list' column and keep the result in 'True'
label_df['True'] = label_df['label list'].map(parse_labels)


In [2]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Gemma 2B tokenizer.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook. The token that used to be
# hardcoded here has been exposed and should be revoked.
import os

# None is a valid value: from_pretrained then falls back to whatever
# credentials are already configured for the Hugging Face client.
Token = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=Token)
#model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it",token=Token,
#    trust_remote_code=True, device_map="cpu", torch_dtype=torch.float16)
from transformers import LongformerTokenizer, LongformerModel

#tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
#model = LongformerModel.from_pretrained('allenai/longformer-base-4096')


In [4]:
%pip install ollama



Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import ollama
# One nomic-embed-text vector per policy text, in the same order as policy_scripts.
embedding = [
    ollama.embeddings(model='nomic-embed-text', prompt=policy_scripts[row])['embedding']
    for row in range(len(policy_df))
]
    


In [4]:
import faiss
import numpy as np
# Stack the document vectors into a float32 matrix (one row per policy text)
# and load them into an exact L2-distance FAISS index.
doc_embeddings = np.asarray(embedding, dtype='float32')
dimension = doc_embeddings.shape[1]  # width of a single embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

In [5]:
# Sample query
query = prompt_df['Prompt'][5]

# Embed the query with the same Ollama model used for the documents
# (nomic-embed-text) so that it can be compared against the index.
query_embedding = ollama.embeddings(model='nomic-embed-text', prompt=query)['embedding']
# FAISS works on 2-D float32 arrays; reshape(1, -1) keeps the cell independent
# of the embedding width instead of hardcoding it.
query_array = np.array(query_embedding, dtype='float32')
query_embedding = query_array.reshape(1, -1)

k = 10

# Perform similarity search
_, indices = index.search(query_embedding, k)
# FAISS pads the result with -1 when fewer than k vectors are indexed; drop
# those so a padding slot is never read as policy_scripts[-1].
similar_documents_indices = [doc_idx for doc_idx in indices.flatten().tolist() if doc_idx >= 0]

# Get similar documents
similar_documents = [policy_scripts[i] for i in similar_documents_indices]

# Print the query followed by the beginning of each retrieved document
print(query)
for idx in similar_documents_indices:
    print(f"\n### Retrieved Document {idx}:\n{policy_scripts[idx][:166]}" )


: 

In [7]:

def evaluate_retrieval(search_result, true_labels):
    """
    Score one retrieval result against its ground-truth labels.

    Args:
    search_result (iterable): The labels retrieved by the search model.
    true_labels (iterable): The correct labels listed in the Excel file.

    Returns:
    tuple: (full_match, coverage) where full_match is True if every true label
    appears in search_result, and coverage is the fraction of true labels that
    were retrieved (0.0 to 1.0). When true_labels is empty there is nothing to
    miss, so the result is (True, 1.0) rather than a division by zero.
    """
    # Force both inputs to sets so duplicates and ordering do not matter
    y_pred = set(search_result)
    y_true = set(true_labels)

    # An empty label set is vacuously covered (consistent with issubset)
    if not y_true:
        return True, 1.0

    # True labels that were actually retrieved
    correct = y_true & y_pred
    coverage = len(correct) / len(y_true)

    return y_true.issubset(y_pred), coverage

def k_top_search(upper_thres, tokenizer, index, prompt_df, label_df, lower_thres = 5):
    """
    Sweep the number of retrieved documents k over lower_thres..upper_thres
    (inclusive) and report retrieval quality and prompt size for each k.

    For every k and every query in prompt_df['Prompt'], the k nearest documents
    are retrieved from index, the query is concatenated with the retrieved
    policy texts, and the tokens of that combined text are counted. The
    retrieved indices are scored against label_df['True'] with
    evaluate_retrieval. Per-k results are printed as they are computed.

    Relies on the notebook-level names ollama, np, policy_scripts and
    evaluate_retrieval.

    Args:
    upper_thres (int): Number of top searches upper limit (inclusive)
    tokenizer: object with a tokenize(str) method returning a sequence of tokens
    index: faiss object after index addition
    prompt_df: pd df, must contain 'Prompt' column for queries
    label_df: pd df, must contain 'True' column for true labels, row-aligned with prompt_df
    lower_thres (int): Number of top searches lower limit (5 by default)

    Returns:
    tuple: (accuracies, coverages, num_tokens), three lists with one entry per
    evaluated k: fraction of queries whose true labels were all retrieved,
    mean label coverage, and mean token count of query plus retrieved docs.

    Raises:
    ValueError: if prompt_df has no prompts or label_df has fewer rows than prompt_df.

    Side effects:
    Overwrites prompt_df['TopIndices'] with the indices retrieved for the most
    recently evaluated k.
    """
    # lists for plots
    accuracies = []
    coverages = []
    num_tokens = []

    k_values = range(lower_thres, upper_thres + 1)
    if not k_values:
        # Nothing to evaluate; avoid embedding the queries for no reason
        return accuracies, coverages, num_tokens

    queries = list(prompt_df['Prompt'])
    true_labels = list(label_df['True'])
    if not queries:
        raise ValueError("prompt_df contains no prompts to evaluate")
    if len(true_labels) < len(queries):
        raise ValueError("label_df has fewer rows than prompt_df")

    # A query embedding does not depend on k, so compute each one once instead
    # of once per k. FAISS works on 2-D float32 arrays; reshape(1, -1) avoids
    # hardcoding the embedding width.
    query_vectors = [
        np.asarray(
            ollama.embeddings(model='nomic-embed-text', prompt=query)['embedding'],
            dtype='float32',
        ).reshape(1, -1)
        for query in queries
    ]

    for k in k_values:
        indices_list = []
        token_count = 0

        for query, query_vector in zip(queries, query_vectors):
            _, indices = index.search(query_vector, k)
            # FAISS pads with -1 when fewer than k vectors are indexed; drop
            # those so a padding slot is never read as policy_scripts[-1].
            retrieved = [doc_idx for doc_idx in indices.flatten().tolist() if doc_idx >= 0]
            indices_list.append(retrieved)

            # combining the full query with full searched docs
            combined_query = ' '.join([query] + [policy_scripts[doc_idx] for doc_idx in retrieved])

            # accumulate the number of tokens of the combined text
            token_count += len(tokenizer.tokenize(combined_query))

        # Adding search results for further check
        prompt_df['TopIndices'] = indices_list

        # evaluate, pairing rows by position
        accu_count = 0
        accu_cover_count = 0
        for retrieved, labels in zip(indices_list, true_labels):
            full_match, cover = evaluate_retrieval(retrieved, labels)
            accu_count += int(full_match)
            accu_cover_count += cover

        accuracy_1 = accu_count/len(queries)
        coverage_1 = accu_cover_count/len(queries)
        print(f"For top {k} searches:\nAccuracy of search results containing all correct labels: {accuracy_1 * 100},\n Average coverage of correct labels: {coverage_1 * 100}")

        token_1 = token_count/len(queries)
        print(f"Average tokens combining the query and retrieved docs: {token_1}")

        # Add to the lists
        accuracies.append(accuracy_1)
        coverages.append(coverage_1)
        num_tokens.append(token_1)

    return accuracies, coverages, num_tokens


In [8]:

# Largest k of the sweep; the smallest k is k_top_search's default lower bound.
upper_search = 20

# a: full-match accuracy, b: average coverage, c: average token count (one entry per k)
a, b, c = k_top_search(
    upper_thres=upper_search,
    tokenizer=tokenizer,
    index=index,
    prompt_df=prompt_df,
    label_df=label_df,
)


: 

In [None]:
import matplotlib.pyplot as plt

# Plot the results of the k sweep: a = full-match accuracy, b = average
# coverage, c = average token count, one entry per evaluated k.
import os

# Derive the evaluated k values from the result length instead of repeating
# the sweep's lower bound as a magic number.
k_values = range(upper_search - len(a) + 1, upper_search + 1)

# savefig does not create missing directories
os.makedirs('images', exist_ok=True)

# Plot acc and cover. The sweep returns fractions; scale them to match the
# 'Percentage' axis label.
plt.figure(figsize=(8, 6))
plt.plot(k_values, [100 * acc for acc in a], label='Accuracy (Fully Match)')
plt.plot(k_values, [100 * cov for cov in b], label='Average Coverage')
plt.xlabel('Num of Searched Results')
plt.ylabel('Percentage')
plt.title('Fully Match Accuracy and Average Coverage vs. Num of Searched Results Included')
plt.legend()
plt.savefig('images/acc_2.png')  # Save the plot to a specific path
#plt.close()  # Close the figure to release memory


# Plot number of tokens (values are per-query averages, not totals)
plt.figure(figsize=(8, 6))
plt.plot(k_values, c, color='green')
plt.xlabel('Num of Searched Results')
plt.ylabel('Average Number of Tokens per Query')
plt.title('Number of Tokens vs. k')
plt.savefig('images/num_tokens_2.png')  # Save the plot to a specific path
#plt.close()  # Close the figure to release memory
# Retrieve the top-k documents for every prompt using the notebook-level k,
# and count the tokens of each query combined with its retrieved documents.
indices_list = []
token_count = 0
for query in prompt_df['Prompt']:

    # Embed with the same model the index was built from. The previous
    # vectorizer.transform call referenced a name this notebook never defines.
    # FAISS works on 2-D float32 arrays, hence the dtype and reshape.
    query_vector = np.array(
        ollama.embeddings(model='nomic-embed-text', prompt=query)['embedding'],
        dtype='float32',
    ).reshape(1, -1)
    _, indices = index.search(query_vector, k)
    # FAISS pads with -1 when fewer than k vectors are indexed; drop those so
    # a padding slot is never read as policy_scripts[-1].
    retrieved = [doc_idx for doc_idx in indices.flatten().tolist() if doc_idx >= 0]
    indices_list.append(retrieved)

    combined_query = query
    for temp in retrieved:
        combined_query = combined_query + ' ' + policy_scripts[temp]

    # tokenize
    tokens = tokenizer.tokenize(combined_query)

    # accumulate the number of tokens
    token_count += len(tokens)

# Adding search results for further check. The column name is kept for the
# cells that read it, although it holds the top-k indices, not only the top 3.
prompt_df['Top3Indices'] = indices_list

# Write the prompts and their retrieved indices to the 'Eval_result1' sheet of
# the same workbook (append mode; an existing sheet of that name is replaced).
with pd.ExcelWriter(
    excel_path,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    prompt_df.loc[:, ['Prompt', 'Top3Indices']].to_excel(writer, sheet_name='Eval_result1')

# Score every prompt's retrieved indices against its true labels.
# Each entry is a (full_match, coverage) pair from evaluate_retrieval.
row_scores = [
    evaluate_retrieval(prompt_df['Top3Indices'][row], label_df['True'][row])
    for row in range(len(prompt_df))
]
accu_count = sum(int(full_match) for full_match, _ in row_scores)
accu_cover_count = sum(row_cover for _, row_cover in row_scores)

# Average over all prompts
n_prompts = len(prompt_df)
accuracy_1 = accu_count/n_prompts
coverage_1 = accu_cover_count/n_prompts
print(f"Accuracy of search results containing all correct labels: {accuracy_1 * 100},\n Average coverage of correct labels: {coverage_1 * 100}")

token_1 = token_count/n_prompts
print(f"Average tokens combining the query and retrieved docs: {token_1}")
# make a Query
query_text = "Does GNEI provide travel insurance? Receipts and prior approval required"
# Embed with the same model the index was built from. The previous
# vectorizer.transform call referenced a name this notebook never defines.
# FAISS works on 2-D float32 arrays, hence the dtype and reshape.
query_vector = np.array(
    ollama.embeddings(model='nomic-embed-text', prompt=query_text)['embedding'],
    dtype='float32',
).reshape(1, -1)

# Searching the index
k = 10  # Number of nearest neighbors
distances, indices = index.search(query_vector, k)

print("Distances:", distances.flatten())
print("Indices:", indices.flatten())

indices[0]
import matplotlib.pyplot as plt

# Retrieve and plot the distances.
# FAISS pads with -1 when fewer than k vectors are indexed; keep real hits only.
hit_mask = indices[0] >= 0
retrieved_ids = indices[0][hit_mask]
retrieved_distances = distances[0][hit_mask]

plt.figure(figsize=(10, 6))
# Keep each distance paired with its own document: sorting the indices alone
# would attach distances to the wrong documents. String labels give one evenly
# spaced bar per document, in retrieval order.
plt.bar([str(doc_idx) for doc_idx in retrieved_ids], retrieved_distances)
plt.title('FAISS Retrieval Distances')
plt.xlabel('Index of Retrieved Documents')
plt.ylabel('Distance')
plt.show()

# Print the beginning of each retrieved document
for idx in retrieved_ids:
    print(f"\n### Retrieved Document {idx}:\n{policy_scripts[idx][:100]}" )
