# 🔍 Qualitative evaluation

This is an old script that we used to load the embeddings, stored locally with FAISS. Then, for each of them, we retrieced the top-3 abstracts based on cosine similarity on 10 spceific queries. This queries are all taken from the QA dataset; the first 8 are the most dissimilar among themselves in cosine distance, the last two are the longest and the shortest. You can read more about this in the [DOCUMENTATION](./../../../DOCUMENTATION.md). \
The answers are then stored in txt files and are afterwards analysed manually by all three of us with a complex scheme inside [qualitative_evaluation_table.xlsx](./results/qualitative_evaluation_table.xlsx)

#### ⚠️ Disclaimer
Storing all the embeddings on git, when for the final chatbot we only use thnelper/gte-base, would have been very heavy even for [Git Large File Storage](https://git-lfs.com). Therefore we now provide a [link to a shared folder on Google Drive](https://drive.google.com/drive/folders/13lNkKgz5w40YYdsSWG-aqE8E5fdErQ0Z?usp=share_link). If you wanted to replicate this analysis yourselves, you will only have to download all the other embeddings from the faiss-indices folder, and paste them inside [this project's homonimus folder](./../../../data/embeddings/faiss_indices/). There you also find all the generated txt files.

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
import random
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import os
import pprint as pprint


# Set seed for vector database
random.seed(42)
torch.manual_seed(42)

# Check device
has_gpu = torch.cuda.is_available()
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

if has_gpu:
    torch.cuda.manual_seed_all(42)

mps


In [3]:
# Extract gold labels and queries
eval_data = pd.read_csv('../../../data/evaluation_data/retrieval_eval_data/questions_answers.csv')
eval_data.rename(columns={eval_data.columns[0]: 'PMID'}, inplace=True)

gold_pmids = eval_data['PMID'].to_list()
eval_queries = eval_data['QUESTION'].to_list()

In [5]:
# Load embeddings
#model_id = 'BAAI/bge-base-en-v1.5'
#model_id = 'llmrails/ember-v1'
#model_id = 'intfloat/e5-base-v2'
#model_id = 'dmis-lab/biobert-base-cased-v1.1'
#model_id = 'jamesgpt1/sf_model_e5'
model_id = 'thenlper/gte-base'
#model_id = 'sentence-transformers/all-MiniLM-L6-v2'



file_path = model_id.replace("/", "_")

model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(model_name=model_id, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

new_db = FAISS.load_local(f"../../../data/embeddings/faiss_indices/{model_id}", embeddings)

In [6]:
# Select wide range of queries in the eval set

query_embeddings = np.array([embeddings.embed_query(query) for query in eval_data['QUESTION']])

# Compute cosine dissimilarity matrix
similarity_matrix = 1 - cosine_similarity(query_embeddings)

# Fill the diagonal with large negative values to avoid self-selection
np.fill_diagonal(similarity_matrix, -np.inf)

# Find indices of the least similar queries
dissimilarity_sums = similarity_matrix.sum(axis=1)
most_dissimilar_indices = np.argsort(dissimilarity_sums)[-8:]

# Select the most dissimilar queries and their PMIDs
selected_queries_and_pmids = eval_data[['QUESTION', 'PMID']].iloc[most_dissimilar_indices]

# Find the shortest and longest queries by character count and their PMIDs
shortest_query_index = eval_data['QUESTION'].str.len().idxmin()
longest_query_index = eval_data['QUESTION'].str.len().idxmax()
shortest_query_and_pmid = eval_data[['QUESTION', 'PMID']].iloc[shortest_query_index]
longest_query_and_pmid = eval_data[['QUESTION', 'PMID']].iloc[longest_query_index]


In [7]:
# Append the shortest and longest queries with their PMIDs to the selected queries
selected_queries_and_pmids = pd.concat([selected_queries_and_pmids, pd.DataFrame([shortest_query_and_pmid, longest_query_and_pmid])])

# Print the selected queries and PMIDs
pprint.pprint(selected_queries_and_pmids.values.tolist())

[['Is occupational outcome in bipolar disorder predicted by premorbid '
  'functioning and intelligence?',
  23527993],
 ['Is bilateral hearing loss associated with decreased nonverbal intelligence '
  'in US children aged 6 to 16 years?',
  24913183],
 ['Is cognitive ability in early adulthood associated with later suicide and '
  'suicide attempt : the role of risk factors over the life course?',
  22617391],
 ['Does parenting behavior at 2 years predict school-age performance at 7 '
  'years in very preterm children?',
  26616792],
 ['Is excess of runs of homozygosity associated with severe cognitive '
  'impairment in intellectual disability?',
  25232855],
 ['Does randomised trial of early neonatal hydrocortisone demonstrate '
  'potential undesired effects on neurodevelopment at preschool age?',
  26058477],
 ['Is prenatal Micronutrient Supplementation Associated with Intellectual '
  'Development of Young School-Aged Children?',
  26084366],
 ['Is maternal stress during pregnanc

In [8]:
gold_pmids = selected_queries_and_pmids['PMID'].to_list()
selected_queries = selected_queries_and_pmids['QUESTION'].to_list()

# Function to write the evaluation results to a text file
def write_evaluation_results(eval_data, gold_pmids, filename):
    with open(filename, 'w') as file:
        retriever = new_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

        file.write(f"Results for k=3\n")
        file.write("----------\n")

        for query, gold_label in zip(eval_data['QUESTION'], gold_pmids):
            top_k_results = retriever.get_relevant_documents(query)
            retrieved_pmids = [int(result.metadata['PMID']) for result in top_k_results]

            file.write(f"Query: {query}\n\n")
            for idx, result in enumerate(top_k_results):
                correct_mark = "(Correct)" if retrieved_pmids[idx] == gold_label else ""
                file.write(f"  Chunk: {result.page_content}... PMID: {retrieved_pmids[idx]} {correct_mark}\n\n")

            file.write("\n")  # Extra newline for readability
        file.write("\n\n")  # Extra newlines to separate sections for different k values


filename = f"retrieval_qualitative_evaluation_{file_path}.txt"
write_evaluation_results(selected_queries_and_pmids, gold_pmids, filename)
