In [1]:
# Install the notebook's third-party dependencies into the running kernel's environment.
# NOTE(review): none of these installs pin a version, so a fresh run may resolve
# different releases than the ones this notebook was developed against — consider pinning.
%pip install langchain
%pip install transformers
%pip install sentence-transformers
#%pip install bitsandbytes
%pip install pandas
#%pip install python-dotenv
# -U upgrades these packages if they are already installed; faiss-gpu backs the FAISS
# vector store and tiktoken backs the token-based text splitter used further down.
%pip install -U langchain-community faiss-gpu langchain-openai tiktoken

Collecting langchain
  Downloading langchain-0.1.4-py3-none-any.whl (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/803.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m512.0/803.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.6/803.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.14 (from

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import json


In [3]:
# Mount Google Drive into the Colab filesystem (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the Drive root so that the relative file paths
# used by later cells (CSV inputs, saved plots) resolve against it.
%cd /content/drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
# Set seeds for every random number generator the notebook imports so that
# repeated runs build the vector database from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check device.
# `is_built()` only reports whether this PyTorch build was compiled with MPS
# support; `is_available()` additionally checks that an MPS device can actually
# be used, which is what device selection needs.
has_gpu = torch.cuda.is_available()
has_mps = torch.backends.mps.is_available()
device = "mps" if has_mps else "cuda" if has_gpu else "cpu"
print(device)

if has_gpu:
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(SEED)



# Load data
# Both CSV parts share the same layout: the listed columns become document
# metadata, and each part gets its own loader before the two are merged.
loader_1, loader_2 = (
    CSVLoader(
        file_path=csv_path,
        metadata_columns=['PMID', 'Title', 'Authors', 'Publication Date', 'DOI'],
    )
    for csv_path in ('processed_data_part1.csv', 'processed_data_part2.csv')
)

loader_all = MergedDataLoader(loaders=[loader_1, loader_2])
docs_all = loader_all.load()

# Evaluation set: whatever the first column is called in the file, expose it as 'PMID'.
eval_data = pd.read_csv('questions_answers.csv').pipe(
    lambda frame: frame.rename(columns={frame.columns[0]: 'PMID'})
)


# Evaluate
def evaluate_and_plot(model_name, tokenizer, model, plot_title, file_name, max_k=20):
    """Measure top-k retrieval accuracy of an embedding model and plot it.

    The documents in the module-level ``docs_all`` are split into chunks,
    embedded with ``model_name`` and indexed in a FAISS vector store. Every
    question in the module-level ``eval_data`` is then used as a query, and a
    query counts as a match at ``k`` when its gold PMID appears among the PMIDs
    of the first ``k`` retrieved chunks.

    Args:
        model_name: Hugging Face model id passed to ``HuggingFaceEmbeddings``.
        tokenizer: Accepted for call compatibility; not used by this function
            (chunk sizes are measured by the tiktoken-based splitter).
        model: Accepted for call compatibility; not used by this function
            (``HuggingFaceEmbeddings`` loads the model from ``model_name``).
        plot_title: Title of the accuracy plot.
        file_name: Path the plot is saved to. Missing parent directories are
            created.
        max_k: Largest ``k`` to evaluate; accuracies are computed for
            ``k = 1 .. max_k``. Defaults to 20.

    Returns:
        list[float]: Accuracy for each ``k`` from 1 to ``max_k``.

    Raises:
        ValueError: If ``eval_data`` contains no questions.
    """
    # Define embedding model
    embed_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )

    # Apply text splitting into chunks to prevent truncation of longer abstracts
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=400,
                                                                         chunk_overlap=100,)
    chunked_docs = text_splitter.split_documents(docs_all)

    # Set up Faiss vector database
    db = FAISS.from_documents(chunked_docs, embedding=embed_model)

    # Extract gold labels and queries
    gold_pmids = eval_data['PMID'].to_list()
    eval_queries = eval_data['QUESTION'].to_list()
    if not eval_queries:
        raise ValueError("eval_data contains no questions to evaluate")

    # Retrieve the top `max_k` chunks once per query instead of re-running every
    # query for each k: the top-k results of a similarity search are the first k
    # entries of the top-`max_k` results (assuming no exact score ties at the
    # cut-off), so one search per query is enough.
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": max_k})
    first_hit_counts = [0] * (max_k + 1)  # index r: queries whose gold PMID first appears at rank r

    for query, gold_label in zip(eval_queries, gold_pmids):
        top_results = retriever.get_relevant_documents(query)
        retrieved_pmids = [int(result.metadata['PMID']) for result in top_results]

        if gold_label in retrieved_pmids:
            first_hit_counts[retrieved_pmids.index(gold_label) + 1] += 1

    # A query matches at k when its first hit is at rank <= k, so accuracy@k is
    # the running total of first-hit counts divided by the number of queries.
    ks = range(1, max_k + 1)
    accuracies = []
    matches = 0
    for k in ks:
        matches += first_hit_counts[k]
        accuracies.append(matches / len(eval_queries))

    # Plotting the results on a dedicated figure so that successive calls (one
    # per model) do not draw on top of each other.
    fig, ax = plt.subplots()
    ax.plot(ks, accuracies, marker='o')
    ax.set_xlabel('k (Number of Top Results Considered)')
    ax.set_ylabel('Accuracy')
    ax.set_title(plot_title)
    ax.set_xticks(ks)

    # Model ids such as "org/name" put a directory component into file names
    # derived from them; make sure that directory exists before saving.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(file_name)

    plt.show()
    plt.close(fig)

    return accuracies


# List of models to evaluate
# Each entry supplies the arguments of one `evaluate_and_plot` call:
# 'model_name', 'tokenizer', 'model', 'plot_title' and 'file_name'.
# Commented-out entries are models from earlier runs.
models_to_evaluate = [
    # {
    #     'model_name': 'Muennighoff/SGPT-125M-weightedmean-nli-bitfit',          # was too large
    #     'tokenizer': AutoTokenizer.from_pretrained("Muennighoff/SGPT-125M-weightedmean-nli-bitfit"),
    #     'model': SentenceTransformer("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit"),
    #     'plot_title': 'Retriever Model Accuracy with SGPT-125M',
    #     'file_name': 'retriever_accuracy_sgpt_msmarco.png'
    # },
    # {
    #     'model_name': 'dmis-lab/biobert-base-cased-v1.1',
    #     'tokenizer': AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1"),
    #     'model': AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1"),
    #     'plot_title': 'Retriever Model Accuracy with BioBERT',
    #     'file_name': 'retriever_accuracy_biobert.png'
    # },
    # {
    #     'model_name': 'intfloat/e5-base-v2',
    #     'tokenizer': AutoTokenizer.from_pretrained("intfloat/e5-base-v2"),
    #     'model': SentenceTransformer("intfloat/e5-base-v2"),
    #     'plot_title': 'Retriever Model Accuracy with e5-base-v2',
    #     'file_name': 'retriever_accuracy_e5-base-v2.png'
    # },
    # {
    #     'model_name': 'BAAI/bge-base-en-v1.5',
    #     'tokenizer': AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5"),
    #     'model': SentenceTransformer("BAAI/bge-base-en-v1.5"),
    #     'plot_title': 'Retriever Model Accuracy with bge-base-en-v1.5',
    #     'file_name': 'retriever_accuracy_bge-base-en-v1.5.png'
    # },
    {
        'model_name': 'llmrails/ember-v1',
        'tokenizer': AutoTokenizer.from_pretrained("llmrails/ember-v1"),
        'model': SentenceTransformer("llmrails/ember-v1"),
        'plot_title': 'Retriever Model Accuracy with llmrails/ember-v1',
        # The model id contains "/", which in a file name would point into a
        # sub-directory that does not exist; use "_" so the plot is saved next
        # to the other outputs.
        'file_name': 'retriever_accuracy_llmrails_ember-v1.png'
    },

]


# Loop over the models
# Run the evaluation once per configured model, passing the entry's fields
# in the positional order `evaluate_and_plot` expects.
for model_info in models_to_evaluate:
    evaluate_and_plot(
        *(
            model_info[field]
            for field in ('model_name', 'tokenizer', 'model', 'plot_title', 'file_name')
        )
    )



cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]