In [None]:
!pip -q install langchain cohere huggingface_hub sentence_transformers tiktoken chromadb lark gdown 

In [None]:
from langchain_community.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
import torch
import os
import gc

In [None]:
# Training examples (CSV) that are loaded and indexed into the vector store.
RAW_DATA_PATH = 'SE2024/train_gpt-4_raw_RAG.csv'
# Test split (CSV); its QUESTION column supplies the retrieval queries.
TEST_SPLIT_RAW_PATH = 'SE2024/test_split_raw.csv'
# Directory in which the Chroma vector store is persisted.
VECTORIZE_DB_PERSIST_DIRECTORY = 'SE2024/vdb/content'
# Output CSV: one row per test question with the IDs of its selected shots.
RESULT_CSV_PATH = "SE2024/02_ranker_rag_shots.csv"
# Number of candidates requested from the vector-store retriever (search `k`).
NUMBER_OF_RETRIEVED_DOCS = 30
# Number of candidates the Cohere reranker keeps (`top_n`).
NUMBER_OF_RANKER_DOCS = 7
# Upper bound on the shots stored per question after de-duplication.
MAX_SHOTS = 3

In [None]:
# Create the working directory that holds the data, vector store and results.
os.makedirs("./SE2024", exist_ok=True)

In [None]:
# Never hard-code the Cohere key in the notebook: reuse COHERE_API_KEY when it
# is already exported, otherwise prompt for it without echoing it to the output.
if not os.environ.get("COHERE_API_KEY"):
    from getpass import getpass

    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")

# Prepare RAG's datastore

In [None]:
if os.path.exists(RAW_DATA_PATH):
    print('Data file already exists')
else:
    print("Data doesn't exist, start download from the google drive...")
    !gdown 15VK8MaOEg2gF8iwmI4bummXt8whZF9Bq -O $RAW_DATA_PATH

## Load Documents

In [None]:
# Load the training CSV as documents. The "ID" column is used as each
# document's source and is also requested as metadata; later cells read it
# back through `doc.metadata['ID']`.
doc_loader = CSVLoader(
    file_path=RAW_DATA_PATH,
    source_column="ID",
    metadata_columns=["ID"],
)

docs = doc_loader.load()

## Get Embeddings

In [None]:
# Embedding model used to index the training documents. It runs on the GPU
# when CUDA is available and on the CPU otherwise; the encoder is asked to
# normalise the embeddings it produces.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": embedding_device},
    encode_kwargs={'normalize_embeddings': True},
)

## Prepare and Persist DB

In [None]:
# Ensure the directory the vector store is persisted to exists.
os.makedirs(VECTORIZE_DB_PERSIST_DIRECTORY, exist_ok=True)

In [None]:
# Open (or create) the Chroma store in the persist directory, index the loaded
# training documents with the embedding model, and persist the result.
# NOTE(review): if the persist directory already holds a store from an earlier
# run, add_documents presumably appends the same documents again -- confirm,
# and clear the directory before re-running if duplicates are not wanted.
vector_store = Chroma(
    persist_directory=VECTORIZE_DB_PERSIST_DIRECTORY,
    embedding_function=embedding,
)

vector_store.add_documents(docs)

vector_store.persist()

## Free memory (delete the indexing footprint)

In [None]:
# Drop the objects created while building the store, run a garbage-collection
# pass, and release PyTorch's cached GPU memory when CUDA is in use.
del doc_loader
del docs
del embedding
del vector_store
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Find Similar samples

In [None]:
if os.path.exists(TEST_SPLIT_RAW_PATH):
    print('Test\'s data file already exists')
else:
    print("Test\'s data doesn't exist, start download from the google drive...")
    !gdown 1JcpBjTXv2OfaG6uYcIJO-Yk69nT9uN8i -O $TEST_SPLIT_RAW_PATH

In [None]:
# The persisted vector store must exist before the retrieval cells can run.
# Raise instead of calling exit(): in a notebook exit() asks the kernel to shut
# down, whereas an exception stops "Run All" at this cell with a clear message.
# An empty directory is treated like a missing one, because it holds no store.
if not os.path.isdir(VECTORIZE_DB_PERSIST_DIRECTORY) or not os.listdir(VECTORIZE_DB_PERSIST_DIRECTORY):
    raise FileNotFoundError(
        "You need to initialize the vector store first: "
        f"'{VECTORIZE_DB_PERSIST_DIRECTORY}' is missing or empty"
    )

## Load the data

In [None]:
# Test split; the retrieval loop reads each row's QUESTION column as the query.
dataset = pd.read_csv(TEST_SPLIT_RAW_PATH)

## Load the embeddings

In [None]:
# Re-create the embedding model for querying, using the same model name and
# settings as at indexing time. It runs on the GPU when CUDA is available and
# on the CPU otherwise.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": embedding_device},
    encode_kwargs={'normalize_embeddings': True},
)

## Load the vector store

In [None]:
# Re-open the Chroma store from the persist directory, with the embedding
# model attached for embedding the queries.
vector_store = Chroma(
    persist_directory=VECTORIZE_DB_PERSIST_DIRECTORY,
    embedding_function=embedding,
)

## Make retriever

In [None]:
# Search options for the base retriever: request NUMBER_OF_RETRIEVED_DOCS candidates per query.
retriever_kwargs = {'k': NUMBER_OF_RETRIEVED_DOCS}

In [None]:
# Base retriever over the vector store; its candidates are reranked further below.
retriever = vector_store.as_retriever(search_kwargs=retriever_kwargs)

## Match the shots using the retriever

In [None]:
from cohere import CohereAPIError
from time import sleep

def rate_limit_safe_retriever(retriever, query, max_retries=None, wait_seconds=60):
    """Call ``retriever.get_relevant_documents(query)``, waiting out Cohere throttling.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        query: Query passed to the retriever unchanged.
        max_retries: Maximum number of retries before the last error is
            re-raised. ``None`` (the default) retries indefinitely.
        wait_seconds: Pause between attempts, in seconds.

    Returns:
        Whatever ``retriever.get_relevant_documents(query)`` returns.

    Raises:
        CohereAPIError: Immediately when the error carries a 4xx HTTP status
            other than 429 (for example an invalid API key), because waiting
            cannot fix such errors; also once ``max_retries`` is exhausted.
    """
    retries = 0
    while True:
        try:
            return retriever.get_relevant_documents(query)
        except CohereAPIError as e:
            # Errors without a usable status are retried, as before.
            status = getattr(e, "http_status", None)
            is_client_error = (
                isinstance(status, int) and 400 <= status < 500 and status != 429
            )
            out_of_retries = max_retries is not None and retries >= max_retries
            if is_client_error or out_of_retries:
                raise
            retries += 1
            print(f"Rate limit reached, waiting for {wait_seconds} seconds: {e}")
            sleep(wait_seconds)

In [None]:
def fix_retrieved_shots(docs):
    """Keep only the first document of every ID group, preserving input order.

    A document's group is the part of ``doc.metadata['ID']`` before the first
    underscore (the whole ID when it contains none).

    Args:
        docs: Iterable of documents exposing ``metadata['ID']`` as a string.

    Returns:
        List with the first document seen for each group, in order of first
        appearance.
    """
    first_per_group = {}
    for candidate in docs:
        group_id = candidate.metadata['ID'].partition('_')[0]
        # setdefault keeps the earliest document and ignores later ones.
        first_per_group.setdefault(group_id, candidate)
    return list(first_per_group.values())

In [None]:
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

# Reranking stage: the base retriever supplies the candidates and the Cohere
# reranker is configured with top_n=NUMBER_OF_RANKER_DOCS.
compressor = CohereRerank(top_n=NUMBER_OF_RANKER_DOCS)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [None]:
from IPython.display import clear_output

# Build one record per test question: the row's dataframe index under "id",
# plus up to MAX_SHOTS reranked, de-duplicated training IDs stored under
# "shot 1", "shot 2", ...
data_retrieved = []
for index, row in dataset.iterrows():
    candidates = rate_limit_safe_retriever(compression_retriever, row['QUESTION'])
    shots = fix_retrieved_shots(candidates)[:MAX_SHOTS]
    record = {"id": index}
    record.update(
        (f"shot {rank}", shot.metadata['ID'])
        for rank, shot in enumerate(shots, start=1)
    )
    data_retrieved.append(record)

# Clear whatever the loop printed (e.g. rate-limit notices) before reporting.
clear_output(wait=True)
print("Done")

In [None]:
# Tabulate the per-question records and write them to the result CSV
# (the dataframe index is not written).
dataset_retrieved = pd.DataFrame(data_retrieved)
dataset_retrieved.to_csv(RESULT_CSV_PATH, index=False)