In [4]:
# !pip install datasets
# !pip install langchain
# !pip install langchain-openai
# !pip install langchainhub
# !pip install faiss-gpu
# !pip install sentence_transformers

In [1]:
import os
import pandas as pd
from tqdm.auto import tqdm, trange
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_openai import OpenAIEmbeddings

import warnings
warnings.filterwarnings(action='ignore')

from openai import OpenAI
# Prefer a key that is already exported in the environment: the placeholder is
# only a fallback, so a real key is never overwritten by (or pasted into) this
# notebook. Export OPENAI_API_KEY before starting the kernel.
os.environ.setdefault("OPENAI_API_KEY", "sk-?")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_retriever(corpus, model_name, top_k, score_threshold):
    """Build a FAISS retriever over ``corpus`` that filters hits by score.

    Args:
        corpus: List of texts to index.
        model_name: Embedding model identifier. Names containing
            "sentence-transformers" are loaded with ``HuggingFaceEmbeddings``;
            any other name is passed to ``OpenAIEmbeddings`` and wrapped in a
            ``CacheBackedEmbeddings`` backed by a ``LocalFileStore`` in
            ``<cwd>/cache``.
        top_k: Value forwarded as ``k`` in the retriever's ``search_kwargs``.
        score_threshold: Value forwarded as ``score_threshold`` in the
            retriever's ``search_kwargs``.

    Returns:
        The object returned by ``FAISS.from_texts(...).as_retriever(...)``
        with ``search_type="similarity_score_threshold"``.
    """
    if "sentence-transformers" in model_name:
        embedder = HuggingFaceEmbeddings(
            model_name=model_name,
            # No hard-coded 'cuda' device: sentence-transformers picks a GPU
            # when one is available and otherwise falls back to CPU, so the
            # notebook also runs on machines without CUDA.
            # Unit-length vectors keep the relevance scores derived from FAISS
            # distances on the scale that `score_threshold` is compared with
            # (the OpenAI branch already yields unit-normed embeddings).
            encode_kwargs={'normalize_embeddings': True},
        )
    else:
        underlying_embeddings = OpenAIEmbeddings(model=model_name)
        cache_dir = os.path.join(os.getcwd(), "cache")
        store = LocalFileStore(cache_dir)
        # Namespace the cache by model so different models never share entries.
        embedder = CacheBackedEmbeddings.from_bytes_store(
            underlying_embeddings, store, namespace=underlying_embeddings.model
        )

    # Both branches index and search the same way; only the embedder differs.
    return FAISS.from_texts(corpus, embedder).as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={'k': top_k, 'score_threshold': score_threshold},
    )

In [8]:
def _drop_near_duplicates(questions, retriever):
    """Greedily keep one representative per group of near-duplicate questions.

    Args:
        questions: Unique questions, in the order they should be considered.
        retriever: Object whose ``get_relevant_documents(q)`` returns documents
            exposing the indexed text as ``page_content``.

    Returns:
        ``(clean_questions, dup_dict)`` where ``dup_dict`` maps every question
        that has at least one retrieved neighbour to the list of those
        neighbours.
    """
    clean_questions, dup_dict = [], {}
    flagged = set()  # questions already reported as a neighbour of an earlier one

    for q in tqdm(questions):
        hits = [doc.page_content for doc in retriever.get_relevant_documents(q)]
        # Identify neighbours by value rather than by position, so the result
        # does not depend on the query itself being returned as the first hit.
        neighbours = [hit for hit in hits if hit != q]

        if not neighbours:
            clean_questions.append(q)
            continue

        dup_dict[q] = neighbours
        if q not in flagged:
            clean_questions.append(q)
        # Neighbours are flagged even when `q` itself was dropped, so chains
        # of similar questions collapse onto their first member.
        flagged.update(neighbours)

    return clean_questions, dup_dict


def get_clean_questions(model_name, top_k, score_threshold):
    """Load the QARV questions and remove exact and near-duplicate entries.

    Exact duplicates are removed first (keeping first-occurrence order); the
    remaining questions are indexed with ``load_retriever`` and each question
    whose retrieved neighbours include an earlier question is dropped.

    Args:
        model_name: Embedding model name forwarded to ``load_retriever``.
        top_k: Number of hits requested per query, forwarded to ``load_retriever``.
        score_threshold: Score cut-off forwarded to ``load_retriever``.

    Returns:
        List of retained questions, in dataset order.

    Side effects:
        Prints ``<with neighbours> <without neighbours> <total> <kept>``
        followed by the mapping of questions to their retrieved neighbours.
    """
    dataset = load_dataset("amphora/QARV")
    questions = list(dataset['train']['Question'])
    # dict.fromkeys de-duplicates while preserving order, which keeps the
    # index (and therefore the result) deterministic between runs.
    corpus = list(dict.fromkeys(questions))
    retriever = load_retriever(corpus, model_name, top_k, score_threshold)

    clean_questions, dup_dict = _drop_near_duplicates(corpus, retriever)

    dup_o = len(dup_dict)        # questions with at least one neighbour
    dup_x = len(corpus) - dup_o  # questions with no neighbour
    print(dup_o, dup_x, len(questions), len(clean_questions))
    print(dup_dict)
    return clean_questions

In [6]:
# De-duplicate with a strict similarity threshold (0.9).
questions = get_clean_questions(
    model_name="text-embedding-3-large",
    top_k=3,
    score_threshold=0.9,
)

100% 104/104 [00:44<00:00,  2.34it/s]

0 104 104 104





- The number of questions is unchanged because, at `score_threshold=0.9`, no pair of questions in the current dataset is similar enough to be treated as a duplicate.
- However, if you lower `score_threshold` substantially (e.g. to 0.7, as in the next cell), the number of questions decreases because semantically similar questions are removed.

In [9]:
# Repeat the de-duplication with a looser similarity threshold (0.7).
questions = get_clean_questions(
    model_name="text-embedding-3-large",
    top_k=3,
    score_threshold=0.7,
)

100% 104/104 [00:50<00:00,  2.05it/s]

4 100 104 102
{'What is the primary foreign language typically studied?': ['What is the secondary foreign language typically studied?'], 'What is the secondary foreign language typically studied?': ['What is the primary foreign language typically studied?'], 'What is the title for the head of state?': ['What title is used for the leader of the country?'], 'What title is used for the leader of the country?': ['What is the title for the head of state?']}



