In [None]:
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
# Read every file that sits directly in the docs folder. The first line of a
# file names where the text came from ("source: <origin>"); everything after
# that line is kept as the document body.
docs = []
metadata = []
# iterdir() yields entries in arbitrary order; sorting keeps the document order
# (and anything built from it later) reproducible across runs and machines.
for p in sorted(Path("./datasets/huggingface_docs/").iterdir()):
    if p.is_dir():
        continue
    # Explicit encoding: the platform default is locale dependent and can fail
    # on, or garble, non-ASCII characters.
    # NOTE(review): assumes the dataset files were written as UTF-8 — confirm
    # against whatever produced them.
    with open(p, encoding="utf-8") as f:
        # the first line is the source of the text
        source = f.readline().strip()
        # Drop the marker only when it is a prefix, so the same text appearing
        # later in the line is left intact.
        if source.startswith('source: '):
            source = source[len('source: '):]
        docs.append(f.read())
        metadata.append({"source": source})

print(f'number of documents: {len(docs)}')

In [None]:
# Split the raw texts into chunks of at most 1024 units with an overlap of 100;
# lengths are measured with len(), i.e. in characters.
text_splitter = CharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=100,
    length_function=len,
    separator="",
)
# The metadata list is handed over together with the texts.
# NOTE: `docs` is rebound here from the raw strings to the splitter's output,
# so this cell must not be re-run without re-running the loading cell first.
docs = text_splitter.create_documents(texts=docs, metadatas=metadata)

In [None]:
# Embedding configuration: the model identifier plus the two instruction
# strings, one passed for embedding documents and one for embedding queries.
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the Hugging Face library documentation"
query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"

# Instantiate the instruction-based embedding wrapper with the settings above.
embedding_model = HuggingFaceInstructEmbeddings(
    query_instruction=query_instruction,
    embed_instruction=embed_instruction,
    model_name=model_name,
)

In [None]:
# Build the FAISS vector store from the chunked documents using the embedding model.
index = FAISS.from_documents(documents=docs, embedding=embedding_model)

In [None]:
# Persist the freshly built vector store to disk.
index.save_local(folder_path='../index_temp/')

In [None]:
# Load a stored vector store and run a sample similarity search against it.
# NOTE(review): this reads from "../index-xl/", not the '../index_temp/' folder
# written by save_local earlier in the notebook — confirm that this index was
# built with the same embedding model that is used for the query here.
index = FAISS.load_local("../index-xl/", embedding_model)
docs = index.similarity_search(query='how to create a pipeline object?', k=5)

# Only the last expression of a cell is rendered automatically, so the page
# content is printed explicitly; otherwise it would be evaluated and discarded.
print(docs[0].page_content)
docs[0].metadata

In [None]:
# Print every retrieved document with its position, content and metadata.
# The counter is deliberately not named `index`: a for-loop target stays bound
# after the loop finishes, so reusing that name would replace the FAISS vector
# store held in `index` with an int and break any later use of the store.
for position, doc in enumerate(docs, start=1):
    print(f"\n{'='*100}\n")
    print(f"Document {position} of {len(docs)}")
    print("Page Content:")
    print(f"\n{'-'*100}\n")
    print(doc.page_content, '\n')
    print(doc.metadata)