In [None]:
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
# Read every file directly inside the docs folder into two parallel lists:
# `docs[i]` is the body text and `metadata[i]` records where it came from.
docs = []
metadata = []
# iterdir() yields entries in arbitrary order; sorting keeps the chunk order
# (and therefore the resulting index) reproducible between runs.
for p in sorted(Path("./datasets/huggingface_docs/").iterdir()):
    if p.is_dir():
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which differs between machines.
    with open(p, encoding="utf-8") as f:
        # the first line is the source of the text
        source = f.readline().strip().replace('source: ', '')
        # everything after the first line is the document body
        docs.append(f.read())
        metadata.append({"source": source})

In [None]:
# Splitter configuration: an empty separator together with `len` as the
# length function means chunk sizes are measured in characters, with
# consecutive chunks sharing a 100-character overlap.
splitter_options = {
    "separator": "",
    "chunk_size": 812,
    "chunk_overlap": 100,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# `metadata` is passed alongside the texts (one entry per input text).
# NOTE: this rebinds `docs` from the raw strings to the splitter's output, so
# re-running this cell alone would feed it already-split documents.
docs = text_splitter.create_documents(docs, metadata)

In [None]:
# Embedding backend for the index. A previously tried alternative was
# HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2").
# The instructor model takes two instruction strings: one passed as
# `embed_instruction` and one passed as `query_instruction`.
query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"
embed_instruction = "Represent the Hugging Face library documentation"
model_name = "hkunlp/instructor-large"

embedding_model = HuggingFaceInstructEmbeddings(
    model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction
)

In [None]:
# Build the FAISS vector store from the chunked documents, embedding each one
# with `embedding_model`.
index = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model,
)

In [None]:
# Persist the vector store to disk. The target is relative to the working
# directory and sits one level above it.
index.save_local(folder_path='../index2/')