In [None]:
import json
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from tqdm import tqdm

In [None]:
# Read the JSON dump from disk; the loop below iterates over the parsed
# result and treats each element as a record.
with open('../data/wiki_dump_test_without_temps.json', 'r', encoding='utf-8') as dump_file:
    data = json.load(dump_file)

# Collect the "content" value of every record that has one; records without
# that key are skipped.
inputs = []
for record in data:
    if "content" in record:
        inputs.append(record["content"])

In [None]:
# Wrap every raw text in a Document so the splitter can process it.
documents = []
for text in inputs:
    documents.append(Document(page_content=text))

# Split with chunk_size=1000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Embedding model used both to index the chunks and to embed queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Directory where Chroma keeps the index between kernel sessions.
persist_directory = "./terraria_db"

# Building the store unconditionally on every run re-embeds the whole corpus
# and adds the same chunks to the already persisted collection again, which
# leads to duplicated search hits. Reuse the persisted index when it exists.
# To rebuild after the source data changes, delete the directory and re-run.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
else:
    vectorstore = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=persist_directory,
    )

In [None]:
# Retriever over the vector store, configured with k=4.
search_kwargs = dict(k=4)
retriever = vectorstore.as_retriever(search_kwargs=search_kwargs)

In [None]:
# Smoke-test the retriever with a sample query and print each hit followed by
# a separator line.
query = "Скелетрон прайм"
separator = "---------------------------------------------------------------------------------------------------------"

docs = retriever.get_relevant_documents(query)
for hit in docs:
    print(hit.page_content, separator, sep="\n")