In [1]:
import json

import torch
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the wiki dump and collect the text of every record that carries a
# "content" field; records without that key are skipped.
with open('../data/wiki_dump_test_without_temps.json', 'r', encoding='utf-8') as dump_file:
    data = json.load(dump_file)

inputs = []
for record in data:
    if "content" in record:
        inputs.append(record["content"])

In [3]:
# Chunking parameters (measured in characters by the splitter's default
# length function).
# NOTE(review): the notebook later embeds these chunks with
# intfloat/multilingual-e5-large, whose model card states inputs are truncated
# to 512 tokens; 3000-character chunks may lose their tail at embedding time --
# confirm this is acceptable.
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 1000

# Split on paragraph breaks first, then on single newlines.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Wrap every raw text in a Document so the splitter can process it.
documents = []
for text in inputs:
    documents.append(Document(page_content=text))

chunks = text_splitter.split_documents(documents)

In [4]:
# Sanity check: number of chunks produced by the splitter.
len(chunks)

4934

In [5]:
# Embedding model setup.
#
# A CUDA-capable GPU is preferred for this cell. If PyTorch was installed
# without CUDA support, reinstall it like this:
#   pip uninstall torch torchvision torchaudio -y
#   pip install torch --index-url https://download.pytorch.org/whl/cu118 --no-cache-dir --no-deps
#
# To verify the installation:
#   import torch
#   print(torch.__version__)
#   print(torch.cuda.is_available())

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# cell still runs (more slowly) instead of raising on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the multilingual-e5 model card recommends prefixing inputs
# with "query: " / "passage: "; nothing in this notebook adds those prefixes --
# confirm whether that is intentional.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={"device": device},
)

In [1]:
# If the database already exists on disk, load it like this instead of rebuilding it:
# vectorstore = Chroma(persist_directory="./terraria_db", embedding_function=embeddings)

In [6]:
PERSIST_DIR = "./terraria_db"

# Open the persisted collection first. Calling Chroma.from_documents on every
# run would re-embed every chunk and append another copy of each one (under
# freshly generated IDs) to the collection already stored in PERSIST_DIR.
vectorstore = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)

# Build the index only when the collection is still empty. To force a rebuild
# (e.g. after changing the chunking or the embedding model), delete PERSIST_DIR
# and re-run this cell.
if not vectorstore.get(limit=1)["ids"]:
    vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=PERSIST_DIR)

# Retriever view over the store that returns the 7 closest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

In [7]:
# Test query (Russian): "how to craft zenith recipe".
query = "как скрафтить зенит рецепт"

# For Chroma, similarity_search_with_score returns (document, score) pairs
# where the score is a distance: a LOWER value means a closer match. The
# label below says "distance" so the numbers are not misread as similarities.
results = vectorstore.similarity_search_with_score(query, k=5)

for rank, (doc, distance) in enumerate(results, start=1):
    print(f"Document {rank}: distance = {distance:.4f} (lower is closer)")
    print(doc.page_content)
    print("---------------------------------------------------------------------------------------------------------")

Document 1: similarity = 0.3341
(Рецепт: result = Crystal Block, resultid = 3234, amount = 5, station = Adamantite Forge, Stone Block, 5, Crystal Shard, 1)
(Рецепт: version =, result = Adamantite Bar, resultid = 391, amount = 1, station = Adamantite Forge, Adamantite Ore, 4)
(Рецепт: version =, result = Titanium Bar, resultid = 1198, amount = 1, station = Adamantite Forge, Titanium Ore, 4)
(Рецепт: version =, result = Chlorophyte Bar, resultid = 1006, amount = 1, station = Adamantite Forge, Chlorophyte Ore, 5)
(Рецепт: result = Spectre Bar, resultid = 3261, amount = 2, station = Adamantite Forge, Chlorophyte Bar, 2, Ectoplasm, 1)
---------------------------------------------------------------------------------------------------------
Document 2: similarity = 0.3500
(Рецепт: where=resultid in (), expectedrows=192, stationgrouping=no, header-result=Стена, header-ingredients=Рецепт, header-station=Рабочее место, resulttemplate=:/resultcell)
------------------------------------------------