In [81]:
import sys, subprocess
# Packages the notebook cannot run without; installed into the kernel's own
# interpreter (sys.executable) so they land in the environment that executes
# the following cells.
# NOTE(review): `openai` is imported later when OPENAI_API_KEY is set, but it
# is not part of this list - confirm whether it should be installed here.
pkgs = [
    "sentence-transformers",
    "tqdm",
    "numpy",
    "scikit-learn",
    "ipywidgets",
    "wikipedia",
    "transformers",
    "torch"
]
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkgs])

# faiss-cpu is optional: the notebook falls back to scikit-learn's
# NearestNeighbors when it is unavailable. Report the failure instead of
# hiding it, so the reader knows which search backend will be used.
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu"])
except (subprocess.CalledProcessError, OSError) as exc:
    print(f"Не удалось установить faiss-cpu ({exc}); будет использован поиск через scikit-learn.")


In [82]:
import os, json, re, hashlib
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

try:
    import faiss
    _HAS_FAISS = True
except Exception:
    faiss = None
    _HAS_FAISS = False

from sklearn.neighbors import NearestNeighbors

# --- Paths -----------------------------------------------------------------
# Working directory for the notebook's artifacts; created on first run.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
# JSONL file with one chunk record per line (read by load_chunks).
CHUNKS_FILE = DATA_DIR / "wiki_chunks.jsonl"
# Cache file for the chunk embedding matrix (read/written by compute_embeddings_if_missing).
EMB_FILE = DATA_DIR / "embeddings.npy"

# --- Models and retrieval settings -------------------------------------------
# Name passed to SentenceTransformer for encoding chunks and queries.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Name passed to CrossEncoder for re-scoring (query, passage) pairs.
# NOTE(review): the name suggests an MS MARCO (English) model while the prompts
# and demo query in this notebook are Russian - confirm it ranks Russian text well.
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Number of texts encoded per batch when building the embedding cache.
EMB_BATCH = 64
# Default neighbour count used when constructing the scikit-learn fallback index.
TOP_K = 5
# Default character limit used by chunk_preview.
PASSAGE_CHAR_TRIM = 1200

# --- Optional OpenAI generation ------------------------------------------------
# Credentials and model come from the environment; nothing is hardcoded here.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", None)
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
# Generation through OpenAI is enabled only when a non-empty key is present.
USE_OPENAI = bool(OPENAI_API_KEY)
if USE_OPENAI:
    # Imported lazily so the notebook runs without the openai package when no key is set.
    import openai
    openai.api_key = OPENAI_API_KEY

In [83]:
def load_chunks(path=CHUNKS_FILE):
    """Read chunk records from a JSONL file.

    Args:
        path: Location of the JSONL file (``str`` or ``Path``); each non-blank
            line must hold one JSON document.

    Returns:
        list: The decoded JSON values, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message includes the file path and line number.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if not path.exists():
        raise FileNotFoundError(f"Файл {path} не найден. Сначала создайте wiki_chunks.jsonl.")
    chunks = []
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise make json.loads fail on the first line.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            try:
                chunks.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the location added.
                raise json.JSONDecodeError(
                    f"{path}, строка {line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return chunks

def chunk_preview(text, char_limit=PASSAGE_CHAR_TRIM):
    """Return ``text`` shortened for display.

    Text no longer than ``char_limit`` characters is returned unchanged.
    Longer text is cut to ``char_limit`` characters, everything after the
    last space in that prefix is dropped, and an ellipsis is appended.
    """
    fits = len(text) <= char_limit
    if fits:
        return text
    head = text[:char_limit]
    # rsplit always yields at least one piece; the first is the part before
    # the last space (or the whole prefix when it contains no space).
    kept, *_ = head.rsplit(" ", 1)
    return f"{kept}…"

# Load all chunk records into memory; later cells index into this list by position.
chunks = load_chunks(CHUNKS_FILE)


In [84]:
def compute_embeddings_if_missing(chunks, emb_file=EMB_FILE, model_name=MODEL_NAME):
    """Return the embedding matrix for ``chunks``, using an on-disk cache.

    The cache at ``emb_file`` is reused only when it is a 2-D array with one
    row per chunk. Otherwise the embeddings are recomputed and the cache is
    overwritten, so row ``i`` always corresponds to ``chunks[i]``. Only the
    row count is checked: edits to chunk text that keep the count unchanged
    are not detected.

    Args:
        chunks: Sequence of records, each with a ``"text"`` entry.
        emb_file: Path (``str`` or ``Path``) of the ``.npy`` cache.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        numpy.ndarray: float32 array with ``len(chunks)`` rows.

    Raises:
        ValueError: If ``chunks`` is empty and no matching cache exists.
    """
    emb_file = Path(emb_file)
    if emb_file.exists():
        cached = np.load(emb_file)
        if cached.ndim == 2 and cached.shape[0] == len(chunks):
            return cached.astype("float32", copy=False)
        # A cache with a different row count would misalign search results
        # with the chunk list, so it must not be reused.
        print(
            f"Кэш эмбеддингов {emb_file} не соответствует числу чанков "
            f"({cached.shape} против {len(chunks)}); пересчитываем."
        )
    if not chunks:
        raise ValueError("Список чанков пуст: нечего векторизовать.")
    model = SentenceTransformer(model_name)
    texts = [c["text"] for c in chunks]
    parts = []
    # Encode in fixed-size batches so tqdm can report progress.
    for i in tqdm(range(0, len(texts), EMB_BATCH)):
        batch = texts[i:i+EMB_BATCH]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        parts.append(emb)
    embeddings = np.vstack(parts).astype("float32")
    np.save(emb_file, embeddings)
    return embeddings

# Embedding matrix for `chunks`; loaded from EMB_FILE when it exists, otherwise computed and saved.
embeddings = compute_embeddings_if_missing(chunks)


In [85]:
# Build the similarity index over `embeddings`.
# Preferred backend: FAISS inner-product index over L2-normalised vectors
# (inner product of unit vectors equals cosine similarity).
# Fallback backend: scikit-learn brute-force cosine NearestNeighbors.
INDEX = None
NN_MODEL = None
if _HAS_FAISS:
    try:
        # Work on a float32, C-contiguous copy: normalize_L2 modifies its
        # argument in place and `embeddings` itself must stay untouched.
        xb = np.array(embeddings, dtype="float32", order="C")
        faiss.normalize_L2(xb)
        INDEX = faiss.IndexFlatIP(xb.shape[1])
        INDEX.add(xb)
    except Exception as exc:
        # Report the failure and discard any partially built index before
        # switching to the fallback backend.
        print(f"Не удалось построить индекс FAISS ({exc!r}); используется NearestNeighbors из scikit-learn.")
        INDEX = None
        _HAS_FAISS = False
if not _HAS_FAISS:
    NN_MODEL = NearestNeighbors(n_neighbors=TOP_K, metric="cosine", algorithm="brute")
    NN_MODEL.fit(embeddings)


In [86]:
# Load the cross-encoder used for re-scoring. Loading is best-effort: on any
# failure the notebook continues without re-ranking, but the reason is printed
# so the degraded mode is visible to the reader.
try:
    reranker = CrossEncoder(RERANKER_MODEL)
except Exception as exc:
    print(f"Не удалось загрузить реранкер ({exc!r}); реранжирование отключено.")
    reranker = None

def rerank_candidates(query, candidates, topk=5):
    """Re-score candidates with the cross-encoder and return the best ``topk``.

    Args:
        query: The query string.
        candidates: List of dicts, each with a ``"text"`` entry. When the
            reranker is available every dict is updated in place with a
            ``"rerank_score"`` float.
        topk: Maximum number of candidates to return.

    Returns:
        list: Up to ``topk`` candidates ordered by descending rerank score.
        When no reranker is loaded or ``candidates`` is empty, the first
        ``topk`` candidates are returned in their original order.
    """
    if reranker is None or not candidates:
        return candidates[:topk]
    pairs = [[query, c["text"]] for c in candidates]
    scores = reranker.predict(pairs)
    for c, s in zip(candidates, scores):
        c["rerank_score"] = float(s)
    sorted_c = sorted(candidates, key=lambda x: x.get("rerank_score", 0.0), reverse=True)
    return sorted_c[:topk]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [87]:
import numpy as np

def mmr_select(query_emb, doc_embs, k=5, lambda_param=0.5):
    """Pick up to ``k`` row indices of ``doc_embs`` by Maximal Marginal Relevance.

    Each round scores every remaining row as
    ``lambda_param * sim(row, query) - (1 - lambda_param) * max sim(row, picked)``
    where similarities are dot products and the second term is 0 while
    nothing has been picked. The highest-scoring row is taken; ties go to
    the lowest remaining index.

    Args:
        query_emb: 1-D query vector.
        doc_embs: 2-D array with one candidate vector per row.
        k: Maximum number of indices to return.
        lambda_param: Weight of query relevance versus redundancy.

    Returns:
        list[int]: Selected row indices in selection order.
    """
    n_docs = doc_embs.shape[0]
    if n_docs == 0:
        return []

    relevance = (doc_embs @ query_emb).tolist()
    target = min(k, n_docs)
    picked = []
    pool = list(range(n_docs))

    def mmr_score(doc_idx):
        # Redundancy is the largest similarity to anything already picked.
        redundancy = max(
            (float(doc_embs[doc_idx] @ doc_embs[chosen]) for chosen in picked),
            default=0.0,
        )
        return lambda_param * relevance[doc_idx] - (1 - lambda_param) * redundancy

    while len(picked) < target and pool:
        winner = max(pool, key=mmr_score)
        picked.append(winner)
        pool.remove(winner)
    return picked


In [88]:
def build_prompt_with_budget_ru(question, retrieved, token_budget=3000, approx_chars_per_token=4):
    """Build the Russian system and user prompts from retrieved passages.

    Passages are added in order until the character budget
    (``token_budget * approx_chars_per_token``) is used up. A passage that
    fits is included in full; one that does not is cut at the last space
    within the remaining budget. 200 characters are held back per passage
    as headroom.

    Args:
        question: The user's question.
        retrieved: Iterable of dicts with a ``"text"`` entry and optional
            ``"title"`` and ``"url"`` / ``"source_url"`` entries.
        token_budget: Approximate token budget for the passage text.
        approx_chars_per_token: Characters-per-token estimate used to turn
            the token budget into a character budget.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    budget_chars = token_budget * approx_chars_per_token
    reserve_chars = 200  # headroom held back from the budget for each passage
    parts = []
    used = 0
    for i, r in enumerate(retrieved, 1):
        txt = r["text"]
        remaining = max(0, budget_chars - used - reserve_chars)
        if remaining <= 0:
            break
        if len(txt) <= remaining:
            # The passage fits: keep it intact. Trimming to the last space
            # here would drop the final word of every passage.
            snippet = txt
        else:
            # Cut to the budget, then back off to the last space so the
            # snippet does not end in the middle of a word.
            snippet = txt[:remaining].rsplit(" ", 1)[0]
        parts.append({
            "index": i,
            "title": r.get("title", ""),
            "url": r.get("url", r.get("source_url", "")),
            "text": snippet
        })
        used += len(snippet)
    source_blocks = [
        f"[ИСТОЧНИК {pi['index']}]\nЗаголовок: {pi['title']}\nСсылка: {pi['url']}\nТекст: {pi['text']}\n"
        for pi in parts
    ]
    context = "\n\n".join(source_blocks)
    system = (
        "Вы — помощник. Отвечайте ТОЛЬКО на основании приведённых источников. "
        "Если ответа нет в источниках — скажите, что не знаете. Ссылайтесь на источники маркерами [ИСТОЧНИК n]."
    )
    user = f"Вопрос: {question}\n\nИспользуйте только следующие источники:\n\n{context}\n\nОтветьте кратко и укажите источники."
    return system, user


In [89]:
from sentence_transformers import SentenceTransformer as _ST
# Query encoders keyed by model name, so the model is loaded once per kernel
# session instead of once per query.
_QUERY_ENCODERS = {}


def _get_query_encoder(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _QUERY_ENCODERS:
        _QUERY_ENCODERS[model_name] = _ST(model_name)
    return _QUERY_ENCODERS[model_name]


def search_chunks_raw(query, topk=50):
    """Return the chunks most similar to ``query``.

    Uses the FAISS index when it was built, otherwise the scikit-learn
    NearestNeighbors model.

    Args:
        query: Query text.
        topk: Maximum number of results; clamped to the number of indexed
            vectors. Values <= 0 yield an empty list.

    Returns:
        list[dict]: One dict per hit with keys ``"score"`` (cosine
        similarity), ``"title"``, ``"chunk_id"``, ``"text"`` and ``"url"``,
        in the order returned by the search backend.
    """
    if topk <= 0:
        return []
    use_faiss = _HAS_FAISS and INDEX is not None
    n_indexed = INDEX.ntotal if use_faiss else embeddings.shape[0]
    k = min(topk, n_indexed)
    if k <= 0:
        return []

    q_emb = _get_query_encoder(MODEL_NAME).encode([query], convert_to_numpy=True).astype("float32")
    if use_faiss:
        faiss.normalize_L2(q_emb)
        D, I = INDEX.search(q_emb, k)
        scores = D[0].tolist()
        idxs = I[0].tolist()
    else:
        dists, idxs = NN_MODEL.kneighbors(q_emb, n_neighbors=k, return_distance=True)
        # Cosine distance -> cosine similarity.
        scores = (1.0 - dists[0]).tolist()
        idxs = idxs[0].tolist()

    candidates = []
    for score, idx in zip(scores, idxs):
        idx = int(idx)
        if idx < 0:
            # FAISS pads missing results with -1; chunks[-1] would silently
            # return the last chunk as a bogus hit.
            continue
        c = chunks[idx]
        candidates.append({
            "score": float(score),
            "title": c.get("title", ""),
            "chunk_id": c.get("chunk_id", -1),
            "text": c.get("text", ""),
            "url": c.get("source_url", "")
        })
    return candidates

# Encoders used by the MMR step, keyed by model name, so the model is loaded
# once per kernel session rather than on every call.
_MMR_ENCODERS = {}


def _get_mmr_encoder(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _MMR_ENCODERS:
        _MMR_ENCODERS[model_name] = _ST(model_name)
    return _MMR_ENCODERS[model_name]


def _openai_chat_answer(system_prompt, user_prompt):
    """Send the prompts to the OpenAI chat API and return the reply text.

    Works with both SDK generations: openai>=1.0 exposes the ``OpenAI``
    client class, while older releases use ``openai.ChatCompletion``.
    """
    params = dict(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=400,
        temperature=0.0,
    )
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=OPENAI_API_KEY)
        resp = client.chat.completions.create(**params)
        return resp.choices[0].message.content or ""
    resp = openai.ChatCompletion.create(**params)
    return resp["choices"][0]["message"]["content"]


def rag_answer_ru(question, retrieve_k=50, rerank_k=5, mmr_k=5, use_mmr=False):
    """Answer ``question`` from the indexed chunks (retrieve, rerank, select, generate).

    Args:
        question: The user's question.
        retrieve_k: Number of candidates fetched from the vector index.
        rerank_k: Number of top candidates kept when ``use_mmr`` is False.
        mmr_k: Number of candidates kept by MMR when ``use_mmr`` is True.
        use_mmr: Select the final passages with ``mmr_select`` instead of
            taking the top ``rerank_k``.

    Returns:
        dict: ``{"question", "answer", "sources"}``. ``sources`` is the list
        of selected candidate dicts. When OpenAI is not configured,
        ``answer`` is a plain listing of the selected passages.
    """
    candidates = search_chunks_raw(question, topk=retrieve_k)
    if not candidates:
        return {"question": question, "answer": "Ничего не найдено по запросу.", "sources": []}

    if reranker is not None:
        # At most 200 candidates are sent to the cross-encoder.
        pool = candidates[:200]
        rerank_sorted = rerank_candidates(question, pool, topk=len(pool))
    else:
        rerank_sorted = candidates

    if use_mmr and rerank_sorted:
        emb_model = _get_mmr_encoder(MODEL_NAME)
        texts = [c["text"] for c in rerank_sorted]
        cand_embs = emb_model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        # Normalise to unit length so dot products inside mmr_select are
        # cosine similarities; the epsilon guards against zero vectors.
        norms = np.linalg.norm(cand_embs, axis=1, keepdims=True)
        cand_embs = cand_embs / (norms + 1e-9)
        q_emb = emb_model.encode([question], convert_to_numpy=True)[0]
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-9)
        sel_idx = mmr_select(q_emb, cand_embs, k=min(mmr_k, len(rerank_sorted)))
        selected = [rerank_sorted[i] for i in sel_idx]
    else:
        selected = rerank_sorted[:rerank_k]

    system_prompt, user_prompt = build_prompt_with_budget_ru(question, selected)
    if USE_OPENAI:
        answer = _openai_chat_answer(system_prompt, user_prompt)
    else:
        answer = "Локальная генерация не настроена. Ниже — найденные источники и отрывки:\n\n"
        for i, r in enumerate(selected, 1):
            answer += f"{i}. {r.get('title','')} — {r.get('url','')}\n{r.get('text','')[:800]}\n\n"
    return {"question": question, "answer": answer, "sources": selected}


In [90]:
# Demo: run the full pipeline on one question and print the answer and its sources.
# With use_mmr=False the top `rerank_k` candidates are used and `mmr_k` has no effect.
# NOTE(review): the query is spelled "матемтика" - confirm whether the typo is intentional.
res = rag_answer_ru("что такое матемтика?", retrieve_k=50, rerank_k=10, mmr_k=5, use_mmr=False)
print("Вопрос:", res["question"])
print("\nСгенерированный ответ:\n")
print(res["answer"])
print("\nИсточники:\n")
for i, s in enumerate(res["sources"], 1):
    # Show the cross-encoder score when present, otherwise the retrieval score.
    score = s.get("rerank_score", s.get("score", 0.0))
    print(f"{i}. [{score:.4f}] {s.get('title','')} - {s.get('url','')}")


Вопрос: что такое матемтика?

Сгенерированный ответ:

Локальная генерация не настроена. Ниже — найденные источники и отрывки:

1. Квантовая механика — https://ru.wikipedia.org/wiki/Квантовая_механика
Ква́нтовая (волнова́я) меха́ника — фундаментальная физическая теория, которая описывает природу в масштабе атомов и субатомных частиц. Она лежит в основании всей квантовой физики, включая квантовую химию, квантовую теорию поля, квантовую технологию и квантовую информатику. Классическая физика, совокупность теорий, существовавших до появления квантовой механики, описывает многие аспекты природы в обычном масштабе, но недостаточна для их количественного описания в малых (атомных и субатомных) масштабах. Большинство теорий классической физики можно вывести из квантовой механики как приближения, справедливые в больших (макроскопических) масштабах. Квантовая механика отличается от классической физики тем, что энергия, импульс, угловой момент и другие величины связанного состояния системы не мог