In [1]:
import torch
import gc
import faiss
import json
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on: {device}")

Running on: cuda


In [2]:
# === CONFIG ===
# Everything a reader may want to tune lives in this cell.

# On-disk FAISS index (read with faiss.read_index) and the JSONL file the
# retrieved chunks are looked up in (one JSON object per line).
INDEX_PATH = "../data/processed/faiss_index/index.faiss"
CHUNKS_PATH = "../data/processed/chunks.jsonl"

# Model ids: the sentence encoder used for the query and the causal LM that
# writes the answer.
EMBED_MODEL = "Qwen/Qwen3-Embedding-0.6B"
LLM_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

# Number of nearest neighbours requested from the index
TOP_K = 10

# USER QUERY
QUERY = "Which scientific papers explore graphs within the biomedical domain?"

In [3]:
# Read the prebuilt FAISS index from disk, then put the sentence encoder on
# the selected device.
index = faiss.read_index(INDEX_PATH)
embed_model = SentenceTransformer(EMBED_MODEL, device=device)

In [4]:
print(f"--- Searching for: '{QUERY}' ---")

# Encode the query as a batch of one, using the encoder's "query" prompt and
# normalised embeddings.
query_vector = embed_model.encode(
    [QUERY],
    prompt_name="query",
    convert_to_numpy=True,
    normalize_embeddings=True
)

# Cast to float32 before handing the vector to FAISS; both results have one
# row per query, so row 0 belongs to QUERY.
distances, indices = index.search(query_vector.astype('float32'), TOP_K)

print(f"Top-{TOP_K} Indices: {indices[0]}")
# round() on a NumPy float32 returns a NumPy scalar, which NumPy >= 2 prints
# as "np.float32(0.552)"; converting to a builtin float keeps the list readable.
print(f"Distances: {[round(float(x), 3) for x in distances[0]]}")

--- Searching for: 'Which scientific papers explore graphs within the biomedical domain?' ---
Top-10 Indices: [ 988 1362 1376 2084 2118 1374 3725 3673 1375 1364]
Distances: [0.552, 0.544, 0.535, 0.533, 0.53, 0.527, 0.525, 0.524, 0.523, 0.508]


In [5]:
# Look up the retrieved chunks by streaming the JSONL file: only the lines
# whose row number was returned by the search are parsed, so the full corpus
# is never held in RAM.
# Assumes line N of the file is the chunk stored under FAISS id N — the same
# assumption the positional lookup relied on before.
print("--- Fetching Chunks ---")

top_indices = indices[0]

# FAISS fills missing neighbours with -1; a negative id must not be used as a
# Python index (it would silently select the last chunk).
valid_rows = [int(idx) for idx in top_indices if idx >= 0]
wanted_rows = set(valid_rows)

chunks_by_row = {}
with open(CHUNKS_PATH, 'r', encoding='utf-8') as f:
    for row, line in enumerate(f):
        # Stop reading as soon as every requested row has been collected
        if len(chunks_by_row) == len(wanted_rows):
            break
        if row in wanted_rows:
            chunks_by_row[row] = json.loads(line)

# Keep the ranking order returned by the search; rows past the end of the
# file are skipped.
top_chunks = [chunks_by_row[row] for row in valid_rows if row in chunks_by_row]

print(f"Retrieved {len(top_chunks)} chunks.")

--- Fetching Chunks ---
Retrieved 10 chunks.


In [6]:
def format_chunk_for_llm(chunk):
    """Render one chunk as a labelled text section for the LLM prompt.

    Reads ``chunk['id']``, the ``'Title'`` entry of ``chunk['metadata']``
    (``'N/A'`` when absent) and ``chunk['text']`` with surrounding
    whitespace stripped. Returns the three labelled parts as one string.
    """
    doc_id = chunk['id']
    title = chunk['metadata'].get('Title', 'N/A')
    body = chunk['text'].strip()

    sections = [
        f"--- DOCUMENT ID: {doc_id} ---\n",
        f"TITLE: {title}\n",
        f"CONTENT:\n{body}\n",
    ]
    return "".join(sections)

# Format every retrieved chunk and join the sections with a blank line
# between them to form the prompt context.
context_block = "\n\n".join(map(format_chunk_for_llm, top_chunks))

print(f"Context Length (chars): {len(context_block)}")
# print(context_block[:500] + "...") # for debugging

Context Length (chars): 12712


In [7]:
# Free VRAM: drop the encoder and the query vector, collect garbage, and
# empty the CUDA cache before the LLM is loaded.
del embed_model, query_vector

gc.collect()
torch.cuda.empty_cache()

# Report the remaining CUDA memory figures in units of 1024**3 bytes
bytes_per_gb = 1024**3
allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb

print(f"Memory allocated: {allocated_gb:.2f} GB")
print(f"Memory reserved: {reserved_gb:.2f} GB")

Memory allocated: 0.01 GB
Memory reserved: 0.02 GB


In [8]:
print(f"--- Loading LLM: {LLM_MODEL} ---")

# Precision and device placement are both left on "auto"
llm_load_options = {"dtype": "auto", "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, **llm_load_options)

# The tokenizer comes from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)

print("LLM Loaded.")

--- Loading LLM: Qwen/Qwen2.5-1.5B-Instruct ---
LLM Loaded.


In [9]:
system_instruction = "You are a Professional NLP Research Assistant."

# User turn layout: retrieved context first, then the answering rules, then
# the question itself.
user_message = f"""### CONTEXT:
{context_block}

===
IMPORTANT RULES:
1. Answer ONLY using the context above.
2. If no answer, say strictly: "I don't have enough information."
3. Write a paper ID.
===

### QUESTION:
{QUERY}"""

messages = [
    dict(role="system", content=system_instruction),
    dict(role="user", content=user_message),
]

# Render the conversation to prompt text (not token ids) with the generation
# prompt appended at the end.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenise as a batch of one and move the tensors to the model's device
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [10]:
print("--- Generating Response ---")

# Seed torch's random generators so a sampled answer is repeatable when the
# notebook is re-run on the same setup.
# NOTE(review): temperature only matters when sampling is enabled; here that
# is left to the model's own generation config — confirm for other models.
SEED = 42
torch.manual_seed(SEED)

full_sequences = model.generate(
    **model_inputs,
    max_new_tokens=1024,
    temperature=0.1
)

# Each returned sequence starts with its prompt tokens; slice them off so
# only the newly generated answer remains.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, full_sequences)
]

# The batch has a single prompt, so the first decoded string is the answer
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("\n=== RESPONSE ===\n")
print(response)

--- Generating Response ---

=== RESPONSE ===

Based on the provided context, several scientific papers explore graphs within the biomedical domain:

1. **Multimodal Contrastive Representation Learning in Augmented Biomedical Knowledge Graphs** - This paper introduces a multimodal approach that combines embeddings from specialized Language Models with Graph Contrastive Learning to enhance biomedical knowledge graphs.

2. **Attending To Syntactic Information In Biomedical Event Extraction Via Graph Neural Networks** - This paper discusses methods for extracting events from biomedical text using graph neural networks, focusing on leveraging syntactic information embedded in dependency parsing graphs.

3. **Towards Omni-RAG: Comprehensive Retrieval-Augmented Generation for Large Language Models in Medical Applications** - While primarily focused on retrieval-augmented generation, this paper mentions the use of structured graphs in their framework, suggesting interest in integrating graphs