## 1. Установка зависимостей (можно пропустить, если в проекте установлены зависимости из requirements.txt)

In [1]:
%%capture
!pip install transformers==4.47.1
!pip install datasets==3.2.0
!pip install bitsandbytes==0.45.1
!pip install qdrant-client==1.13.2
!pip install sentence-transformers==3.3.1

## 2. Импорты

In [2]:
import ctypes
import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    BitsAndBytesConfig,
)
from sentence_transformers import SentenceTransformer

from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

## 3. Конфиг

In [3]:
class Config:
    """Single source of truth for the notebook's tunable settings."""

    # Prefer the GPU whenever torch reports one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- General ---
    train = True  # True: fine-tune the classifier; False: only try to load a saved one
    num_labels = 2  # number of classifier output classes
    seed = 42  # random_state of the train/test split
    test_size = 0.2  # fraction of examples held out for evaluation

    # --- Classifier ---
    classifier = "distilbert/distilbert-base-uncased"
    classification_data = "databricks/databricks-dolly-15k"
    # Directory the fine-tuned classifier is saved to and later reloaded from.
    finetuned_classifier_path = "/query_classifier"
    # An instruction is labelled "search needed" when it contains any of these.
    keywords = [
        "latest", "update", "news", "current", "recent", "ongoing", "actual",
        "date", "new", "contemporary", "modern", "present", "2024", "2025",
    ]

    # --- Classifier training hyperparameters ---
    num_train_epochs = 1
    per_device_train_batch_size = 32
    per_device_eval_batch_size = 32
    learning_rate = 5e-5

    # --- Vectorization model ---
    sentence_transformer = "intfloat/multilingual-e5-large"

    # --- LLM ---
    llm = "microsoft/Phi-3.5-mini-instruct"
    max_length = 512  # length limit handed to the LLM's generate() call
    # NOTE(review): top_p / temperature are not read by any visible cell.
    top_p = 0.95
    temperature = 0.7

    # --- Qdrant ---
    metric = Distance.COSINE  # 'COSINE', 'DOT', 'EUCLID', 'MANHATTAN'
    num_search_results = 1  # number of retrieved texts placed into the RAG context

## 4. Подготовка данных для классификатора

В качестве данных для классификатора возьмем датасет Dolly15k (поле **instruction**). Разметку составим эвристически: если в запросе встречается хотя бы одно ключевое слово из `Config.keywords`, класс положительный (нужен RAG). Слова ищутся как подстроки, поэтому, например, «new» сработает и на «renewable». Ключевых слов желательно взять побольше, чтобы снизить вероятность ошибки второго рода (не выбрать поиск, когда он нужен). У подхода есть недостатки: все случаи учесть нельзя, а язык со временем меняется. Более интересное, но долгое решение — разметка с помощью той же LLM, что используется в инференсе.

В качестве классификатора выберем энкодерный DistilBERT, потому что он легковесный. В зависимости от требований к скорости обучения и инференса можно пробовать модели вплоть до **microsoft/deberta-v3-large** и **answerdotai/ModernBERT-large**

In [4]:
def load_and_prepare_data():
    """Build the tokenized train/test splits used to fine-tune the query classifier.

    Pipeline:
      1. Load the ``train`` split of ``Config.classification_data``.
      2. Clean the ``instruction`` column: drop missing values, strip
         surrounding whitespace, drop empty strings, then drop duplicates.
      3. Label every instruction: 1 (search is required) when it contains any
         of ``Config.keywords``, otherwise 0 (search is not required).
      4. Stratified train/test split (``Config.test_size``, ``Config.seed``).
      5. Tokenize both splits with the ``Config.classifier`` tokenizer.

    Returns:
        DatasetDict with ``"train"`` and ``"test"`` splits. Each split holds
        the columns ``text`` and ``labels`` plus the tokenizer outputs.
    """
    tokenizer = AutoTokenizer.from_pretrained(Config.classifier)
    raw_df = load_dataset(Config.classification_data, split="train").to_pandas()

    # Order matters: missing values must go before `.str.strip()`, and stripping
    # must happen before de-duplication so that " foo" and "foo" collapse into
    # one row. `dropna(subset=...)` removes whole rows (dropping on the column
    # Series alone would leave the frame untouched).
    clean_df = (
        raw_df.dropna(subset=["instruction"])
        .assign(instruction=lambda frame: frame["instruction"].str.strip())
        .loc[lambda frame: frame["instruction"] != ""]
        .drop_duplicates(subset=["instruction"])
        .reset_index(drop=True)  # drop=True: do not leak the old index as a column
    )

    keywords = [keyword.lower() for keyword in Config.keywords]

    def label_query(text):
        """Labeling: 1 - search is required, 0 - search is not required"""
        lowered = text.lower()
        # NOTE: this is a substring match, so e.g. "new" also hits "renewable".
        return int(any(keyword in lowered for keyword in keywords))

    texts = clean_df["instruction"].tolist()
    labels = [label_query(text) for text in texts]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts,
        labels,
        test_size=Config.test_size,
        stratify=labels,  # keep the class ratio identical in both splits
        random_state=Config.seed,
    )

    split_datasets = DatasetDict(
        {
            "train": Dataset.from_pandas(
                pd.DataFrame({"text": train_texts, "labels": train_labels})
            ),
            "test": Dataset.from_pandas(
                pd.DataFrame({"text": test_texts, "labels": test_labels})
            ),
        }
    )

    def tokenize_function(examples):
        # Fixed-length padding: every example gets the same length, so the
        # Trainer can batch them without a padding-aware collator.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    return split_datasets.map(tokenize_function, batched=True)

In [None]:
# Run the whole load -> clean -> label -> split -> tokenize pipeline once.
# train_classifier() reads this module-level `datasets` variable directly.
datasets = load_and_prepare_data()

In [6]:
# Class balance of the training split: count of label 0 first, then label 1.
for label in (0, 1):
    print(datasets["train"]["labels"].count(label))

11374
449


## 5. Обучение классификатора

Гиперпараметры выбраны произвольно, с одним ограничением: обучение не должно падать с ошибкой OOM на GPU T4.



In [7]:
def train_classifier(tokenized_datasets=None):
    """Fine-tune the query classifier and save it together with its tokenizer.

    The base checkpoint ``Config.classifier`` is trained on the ``"train"``
    split and evaluated on the ``"test"`` split. Trainer checkpoints go to
    ``./results``; the final model and tokenizer are written to
    ``Config.finetuned_classifier_path``.

    Args:
        tokenized_datasets: Mapping with tokenized ``"train"`` and ``"test"``
            splits, as returned by ``load_and_prepare_data()``. When omitted,
            the module-level ``datasets`` variable is used, which keeps the
            original zero-argument call working.
    """
    if tokenized_datasets is None:
        # Backward-compatible fallback to the notebook-level variable.
        tokenized_datasets = datasets

    model = AutoModelForSequenceClassification.from_pretrained(
        Config.classifier, num_labels=Config.num_labels
    )
    tokenizer = AutoTokenizer.from_pretrained(Config.classifier)

    training_args = TrainingArguments(
        report_to="none",  # no external experiment trackers
        output_dir="./results",
        eval_strategy="steps",
        save_strategy="epoch",
        num_train_epochs=Config.num_train_epochs,
        learning_rate=Config.learning_rate,
        per_device_train_batch_size=Config.per_device_train_batch_size,
        per_device_eval_batch_size=Config.per_device_eval_batch_size,
        weight_decay=0.01,
        logging_steps=100,
        seed=Config.seed,  # same seed as the train/test split
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
    )
    trainer.train()

    # Save model and tokenizer side by side so the directory can be passed
    # straight to `from_pretrained` for both.
    trainer.save_model(Config.finetuned_classifier_path)
    tokenizer.save_pretrained(Config.finetuned_classifier_path)

In [8]:
# Either fine-tune the classifier now, or check that a previously saved copy
# can be loaded from Config.finetuned_classifier_path.
if not Config.train:
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            Config.finetuned_classifier_path
        )
        tokenizer = AutoTokenizer.from_pretrained(Config.finetuned_classifier_path)
    except Exception as err:
        # Best-effort check: report the problem and let the notebook continue.
        print(f"Error: {err}")
else:
    train_classifier()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
100,0.1553,0.074692
200,0.0565,0.048332
300,0.0398,0.02985


In [9]:
# Release cached CUDA memory before the next models are loaded.
torch.cuda.empty_cache()

# malloc_trim is specific to glibc. Guard the lookup so the cell does not
# crash where "libc.so.6" or the symbol is unavailable (e.g. macOS, Windows).
try:
    libc = ctypes.CDLL("libc.so.6")
except OSError:
    libc = None

# getattr() with a default also covers a libc that lacks the symbol.
malloc_trim = getattr(libc, "malloc_trim", None)
# Result of malloc_trim(0), or 0 when trimming is unavailable on this platform.
trimmed = malloc_trim(0) if malloc_trim is not None else 0
trimmed

1

## 6. Инициализация Qdrant с одной коллекцией (baseline)

При появлении новых коллекций класс QdrantHelper необходимо модифицировать так, чтобы название коллекции передавалось в методы аргументом.

В данном MVP не рассматривалась работа с текстами, что также является важным компонентом.

In [None]:
# Sentence-embedding model shared by indexing and search in QdrantHelper;
# downloaded weights are cached under /models.
vector_model = SentenceTransformer(Config.sentence_transformer, cache_folder='/models')

In [11]:
# Toy corpus for the demo collection: a filler string and one "recent" fact.
texts = [
    "aaa",
    "New python dunder method for API calls: __api__",
]

In [12]:
class QdrantHelper:
    """Small wrapper around one on-disk Qdrant collection of text embeddings.

    Texts are embedded with the module-level ``vector_model`` and stored with
    the original text in the point payload under the key ``"text"``.

    E5 embedding models expect every input to start with ``"query: "`` or
    ``"passage: "``. The helper adds these prefixes before encoding (the stored
    payload keeps the raw text). A collection indexed without prefixes should
    be recreated and re-uploaded before it is searched with prefixes.
    """

    def __init__(
        self,
        collection_name="data",
        vector_size=None,
        metric=None,
        query_prefix=None,
        passage_prefix=None,
    ):
        """Open the local Qdrant storage at ``/qdrant``.

        Args:
            collection_name: Name of the collection all methods operate on.
            vector_size: Embedding dimension. ``None`` (default) asks
                ``vector_model`` for it when the helper is created, instead of
                freezing the value when the class is defined.
            metric: Distance function. ``None`` (default) uses ``Config.metric``.
            query_prefix: Text prepended to a query before encoding.
            passage_prefix: Text prepended to a document before encoding.
                For both prefixes ``None`` (default) selects the E5 prefixes
                when ``Config.sentence_transformer`` names a non-instruct E5
                model, and an empty string otherwise.
        """
        self.client = QdrantClient(path="/qdrant")
        self.collection_name = collection_name
        self.vector_size = (
            vector_model.get_sentence_embedding_dimension()
            if vector_size is None
            else vector_size
        )
        self.metric = Config.metric if metric is None else metric

        model_name = Config.sentence_transformer.lower()
        # The "-instruct" E5 variants use a different prompt format, so the
        # plain prefixes are applied to the non-instruct models only.
        uses_e5_prefixes = "e5" in model_name and "instruct" not in model_name
        if query_prefix is None:
            query_prefix = "query: " if uses_e5_prefixes else ""
        if passage_prefix is None:
            passage_prefix = "passage: " if uses_e5_prefixes else ""
        self.query_prefix = query_prefix
        self.passage_prefix = passage_prefix

    def recreate_collection(self) -> None:
        """Clear all data and recreate collection"""
        # QdrantClient.recreate_collection is deprecated; the explicit
        # exists -> delete -> create sequence is its replacement.
        if self.client.collection_exists(self.collection_name):
            self.client.delete_collection(self.collection_name)
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=self.vector_size, distance=self.metric),
        )
        print(f"Collection '{self.collection_name}' is recreated!")

    def upload_collection(self, texts: list) -> None:
        """Adding text data and vectors to collection

        Point ids are the positions ``0..len(texts)-1``, so a later call
        overwrites the points written by an earlier call at the same positions.
        An empty ``texts`` uploads nothing.
        """
        texts = list(texts)
        if texts:
            embeddings = vector_model.encode(
                [f"{self.passage_prefix}{text}" for text in texts]
            )
            points = [
                # The payload keeps the raw text, without the encoding prefix.
                PointStruct(id=idx, vector=vector, payload={"text": text})
                for idx, (text, vector) in enumerate(zip(texts, embeddings.tolist()))
            ]
            self.client.upsert(collection_name=self.collection_name, points=points)
        print(f"{len(texts)} objects are added to '{self.collection_name}' collection!")

    def search(self, query: str, limit: int = 5) -> list:
        """Nearest vectors search

        Args:
            query: Search text; an empty query returns an empty list.
            limit: Maximum number of hits; values below 1 return an empty list.

        Returns:
            The ``"text"`` payloads of the nearest points, best match first.
        """
        if not query or limit < 1:
            return []
        query_vector = vector_model.encode(f"{self.query_prefix}{query}").tolist()
        # QdrantClient.search is deprecated in favour of query_points, which
        # wraps the hits in a response object exposing `.points`.
        response = self.client.query_points(
            collection_name=self.collection_name, query=query_vector, limit=limit
        )
        return [point.payload["text"] for point in response.points]

In [13]:
# Open the on-disk Qdrant storage; the helper targets the default "data" collection.
qdrant = QdrantHelper()

In [14]:
# Clear all data and recreate the collection before uploading.
qdrant.recreate_collection()

Collection 'data' is recreated!


In [15]:
# Embed the toy corpus and store the vectors together with the texts.
qdrant.upload_collection(texts)

2 objects are added to 'data' collection!


## 7. Инференс

Для инференса языковой модели <15B параметров можно использовать transformers+bitsandbytes, vllm+GPTQ или unsloth.

Загрузка моделей классификации и генерации вынесена в отдельный класс для удобства

In [16]:
class ModelLoader:
    """Loads the query classifier and the LLM, and answers queries with optional RAG.

    The fine-tuned classifier decides per query whether retrieval is needed
    (label 1) or the LLM should answer directly (label 0). Retrieval goes
    through the module-level ``qdrant`` helper.
    """

    def __init__(self):
        """Load the classifier from ``Config.finetuned_classifier_path`` and the 4-bit LLM ``Config.llm``."""
        # Classifier
        self.classifier_tokenizer = AutoTokenizer.from_pretrained(
            Config.finetuned_classifier_path
        )
        self.classifier_model = (
            AutoModelForSequenceClassification.from_pretrained(
                Config.finetuned_classifier_path
            )
            .to(Config.device)
            .eval()
        )

        # LLM
        self.tokenizer = AutoTokenizer.from_pretrained(Config.llm)
        self.model = AutoModelForCausalLM.from_pretrained(
            Config.llm,
            cache_dir="/models",
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            ),
            device_map="auto",
        ).eval()

    def _classify_query(self, query: str) -> int:
        """Query classification

        Returns:
            The index of the highest logit: 1 when search is required, 0 otherwise.
        """
        # Send the inputs to wherever the classifier actually lives.
        inputs = self.classifier_tokenizer(
            query, return_tensors="pt", padding=True, truncation=True
        ).to(self.classifier_model.device)
        with torch.no_grad():
            logits = self.classifier_model(**inputs).logits
        # argmax over the class dimension (a single query yields one value).
        return int(torch.argmax(logits, dim=-1).item())

    def generate_response(self, query: str):
        """Generate response with or without RAG

        Args:
            query: The user's question.

        Returns:
            The decoded completion only (prompt tokens removed), stripped of
            surrounding whitespace.
        """
        use_rag = self._classify_query(query) == 1

        if use_rag:
            print("Using RAG!")
            search_results = qdrant.search(query, limit=Config.num_search_results)
            context = " ".join(search_results)
            print(f"Query: {query}")
            print(f"Context: {context}")
            user_prompt = f"Context: {context}\n\nQuery: {query}\nAnswer:"
        else:
            print("Processing the query directly!")
            print(f"Query: {query}")
            user_prompt = query

        system_prompt = """
                      You are a helpful assistant.

                      - If the user provides context, use ONLY the information in the context to answer the question.
                      - If the context is missing or insufficient, state clearly: "I don't have enough information to answer this question."
                      - Do NOT make up facts.
                      - If no context is given, rely on your general knowledge.
                      """

        # Use this instance's tokenizer/model rather than a global variable, so
        # the method works regardless of what the instance is called.
        prompt = self.tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )

        # The chat template already inserted the special tokens. With
        # device_map="auto" the model's own device is the correct target.
        data = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(self.model.device)

        # max_new_tokens budgets the completion only. max_length would also
        # count the prompt, so a long retrieved context could leave no room
        # for the answer.
        # NOTE: Config.top_p / Config.temperature are not applied here.
        output_ids = self.model.generate(**data, max_new_tokens=Config.max_length)[0]

        prompt_length = data["input_ids"].shape[-1]
        return self.tokenizer.decode(
            output_ids[prompt_length:], skip_special_tokens=True
        ).strip()

In [None]:
# Instantiate the fine-tuned classifier and the 4-bit quantized LLM (see ModelLoader.__init__).
model_loader = ModelLoader()

In [18]:
# Demo query; it contains "latest" and "update", both listed in Config.keywords.
query = "What features were added in the latest Python update?"

In [19]:
# Classify the query, retrieve context if needed, and generate the answer.
output = model_loader.generate_response(query)
# Bare last expression so the notebook displays the answer.
output

Using RAG!
Query: What features were added in the latest Python update?
Context: New python dunder method for API calls: __api__


"I'm sorry, but the provided context only mentions a new dunder method `__api__` in Python, without any additional details about the latest Python update. Therefore, I don't have enough information to answer this question.\n\nIf you're looking for the latest Python updates, I recommend checking the official Python website or other reliable sources for the most recent information."