### База данных - запустить контейнер с базой

Исходник: https://github.com/qdrant/qdrant/blob/master/docs/QUICK_START.md

Use the latest pre-built image from DockerHub:

```bash
docker pull qdrant/qdrant
```

Run it with default configuration:

```bash
docker run -p 6333:6333 qdrant/qdrant
```

После запуска контейнера база слушает порт 6333; дальше отправляем в неё запросы.

In [2]:
import os

from omegaconf import OmegaConf

# Path to the project config. The default is the original author's local
# checkout; set the RAGCORE_CONFIG environment variable to point at your own
# deployment_config.yaml so the notebook runs on other machines unchanged.
path = os.environ.get(
    "RAGCORE_CONFIG",
    "/Users/maxwell/cursor/RAGCore/configs/deployment_config.yaml",
)
cfg = OmegaConf.load(path)

In [3]:
from src.core.db.vector_database import VectorDBClient

# Build the vector-database client from the loaded project config.
db = VectorDBClient(cfg)

In [4]:
# List the collections currently in the database (none yet, per the output).
db.get_collections()

{'result': {'collections': []}, 'status': 'ok', 'time': 8.636e-06}

In [5]:
# Create the collection whose name is configured under database.collection_name.
db.create_collection(cfg['database']['collection_name'])

{'result': True, 'status': 'ok', 'time': 0.090262829}

In [6]:
# List the collections again; the output now includes the one just created.
db.get_collections()

{'result': {'collections': [{'name': 'github_code_chunks'}]},
 'status': 'ok',
 'time': 1.7536e-05}

In [7]:
# Fetch the collection's info (status, point counts and config, per the output).
db.get_collection(cfg['database']['collection_name'])

{'result': {'status': 'green',
  'optimizer_status': 'ok',
  'indexed_vectors_count': 0,
  'points_count': 0,
  'segments_count': 4,
  'config': {'params': {'vectors': {'size': 1536, 'distance': 'Cosine'},
    'shard_number': 1,
    'replication_factor': 1,
    'write_consistency_factor': 1,
    'on_disk_payload': True},
   'hnsw_config': {'m': 16,
    'ef_construct': 100,
    'full_scan_threshold': 10000,
    'max_indexing_threads': 0,
    'on_disk': False},
   'optimizer_config': {'deleted_threshold': 0.2,
    'vacuum_min_vector_number': 1000,
    'default_segment_number': 0,
    'max_segment_size': None,
    'memmap_threshold': None,
    'indexing_threshold': 10000,
    'flush_interval_sec': 5,
    'max_optimization_threads': None},
   'wal_config': {'wal_capacity_mb': 32,
    'wal_segments_ahead': 0,
    'wal_retain_closed': 1},
   'quantization_config': None},
  'payload_schema': {}},
 'status': 'ok',
 'time': 0.000715613}

### Модель эмбеддингов

In [8]:
from src.core.embedder.embedder import EmbeddingModel

# Instantiate the embedding model from the same project config.
embedding_model = EmbeddingModel(cfg)

In [9]:
# Load the test data

import json

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default.
with open("/Users/maxwell/cursor/RAGCore/packages/repochunker/src/repochunker/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Show the first chunk's source text, then the full record (content + metadata).
print(chunks[0]['content'])
print(chunks[0])

from pydantic import BaseModel, Field


class ChunkerConfig(BaseModel):
    language: str
    max_chunk_size: int
    chunk_overlap: int
    chunk_expansion: bool
    metadata_template: str = "default"

    extensions: list[str] = Field(default_factory=list)

class Metadata(BaseModel):
    filepath: str
    chunk_size: int
    line_count: int
    start_line_no: int
    end_line_no: int
    node_count: int

class Chunk(BaseModel):
    content: str
    metadata: Metadata

{'content': 'from pydantic import BaseModel, Field\n\n\nclass ChunkerConfig(BaseModel):\n    language: str\n    max_chunk_size: int\n    chunk_overlap: int\n    chunk_expansion: bool\n    metadata_template: str = "default"\n\n    extensions: list[str] = Field(default_factory=list)\n\nclass Metadata(BaseModel):\n    filepath: str\n    chunk_size: int\n    line_count: int\n    start_line_no: int\n    end_line_no: int\n    node_count: int\n\nclass Chunk(BaseModel):\n    content: str\n    metadata: Metadata\n', 'metadata': 

In [10]:
from src.core.schemas import Chunk, IndexRequest, IndexConfig, MetaRequest

# Convert the first two raw chunk dicts into the project's Chunk schema.
test_chunks = [
    Chunk(content=raw['content'], metadata=raw['metadata'])
    for raw in chunks[:2]
]

In [11]:
import uuid

# Vectorize the chunks; part of the input data is mocked for this demo.
req_id = uuid.uuid4()  # random request id standing in for a real one

index_request = IndexRequest(
    repo_url='https://github.com/openai/openai-python',
    meta=MetaRequest(request_id=req_id)
)

# vectorize() is awaited at top level, so this cell relies on the notebook's
# top-level await support.
# NOTE(review): the meaning of the third argument (None) is not visible here;
# confirm against EmbeddingModel.vectorize.
vectors_data = await embedding_model.vectorize(test_chunks, index_request, None)

In [13]:
# Inspect the vectorized records (dicts with 'id' and 'vector', per the output below).
vectors_data

[{'id': '78fb281c-3c9a-49af-8fa8-68997cbc193c',
  'vector': [0.01815818,
   0.02105339,
   -0.00293861,
   -0.03056077,
   0.02057442,
   -0.02467942,
   0.01812282,
   -0.0125856,
   0.00099903,
   0.02842398,
   -0.04289309,
   0.03367573,
   -0.00470511,
   0.06091832,
   -0.00713084,
   -0.00385877,
   -0.01978357,
   -0.03127203,
   0.00962215,
   0.0700781,
   0.06040158,
   0.01205567,
   -0.0095287,
   -0.0232878,
   0.01776158,
   0.03311696,
   0.01805385,
   0.08909988,
   -0.00322376,
   0.00104461,
   0.02271637,
   -0.00458138,
   -0.01013929,
   0.00929134,
   -0.01315699,
   0.00503448,
   -0.0319005,
   0.02028283,
   0.02373008,
   -0.09271255,
   -0.03085542,
   -0.01961322,
   -0.03074773,
   -0.02838016,
   0.0251779,
   0.02639892,
   -0.00190654,
   0.04874963,
   -0.01801892,
   -0.02434382,
   0.02425104,
   0.00761469,
   0.03502927,
   0.05025203,
   -0.00995194,
   -0.00380208,
   -0.02478834,
   -0.01785796,
   0.00909733,
   0.02297528,
   0.01483181,
   0

In [14]:
# Add the vectors to the collection in the database.
response = db.add_vectors(cfg['database']['collection_name'], vectors_data)
response

{'result': {'operation_id': 1, 'status': 'completed'},
 'status': 'ok',
 'time': 0.003821621}

In [15]:
# Embed the query; the first test chunk's content is reused as the query text.
# embed() is given a one-element list, and [0] picks the single resulting vector.
search_vector = embedding_model.embed([test_chunks[0].content])[0]
search_vector

[0.01814954,
 0.0211408,
 -0.00299843,
 -0.0307249,
 0.02071762,
 -0.02468546,
 0.01798987,
 -0.01253637,
 0.00087511,
 0.02838266,
 -0.04293441,
 0.03387573,
 -0.00470213,
 0.06099875,
 -0.0070663,
 -0.00407763,
 -0.01990648,
 -0.03117101,
 0.00979667,
 0.07010167,
 0.0602043,
 0.01200881,
 -0.0096089,
 -0.02328916,
 0.01791458,
 0.03332367,
 0.01780263,
 0.08900617,
 -0.00302816,
 0.00107758,
 0.02246584,
 -0.00454444,
 -0.01011774,
 0.00933816,
 -0.01302724,
 0.00499837,
 -0.03194283,
 0.02028839,
 0.02381883,
 -0.09264617,
 -0.03099883,
 -0.01956094,
 -0.03070908,
 -0.02820843,
 0.02520495,
 0.02637664,
 -0.00193924,
 0.04869017,
 -0.01816582,
 -0.02412806,
 0.02420895,
 0.00767145,
 0.03497426,
 0.050176,
 -0.00986968,
 -0.00385309,
 -0.02488321,
 -0.01781294,
 0.0091545,
 0.02298568,
 0.01499787,
 0.07194573,
 -0.0176538,
 0.00308909,
 -0.00658169,
 -0.02826307,
 -0.00548154,
 -0.03176025,
 0.01506384,
 0.0084839,
 0.02423443,
 0.01649438,
 -0.02314906,
 0.02013927,
 0.01534245,


In [16]:
# Search the collection by the query vector.
response = db.search_by_vector(cfg['database']['collection_name'], search_vector)
response

{'result': [{'id': '78fb281c-3c9a-49af-8fa8-68997cbc193c',
   'version': 1,
   'score': 0.9999828,
   'payload': {'chunk_id': '78fb281c-3c9a-49af-8fa8-68997cbc193c',
    'filepath': '',
    'chunk_size': 370,
    'line_count': 24,
    'start_line_no': 0,
    'end_line_no': 23,
    'node_count': 1,
    'language': None,
    'repo_url': 'https://github.com/openai/openai-python',
    'request_id': 'facd1819-37ea-46b1-9c48-5041ae14deef',
    'content': 'from pydantic import BaseModel, Field\n\n\nclass ChunkerConfig(BaseModel):\n    language: str\n    max_chunk_size: int\n    chunk_overlap: int\n    chunk_expansion: bool\n    metadata_template: str = "default"\n\n    extensions: list[str] = Field(default_factory=list)\n\nclass Metadata(BaseModel):\n    filepath: str\n    chunk_size: int\n    line_count: int\n    start_line_no: int\n    end_line_no: int\n    node_count: int\n\nclass Chunk(BaseModel):\n    content: str\n    metadata: Metadata\n'}},
  {'id': 'a2daa9ef-055f-4400-a9d9-973827fce2