In [6]:
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import text

load_dotenv()

# Connection settings are read from the environment after load_dotenv() has run.
DB_USER = os.getenv('DB_USER')
DB_PASS = os.getenv('DB_PASS')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')
PATH_TO_CERT = os.getenv('PATH_TO_CERT')

# Stop here with a readable message instead of building a URL that contains the
# literal text "None" for an unset variable.
_missing_settings = [
    name
    for name, value in (
        ("DB_USER", DB_USER),
        ("DB_PASS", DB_PASS),
        ("DB_HOST", DB_HOST),
        ("DB_PORT", DB_PORT),
        ("DB_NAME", DB_NAME),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment: " + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so that characters such as '@', ':' or '/'
# inside the user name or password are not read as URL delimiters.
_url_user = quote(DB_USER, safe="")
_url_password = quote(DB_PASS, safe="")

connection_str = f"postgresql+psycopg2://{_url_user}:{_url_password}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

# connect_args carries the TLS options and the requested session attribute
# alongside the URL.
engine = create_engine(
    connection_str,
    connect_args={
        "sslmode": "verify-full",
        "sslrootcert": PATH_TO_CERT,
        "target_session_attrs": "read-write"
    }
)

In [11]:
# SQL that removes the `posts` table when it exists.
# NOTE(review): the variable is called create_table_query although the
# statement it holds is a DROP.
create_table_query = """
drop table if exists posts;
"""

try:
    # Run the statement inside an engine.begin() block.
    with engine.begin() as conn:
        conn.execute(text(create_table_query))
except Exception as e:
    # Any failure is printed instead of raised, so this cell never stops the run.
    print("–û—à–∏–±–∫–∞:", e)


In [4]:
# Load every row of `posts` into a DataFrame; the connection is closed when the
# with-block ends.
with engine.connect() as conn:
    df = pd.read_sql(
        sql='''
select * from posts
    ''',
        con=conn,
    )

df

Unnamed: 0,id,text,created_at


In [73]:
def strip_edges_allow_punct(s: str):
    """Trim non-text characters from both ends of ``s``.

    The left edge is cut up to the first alphanumeric character. The right edge
    is cut back to the last character that is either alphanumeric or one of the
    punctuation characters kept at the end of a line.

    Args:
        s: Line of text to trim.

    Returns:
        The trimmed substring, or ``""`` when no alphanumeric character is found
        at or before the last kept character.
    """
    trailing_ok = set(".,!?;:-‚Äì‚Äî")  # can be extended

    # Index of the first alphanumeric character; len(s) when there is none.
    start = next(
        (idx for idx, ch in enumerate(s) if ch.isalnum()),
        len(s),
    )

    # Index of the last alphanumeric-or-allowed character; -1 when there is none.
    end = next(
        (
            idx
            for idx in range(len(s) - 1, -1, -1)
            if s[idx].isalnum() or s[idx] in trailing_ok
        ),
        -1,
    )

    # Nothing usable between the two edges.
    if end < start:
        return ""

    return s[start:end + 1]



def process_str(s):
    """Clean the text of one post.

    Steps, in order:
      1. Every non-empty line is passed through ``strip_edges_allow_punct``.
      2. Each entry of a fixed list of footer strings is removed from the end of
         the text, stripping surrounding whitespace after each attempt.
      3. Trailing lines that contain one of the two credit markers are dropped
         one at a time until the last line contains neither.

    Args:
        s: Raw post text. A value that is not a ``str`` (for example the NaN
            that pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text with empty lines removed, or ``""`` when nothing is left.
    """
    # Non-string input has no text to clean; return "" instead of failing on .split().
    if not isinstance(s, str):
        return ""

    s = "\n".join(strip_edges_allow_punct(p) for p in s.split("\n") if p)

    # NOTE(review): this is a single pass in list order, so footers stacked in a
    # different order than the list are only partly removed — confirm that is intended.
    for suf in [
        "–°–ª—É—à–∞—Ç—å –ø—Ä—è–º–æ–π —ç—Ñ–∏—Ä",
        "–ß–∏—Ç–∞—Ç—å –†–ë–ö –°—Ç–∏–ª—å –≤ Telegram",
        "–†–ë–ö Events, 18",
        "–ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è | –û–Ω–ª–∞–π–Ω-—Å–æ–º–µ–ª—å–µ",
        "–ß–∏—Ç–∞—Ç—å –†–ë–ö –≤ Telegram",
        "–°–ª–µ–¥–∏—Ç—å –∑–∞ –Ω–æ–≤–æ—Å—Ç—è–º–∏ –†–ë–ö –≤ Telegram",
        "–°–ª–µ–¥–∏—Ç—å –∑–∞ –Ω–æ–≤–æ—Å—Ç—è–º–∏ –†–ë–ö –≤ –ú–ê–•",
        "–î—Ä—É–≥–∏–µ –≤–∏–¥–µ–æ —ç—Ç–æ–≥–æ –¥–Ω—è ‚Äî –≤ —Ç–µ–ª–µ–≥—Ä–∞–º-–∫–∞–Ω–∞–ª–µ –†–ë–ö",
        "–†–ë–ö –≤ Telegram –∏ MAX",
        "–†–ë–ö –≤ Telegram | MAX",
        "–ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è –Ω–∞ ¬´–†–ë–ö –°–ø–æ—Ä—Ç",
        "–ö–∞—Ä—Ç–∏–Ω–∞ –¥–Ω—è ‚Äî –≤ —Ç–µ–ª–µ–≥—Ä–∞–º-–∫–∞–Ω–∞–ª–µ –†–ë–ö",
        "–°–∞–º—ã–µ –≤–∞–∂–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏ ‚Äî –≤ –∫–∞–Ω–∞–ª–µ –†–ë–ö –≤ –ú–ê–•",
        "–ë–æ–ª—å—à–µ –∏–Ω—Ñ–æ–≥—Ä–∞—Ñ–∏–∫–∏ ‚Äî –≤ —Ç–µ–ª–µ–≥—Ä–∞–º-–∫–∞–Ω–∞–ª–µ –†–ë–ö",
        "–ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è –Ω–∞ ¬´–°–∞–º —Ç—ã –∏–Ω–≤–µ—Å—Ç–æ—Ä!",
        "–ß–∏—Ç–∞—Ç—å –†–ë–ö –ù–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç—å –≤ Telegram"
    ]:
        s = s.removesuffix(suf).strip()

    parts = [p for p in s.split("\n") if p]

    # Test the last line directly rather than comparing list lengths against a
    # fixed-size placeholder, so the trimming works for any number of lines.
    while parts and ("–§–æ—Ç–æ:" in parts[-1] or "–î–∞–Ω–Ω—ã–µ:" in parts[-1]):
        parts.pop()

    return "\n".join(parts)

import pandas as pd

# Fixed seed for the row shuffle below, so a fresh kernel run reproduces the
# same row order.
SHUFFLE_SEED = 42

rbc = pd.read_csv("src/dataset/rbc/channel_rbc_news_posts.csv")
# Keep only the calendar date of each message timestamp.
rbc["message_dt"] = pd.to_datetime(rbc["message_dt"]).dt.date
# Keep the five columns listed, force integer view counts, and shuffle all rows.
rbc = (
    rbc[["message_id", "channel_id", "message_dt", "views", "content"]]
    .astype({"views": int})
    .sample(frac=1, random_state=SHUFFLE_SEED)
)
rbc["content"] = rbc["content"].apply(process_str)
# Drop rows whose cleaned content is empty, and rows whose last non-empty line
# contains one of the two marker strings below.
rbc = rbc[rbc["content"].apply(
        lambda x: not any(v in [p for p in x.split("\n") if p][-1] for v in ["–†–µ–∫–ª–∞–º–∞.", "–†–µ–∫–ª–∞–º–∞,"]) if x else False
    )]
rbc

Unnamed: 0,message_id,channel_id,message_dt,views,content
4498,130471,rbc_news,2025-09-12,139946,–ê–¥–≤–æ–∫–∞—Ç –±—ã–≤—à–µ–≥–æ –º—ç—Ä–∞ –°—Ç–∞–º–±—É–ª–∞ –≠–∫—Ä–µ–º–∞ –ò–º–∞–º–æ–≥–ª—É ...
1054,134025,rbc_news,2025-10-22,112469,–ì–æ—Å–¥—É–º–∞ –ø—Ä–∏–Ω—è–ª–∞ –≤ –ø–µ—Ä–≤–æ–º —á—Ç–µ–Ω–∏–∏ –ø—Ä–æ–µ–∫—Ç –±—é–¥–∂–µ—Ç–∞...
283,136520,rbc_news,2025-11-24,118238,–°–∏–ª—ã –ü–í–û –≤ –ø–µ—Ä–∏–æ–¥ —Å 14:00 –¥–æ 20:00 –º—Å–∫ —É–Ω–∏—á—Ç–æ–∂...
4406,124142,rbc_news,2025-07-15,122075,–í –ì–æ—Å–¥—É–º–µ –∏–∑-–∑–∞ –º–æ—â–Ω—ã—Ö –ª–∏–≤–Ω–µ–π –∑–∞—Ç–æ–ø–∏–ª–æ –∫—É—Ä–∏–ª–∫—É...
1378,135060,rbc_news,2025-11-05,114842,–ê–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∞—è –∞–∫—Ç—Ä–∏—Å–∞ –∏ –ø–æ—Å–æ–ª –¥–æ–±—Ä–æ–π –≤–æ–ª–∏ –Æ–ù–ò–°–ï...
...,...,...,...,...,...
1467,130799,rbc_news,2025-09-17,135139,"–ö–∏–Ω–æ–∫–æ–º–ø–∞–Ω–∏–∏ Walt Disney, Universal –∏ Warner B..."
4581,123624,rbc_news,2025-07-10,121628,–ì–ª–∞–≤–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏ –∫ —É—Ç—Ä—É ‚Äî –Ω–∞ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª–µ –†–ë–ö
4281,130775,rbc_news,2025-09-16,120652,–ü—É—Ç–∏–Ω –ø—Ä–∏–º–µ—Ä–∏–ª —Ç–µ–ø–ª–æ–≤–∏–∑–∏–æ–Ω–Ω—ã–µ –æ—á–∫–∏ ¬´–°—Ç—Ä–µ–∫–æ–∑–∞¬ª ...
241,136585,rbc_news,2025-11-25,101214,–†–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –ø–æ—Å–ª–∞ –≤—ã–∑–≤–∞–ª–∏ –≤ –ú–ò–î –ú–æ–ª–¥–∞–≤–∏–∏ –∏–∑-–∑–∞...


In [74]:
# SQL that removes the `posts` table when it exists.
# NOTE(review): the variable is called create_table_query although the
# statement it holds is a DROP.
create_table_query = """
drop table if exists posts;
"""

try:
    # Run the statement inside an engine.begin() block.
    with engine.begin() as conn:
        conn.execute(text(create_table_query))
except Exception as e:
    # Any failure is printed instead of raised, so the upload below still runs.
    print("–û—à–∏–±–∫–∞:", e)

# Write the cleaned frame to `posts` without the DataFrame index.
# NOTE(review): if_exists='replace' is already requested here, so the explicit
# DROP above looks redundant — confirm before removing it.
rbc.to_sql('posts', engine, if_exists='replace', index=False)

800

In [22]:
# Load every row of `posts` into a DataFrame; the connection is closed when the
# with-block ends.
with engine.connect() as conn:
    df = pd.read_sql(
        sql='''
select * from posts
    ''',
        con=conn,
    )

df

Unnamed: 0,message_id,channel_id,message_dt,views,content
0,137228,rbc_news,2025-12-03,40045,–°—É–¥ –ø—Ä–∏–∑–Ω–∞–ª –ø–∏—Å–∞—Ç–µ–ª—è –ë–æ—Ä–∏—Å–∞ –ê–∫—É–Ω–∏–Ω–∞ (–Ω–∞—Å—Ç–æ—è—â–µ–µ...
1,137226,rbc_news,2025-12-03,53463,"–ù–∞ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–µ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–æ–≤ —Å –°–®–ê, –ø—Ä–æ—à–µ–¥—à–∏—Ö –Ω–∞–∫–∞..."
2,137224,rbc_news,2025-12-03,56667,–ï–≤—Ä–æ–∫–æ–º–∏—Å—Å–∏—è –Ω–∞–º–µ—Ä–µ–Ω–∞ –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –∏—Å–ø–æ–ª–Ω–µ–Ω–∏–µ –≤–Ω—É...
3,137223,rbc_news,2025-12-03,55604,üéô –ü—Ä—è–º–æ —Å–µ–π—á–∞—Å –≤ —ç—Ñ–∏—Ä–µ –†–∞–¥–∏–æ –†–ë–ö –æ–±—Å—É–∂–¥–∞–µ–º –ø–ª–∞...
4,137222,rbc_news,2025-12-03,61798,–ö–æ–ª–ª–µ–≥–∏—è –ï–≤—Ä–æ–∫–æ–º–∏—Å—Å–∏–∏ –æ–¥–æ–±—Ä–∏–ª–∞ ¬´–ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã–π ...
...,...,...,...,...,...
4820,116046,rbc_news,2025-04-15,106022,"–†–µ–ø–æ—Ä—Ç–∞–∂ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª–∞ –†–ë–ö –∏–∑ –ö—É—Ä—Å–∫–∞, –∫–æ—Ç–æ—Ä—ã–π –Ω–æ—á..."
4821,116045,rbc_news,2025-04-15,108803,–°—É–¥ –Ω–∞ –°–∞—Ö–∞–ª–∏–Ω–µ –≤—ã–Ω–µ—Å –ø–µ—Ä–≤–æ–µ —Ä–µ—à–µ–Ω–∏–µ –ø–æ –¥–µ–ª—É –æ...
4822,116044,rbc_news,2025-04-15,108074,–û–ø–µ—Ä–∞—Ç–æ—Ä –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω—ã—Ö –ª–æ—Ç–µ—Ä–µ–π –§—Ä–∞–Ω—Ü–∏–∏ —Å—Ç–∞–ª –æ—Ç–≤...
4823,116041,rbc_news,2025-04-15,125020,–ê—ç—Ä–æ–ø–æ—Ä—Ç —é–∂–Ω–æ–∫–æ—Ä–µ–π—Å–∫–æ–≥–æ –≥–æ—Ä–æ–¥–∞ –ú—É–∞–Ω —Ä–µ–≥—É–ª—è—Ä–Ω–æ ...


In [40]:
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
import numpy as np
import os

load_dotenv()

# Qdrant address, read from the environment after load_dotenv() has run.
QDRANT_URL = os.environ.get("QDRANT_URL")

client = QdrantClient(url=QDRANT_URL)

In [2]:
# Create the demo collection only when it is missing, so that re-running this
# cell (or the whole notebook) does not fail on an already existing collection.
if not client.collection_exists("my_collection"):
    client.create_collection(
        collection_name="my_collection",
        vectors_config=models.VectorParams(
            size=384,
            distance=models.Distance.COSINE
        )
    )

# Cell output: whether the collection is present after this step.
client.collection_exists("my_collection")


True

In [None]:
# Two demo points with opposite constant vectors: every component is +1.0 for
# id 1 and -1.0 for id 2.
points = [
    models.PointStruct(
        id=point_id,
        vector=(sign * np.ones(384)).tolist(),
        payload={"text": label},
    )
    for point_id, sign, label in (
        (1, 1.0, "–ü—Ä–∏–º–µ—Ä –¥–æ–∫—É–º–µ–Ω—Ç–∞ 1"),
        (2, -1.0, "–ü—Ä–∏–º–µ—Ä –¥–æ–∫—É–º–µ–Ω—Ç–∞ 2"),
    )
]

# wait=True is passed with the upsert; the returned result is the cell output.
client.upsert(collection_name="my_collection", points=points, wait=True)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [42]:
# Query the demo collection with an all -1.0 vector and keep a single result.
results = client.query_points(
    collection_name="my_collection",
    limit=1,
    query=(-1 * np.ones(384)).tolist(),
)

In [43]:
# Show the first point of the query result (the query was issued with limit=1).
results.points[0]

ScoredPoint(id=2, version=3, score=0.9999998, payload={'text': '–ü—Ä–∏–º–µ—Ä –¥–æ–∫—É–º–µ–Ω—Ç–∞ 2'}, vector=None, shard_key=None, order_value=None)