In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install elasticsearch eng_to_ipa faiss-cpu numpy==1.25 fasttext symspellpy pandarallel rapidfuzz yargy catboost

Collecting elasticsearch
  Downloading elasticsearch-8.11.0-py3-none-any.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.6/412.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting eng_to_ipa
  Downloading eng_to_ipa-0.0.2.tar.gz (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.25
  Downloading numpy-1.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext
  Downloading fasttext-0.9

In [5]:
path = '/content/drive/MyDrive/JETFORK_11_2023/data/'

In [6]:
import sys
sys.path.insert(0, path)

from pandarallel import pandarallel
import rapidfuzz
import re
import json
import fasttext
import faiss
import pandas as pd
import pickle
import numpy as np
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm
from yargy import (
    Parser,
    rule, or_, and_
)
from yargy.predicates import (
    eq, type, normalized,
    gte, lte,
    dictionary,
)
from yargy.interpretation import fact
from catboost import CatBoostRanker, Pool
pandarallel.initialize(progress_bar=False)

from spelling_checker.sym_spell_servicer import SymSpellRouterServicer

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Подгрузка запросов

In [7]:
df = pd.read_csv(path + 'test_dataset_submission_queries.csv')

In [8]:
df

Unnamed: 0,query
0,Битва сильнейших экстрасенсов 2023 смотреть | ...
1,битва сильнейших экстрасенсов 2023\nбитва силь...
2,"Экстрасенсы. Битва сильнейших, 4 выпуск"
3,супер стар 4 сезон
4,пять ночей с Фредди
...,...
1995,Шоу аватар 2023
1996,Макс
1997,смотреть мужское женское 2019
1998,шоу вована и лексуса


# Предобработка и исправление опечаток

In [9]:
spell_checker = SymSpellRouterServicer()

In [10]:
%%time
df['clean_query'] = df['query'].apply(lambda x: spell_checker.predict_single_correction(
                                                                                          x,
                                                                                          use_preprocessing=True,
                                                                                          use_keyboard_inverter=False,
                                                                                          use_correction=True,
                                                                                  )[1])

CPU times: user 2min 21s, sys: 363 ms, total: 2min 22s
Wall time: 2min 22s


In [16]:
124/2000

0.062

# Поиск похожих запросов в faiss индексе

In [11]:
with open(path + 'ind2queries_ft.json', 'r') as f:
    ind2query_ft = json.load(f)
model = fasttext.load_model(path + 'model/00_fasttext_queries.bin')
index = faiss.read_index(path + 'faiss_ft_index.index')



# Поиск кандидатов в индексе в эластике

In [12]:
es = Elasticsearch("http://5.35.83.33:9200/", basic_auth=("elastic", ""))

In [19]:
def construct_where_condition(query: str, priority_factor: float = 1.0) -> dict:
    if not query:
        return []
    return [
        {
            "match": {
                "processed_video_title": {
                    "query": query,
                    "boost": 3 * priority_factor,  # Большой boost для точных совпадений
                }
            }
        },
        {
            "match": {
                "processed_video_title.ngram": {
                    "query": query,
                    "boost": 2 * priority_factor,  # Умеренный boost для n-gram совпадений
                }
            }
        },
        {
            "fuzzy": {
                "processed_video_title.fuzzy": {
                    "value": query,
                    "fuzziness": "AUTO",
                    "prefix_length": 0,
                    "max_expansions": 50,
                    "transpositions": True,
                    "rewrite": "constant_score_blended",
                    "boost": 1 * priority_factor,  # Небольшой boost для fuzzy-поиска
                }
            }
        },
        {
            "match": {
                "processed_channel_title": {
                    "query": query,
                    "boost": 1 * priority_factor,  # Низкий boost для каналов
                }
            }
        },
    ]


def query_es_get_best(
    original_query: str,
    page: int = 1,
    page_size: int = 100
):
    query_vector = model.get_sentence_vector(original_query)

    where_condition: list = [
        *construct_where_condition(original_query, 1.0),
    ]
    body = {
        "from": page_size / page,
        "size": page_size,
        "query": {"bool": {"should": where_condition}},
        "knn": {
            "field": "video-vector",
            "query_vector": query_vector,
            "k": 5,
            "num_candidates": 50,
            "boost": 2,
        },
        "_source": [
            "id",
            "source_channel_title",
            "processed_video_title",
            "processed_channel_title",
        ],
    }
    index = 'videos_index__v1'

    response = es.search(index=index, body=body)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [20]:
candidates = [(query,  query_es_get_best(query))
              for query in tqdm(df['clean_query'].values)]

100%|██████████| 2000/2000 [15:12<00:00,  2.19it/s]


# Реранжирование кандидатов

In [21]:
nan_replacements = {
    'v_year_views': 0,
    'v_month_views': 0,
    'v_week_views': 0,
    'v_day_views': 0,
    'v_likes': 0,
    'v_dislikes': 0,
    'v_cr_click_like_7_days': 0,
    'v_cr_click_vtop_7_days': 0,
    'v_cr_click_long_view_7_days': 0,
    'v_cr_click_comment_7_days': 0,
    'v_cr_click_like_30_days': 0,
    'v_cr_click_vtop_30_days': 0,
    'v_cr_click_long_view_30_days': 0,
    'v_cr_click_comment_30_days': 0,
    'v_cr_click_like_1_days': 0,
    'v_cr_click_vtop_1_days': 0,
    'v_cr_click_long_view_1_days': 0,
    'v_cr_click_comment_1_days': 0,
    'query_video_avg_watchtime': 0,
    'query_video_avg_comment': 0,
    'query_video_num_clicks': 0,
    'query_channel_avg_watchtime': 0,
    'query_channel_num_clicks': 0,
    'mean_rel_video_per_query': 0,
    'mean_rel_query_per_video': 0,
    'mean_rel_tokens_per_video': 0,
}

In [22]:
DOT = eq('.')
INT = type('INT')
SPACE = eq(' ')


Date = fact(
    'Date',
    ['year', 'month', 'day']
)


MONTHS = {
    'январь': 1,
    'февраль': 2,
    'март': 3,
    'апрель': 4,
    'май': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12,
}

MONTH_NAME = dictionary(MONTHS).interpretation(
    Date.month.normalized().custom(MONTHS.get)
)

MONTH = and_(
    INT,
    gte(1),
    lte(12)
).interpretation(
    Date.month.custom(int)
)

YEAR = and_(
    INT,
    gte(1000),
    lte(3000)
).interpretation(
    Date.year.custom(int)
)

YEAR_SUFFIX = rule(
    or_(
        eq('г'),
        normalized('год')
    ),
    DOT.optional()
)

DAY = and_(
    INT,
    gte(1),
    lte(31)
).interpretation(
    Date.day.custom(int)
)

DATE = or_(
    rule(
        YEAR,
        YEAR_SUFFIX
    ),
    rule(
        MONTH_NAME,
        YEAR
    ),
    rule(
        DAY,
        DOT.optional(),
        MONTH,
        DOT.optional(),
        YEAR
    ),
    rule(
        YEAR,
        DOT.optional(),
        MONTH,
        DOT.optional(),
        DAY
    ),
    rule(
        DAY,
        MONTH_NAME,
        YEAR
    ),

)

DATE = rule(
    DATE,
).interpretation(
    Date
)

In [23]:
date_parser = Parser(DATE)

query_hist = pd.read_parquet(path + 'for_inference/query_hist.parquet') \
  .rename(columns={'video_id': 'id'})
query_channel_hist = pd.read_parquet(path + 'for_inference/query_channel_hist.parquet') \
  .rename(columns={'channel_title': 'source_channel_title'})
with open(path + 'for_inference/query_video_rel.pickle', 'rb') as handle:
    query_video_rel = pickle.load(handle)
with open(path + 'for_inference/video_query_rel.pickle', 'rb') as handle:
    video_query_rel = pickle.load(handle)
with open(path + 'for_inference/video_query_rel_2.pickle', 'rb') as handle:
    video_query_rel_2 = pickle.load(handle)
with open(path + 'for_inference/features_nov-001.pickle', 'rb') as handle:
    video_features = pickle.load(handle)
with open(path + 'for_inference/backfill.pickle', 'rb') as handle:
    backfill = pickle.load(handle)

ranker = CatBoostRanker()
ranker.load_model(path + "model/ranker.ckpt")

episode_re = re.compile(r'\d+ *(?:выпуск|серия|эпизод)|(?:выпуск|серия|эпизод) *\d+')
season_re = re.compile(r'\d+ *сезон|сезон *\d+')
number_re = re.compile(r'\d+')

In [24]:
def match_dates(query: str, video: str) -> float:
  res_query = date_parser.findall(query)
  spans_query = [_.span for _ in res_query]

  res_video = date_parser.findall(video)
  spans_video = [_.span for _ in res_video]

  if (not spans_query and spans_video) or \
      (spans_query and not spans_video):
    return 0.0

  if not spans_query and not spans_video:
    return -1.0

  spans_query = query[spans_query[0].start : spans_query[0].stop]
  spans_video = video[spans_video[0].start : spans_video[0].stop]

  if spans_query == spans_video:
    return 1.0

  return len(set(spans_query) & set(spans_video)) / max(len(set(spans_query)), len(set(spans_query)))


def match_episodes(query: str, video: str) -> float:
  res_query = episode_re.search(query)
  res_video = episode_re.search(query)

  if (not res_query and res_video) or \
      (res_query and not res_video):
    return 0.0

  if not res_query and not res_video:
    return -1.0

  res_query = number_re.findall(res_query.group(0))[0]
  res_video = number_re.findall(res_video.group(0))[0]

  if res_query == res_video:
    return 1.0

  return 0.0


def match_season(query: str, video: str) -> float:
  res_query = season_re.search(query)
  res_video = season_re.search(query)

  if (not res_query and res_video) or \
      (res_query and not res_video):
    return 0.0

  if not res_query and not res_video:
    return -1.0

  res_query = number_re.findall(res_query.group(0))[0]
  res_video = number_re.findall(res_video.group(0))[0]

  if res_query == res_video:
    return 1.0

  return 0.0

In [25]:
def rerank(query: str, candidates: list[dict[str, str]], k: int = 5) -> list[str]:
    try:
        df = pd.concat([
            pd.DataFrame({'query': [query] * len(candidates)}),
            pd.DataFrame.from_records(candidates)
        ], axis=1)
        df['clean_video_text'] = df['processed_video_title'].fillna('') + ' ' + df['processed_channel_title'].fillna('')
        df.drop(['processed_video_title', 'processed_channel_title'], axis=1, inplace=True)

        df['v_year_views'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_year_views', 0))
        df['v_month_views'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_month_views', 0))
        df['v_week_views'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_week_views', 0))
        df['v_day_views'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_day_views', 0))
        df['v_likes'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_likes', 0))
        df['v_dislikes'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_dislikes', 0))
        df['v_duration'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_duration', 0))
        df['v_cr_click_like_7_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_like_7_days', 0))
        df['v_cr_click_vtop_7_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_vtop_7_days', 0))
        df['v_cr_click_long_view_7_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_long_view_7_days', 0))
        df['v_cr_click_comment_7_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_comment_7_days', 0))
        df['v_cr_click_like_30_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_like_30_days', 0))
        df['v_cr_click_vtop_30_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_vtop_30_days', 0))
        df['v_cr_click_long_view_30_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_long_view_30_days', 0))
        df['v_cr_click_comment_30_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_comment_30_days', 0))
        df['v_cr_click_like_1_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_like_1_days', 0))
        df['v_cr_click_vtop_1_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_vtop_1_days', 0))
        df['v_cr_click_long_view_1_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_long_view_1_days', 0))
        df['v_cr_click_long_view_1_days'] = df['id'].apply(lambda x: video_features.get(x, {}).get('v_cr_click_long_view_1_days', 0))

        df['mean_rel_video_per_query'] = df['query'].apply(lambda x: query_video_rel.get(x, 0))
        df['mean_rel_query_per_video'] = df['id'].apply(lambda x: video_query_rel.get(x, 0))
        df['mean_rel_tokens_per_video'] = df['id'].apply(lambda x: video_query_rel_2.get(x, 0))

        df = df \
          .merge(query_hist, on=['id', 'query'], how='left') \
          .merge(query_channel_hist, on=['source_channel_title', 'query'], how='left')  \

        df = df.fillna(nan_replacements)
        df['query_video/channel_avg_watchtime'] = (df['query_video_avg_watchtime'] / df['query_channel_avg_watchtime']).fillna(0)
        df['dosmostr'] = (df['query_video_avg_watchtime'] / df['v_duration']).fillna(0)
        df['query_num_tokens'] = df['query'].str.split().apply(len)
        df.drop(['query_video_avg_watchtime', 'v_duration', 'query_channel_avg_watchtime'], axis=1, inplace=True)

        df['jaro_winkler'] = df.apply(
            lambda x: rapidfuzz.distance.JaroWinkler.normalized_similarity(x['query'], x['clean_video_text']), axis=1
        )
        df['damerau_levenshtein'] = df.apply(
            lambda x: rapidfuzz.distance.DamerauLevenshtein.normalized_similarity(x['query'], x['clean_video_text']), axis=1
        )
        df['date_similarity'] = df.apply(lambda x: match_dates(x['query'], x['clean_video_text']), axis=1)
        df['same_episode'] = df.apply(lambda x: match_episodes(x['query'], x['clean_video_text']), axis=1)
        df['same_season'] = df.apply(lambda x: match_season(x['query'], x['clean_video_text']), axis=1)

        lookup = df['id'].values
        df.drop(['clean_video_text', 'source_channel_title', 'query', 'id'], axis=1, inplace=True)

        df['group_id'] = [11991199] * df.shape[0]
        pred_scores = ranker.predict(df)

        return lookup[np.argsort(pred_scores)[::-1][:k]].tolist()
    except:
        return [i.get('video_id') for i in np.random.choice(backfill, 5, replace=False)]

In [26]:
result = []

for idx, i in tqdm(enumerate(candidates)):
    len_answer = min(len(i[1]), 5) if len(i[1])!=0 else 5
    res = pd.DataFrame({'query': [idx] * len_answer, 'video_id': rerank(*i)}).explode(['query', 'video_id'])
    result.append(res)

  0%|          | 0/2000 [1:00:09<?, ?it/s]
2000it [39:58,  1.20s/it]


In [29]:
result_v2 = result

In [30]:
result = pd.concat(result, axis=0)

result['cnt'] = result.groupby(['query'])['video_id'].transform('size')
add = result[result['cnt'] < 5]
result.drop('cnt', axis=1, inplace=True)

add = add.drop_duplicates(subset=['query'])
add['video_id'] = add['cnt'].apply(lambda x: [i.get('video_id') for i in np.random.choice(backfill, 5-x, replace=False)], axis=1)
add = add.explode('video_id')

result = pd.concat([result, add], axis=0).reset_index(drop=True)

In [36]:
result.drop('cnt', axis=1).to_csv(path+'/processed__v11.csv', index=False)

In [34]:
result['query'] = result['query'].apply(lambda x: df['query'].iloc[x])

In [35]:
result

Unnamed: 0,query,video_id,cnt
0,Битва сильнейших экстрасенсов 2023 смотреть | ...,video_26788569,
1,Битва сильнейших экстрасенсов 2023 смотреть | ...,video_26483474,
2,Битва сильнейших экстрасенсов 2023 смотреть | ...,video_25322369,
3,Битва сильнейших экстрасенсов 2023 смотреть | ...,video_10552729,
4,Битва сильнейших экстрасенсов 2023 смотреть | ...,video_22736040,
...,...,...,...
9995,мужкое женское выпуск от 04 06 2021,video_10998245,
9996,мужкое женское выпуск от 04 06 2021,video_24042166,
9997,мужкое женское выпуск от 04 06 2021,video_21360705,
9998,мужкое женское выпуск от 04 06 2021,video_1252042,
