In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import sys

sys.path.insert(0, '/content/drive/MyDrive/hack_2023_11')

In [2]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from nltk import ngrams
import os
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

from spelling_checker.sym_spell_servicer import SymSpellRouterServicer

spell_checker = SymSpellRouterServicer()

In [3]:
WORD_PATTERN = re.compile(r"[a-zа-яйё!&'._|\-+:]+")
WORD_STRIP = "&'_|-:"
VALID_CHARS = "!&'._|-+:"

# Load data

In [4]:
video_meta = pd.read_parquet("/content/drive/MyDrive/hack_2023_11/data/videos.parquet")

queries_1 = pd.read_parquet("/content/drive/MyDrive/hack_2023_11/data/automarkup.parquet")
queries_2 = pd.read_csv("/content/drive/MyDrive/hack_2023_11/data/manualmarkup.csv")

In [None]:
video_meta.sample(2)

Unnamed: 0,video_id,video_title,channel_title,v_channel_reg_datetime,v_channel_type,v_category,v_pub_datetime
9901479,video_12509079,"Леонид Андреев, Виктор Таланов. Современная со...",Современная Cоционика * Socionics Archetype Ce...,2014-01-12 13:18:01+04:00,UGC,Лайфстайл,2022-08-12 12:34:21+03:00
21638683,video_2048243,Видео родов кесарево,Искусство чернил и жизни,2023-11-02 16:24:28+03:00,UGC,Разное,2023-11-03 19:06:10+03:00


In [None]:
queries_1.sample(2)

Unnamed: 0,is_authorized,datetime,query,video_id,duration,position,watchtime,emotion,vtop,comment,channel,tv_show,season
3154816,False,2023-08-02 17:11:49+03:00,Выжить в дуба,video_97134,5512000,1.0,5663,0,,0,Телеканал ТНТ,Выжить в Дубае,
1612238,False,2023-08-06 20:45:03+03:00,харламов батрутдинов,video_7109004,609803,24.0,592,0,,0,Comedy Club,Камеди Клаб,


In [None]:
queries_2.sample(2)

Unnamed: 0,query,video_id,query_date,sentiment
11080,пальма продакшн,video_10153963,2023-11-03 18:00:00,uselessly
606,надежда санько,video_8245346,2023-11-03 18:00:00,сlickbait


# Aggregate video attributes

In [5]:
video_meta = pd.DataFrame(video_meta[["video_title", "channel_title"]].unstack().rename("text")).reset_index(drop=True)

video_meta = video_meta[video_meta["text"].progress_apply(lambda x: WORD_PATTERN.search(x) is not None)] \
    .reset_index(drop=True)

100%|██████████| 68809122/68809122 [01:50<00:00, 622505.65it/s]


In [None]:
video_meta["clean_text"] = video_meta["text"] \
    .progress_apply(
    lambda x: spell_checker.predict_single_correction(
        x, use_preprocessing=True, use_keyboard_inverter=False, use_correction=False
    ))

In [None]:
unigrams = defaultdict(int)
for idx, sublist in tqdm(enumerate(video_meta["clean_text"])):
    for word in sublist.split():
        if WORD_PATTERN.search(word) is not None:
            if word in VALID_CHARS:
                pass
            else:
                word = word.strip(WORD_STRIP)
            unigrams[word] = unigrams.get(word, 0) + 1

32115348it [03:20, 159815.53it/s]


In [8]:
bigrams = defaultdict(int)
for sublist in tqdm(video_meta["clean_text"]):
    tokens = []
    for word in sublist.split():
        if WORD_PATTERN.search(word) is not None:
            if word in VALID_CHARS:
                tokens.append(word)
            else:
                tokens.append(word.strip(WORD_STRIP))
    for bigram in ngrams(tokens, 2):
        b = " ".join(bigram)
        bigrams[b] = bigrams.get(b, 0) + 1

100%|██████████| 32115348/32115348 [08:36<00:00, 62135.00it/s]


In [9]:
min_unigram = int(spell_checker.config["SYMSPELL_PARAMS"]["MIN_UNIGRAM_COUNT"])
min_bigram = int(spell_checker.config["SYMSPELL_PARAMS"]["MIN_BIGRAM_COUNT"])

In [10]:
unigrams = {k: v if v >= min_unigram else min_unigram for k, v in unigrams.items()}
bigrams = {k: v if v >= min_bigram else min_bigram for k, v in bigrams.items()}

In [None]:
video_meta.to_parquet("data/clean_videos.parquet", index=False)

# Aggregate queries

In [None]:
queries = pd.DataFrame(pd.concat([queries_1["query"], queries_2["query"]]).rename("text"))

del queries_1, queries_2

In [None]:
queries = queries[queries["text"].progress_apply(lambda x: WORD_PATTERN.search(x) is not None)] \
    .reset_index(drop=True)

100%|██████████| 6251664/6251664 [00:06<00:00, 975303.77it/s] 


In [None]:
queries["clean_text"] = queries["text"] \
    .progress_apply(
    lambda x: spell_checker.predict_single_correction(
        x, use_preprocessing=True, use_keyboard_inverter=False, use_correction=False
    ))

100%|██████████| 6104110/6104110 [02:05<00:00, 48790.20it/s]


In [None]:
q_unigrams = defaultdict(int)
for idx, sublist in tqdm(enumerate(queries["clean_text"])):
    for word in sublist.split():
        if WORD_PATTERN.search(word) is not None:
            if word in VALID_CHARS:
                pass
            else:
                word = word.strip(WORD_STRIP)
            q_unigrams[word] = q_unigrams.get(word, 0) + 1

6104110it [00:17, 341650.52it/s]


In [None]:
q_bigrams = defaultdict(int)
for sublist in tqdm(queries["clean_text"]):
    tokens = []
    for word in sublist.split():
        if WORD_PATTERN.search(word) is not None:
            if word in VALID_CHARS:
                tokens.append(word)
            else:
                tokens.append(word.strip(WORD_STRIP))
    for bigram in ngrams(tokens, 2):
        b = " ".join(bigram)
        q_bigrams[b] = q_bigrams.get(b, 0) + 1

100%|██████████| 6104110/6104110 [00:26<00:00, 227899.01it/s]


In [None]:
np.quantile(list(q_unigrams.values()), .75), np.quantile(list(q_unigrams.values()), .95), np.quantile(list(q_unigrams.values()), .99)

(3.0, 31.0, 321.0)

In [None]:
np.quantile(list(q_bigrams.values()), .75), np.quantile(list(q_bigrams.values()), .95), np.quantile(list(q_bigrams.values()), .99)

(2.0, 9.0, 62.0)

In [None]:
print([k for k, v in q_unigrams.items() if v > 321])

['роман', 'битва', 'экстрасенсов', '2019', 'год', 'сильнейших', '2', 'сезон', 'bitva', 'silneyshix', 'sezon', 'экстросенсы', '10', 'выпуск', 'экстрасенсы', 'серия', '02', 'сильнейший', 'х', '2018', 'новая', 'сезо', 'александр', 'сез', '2сезон', '1', '2020', '20', 'за', 'с', 'сильнеших', '19', '12', '22', '6', 'старые', 'выпуски', '4', 'сильнейшие', '8', '18', 'битвы', 'экстрассенсов', '07', '3', '7', '2023', 'смотреть', '35', 'виктория', 'райдос', 'ведут', 'расследование', 'анонс', 'экстрасены', 'вып', '17', 'и', '23', '13', 'декабря', '2022', 'си', 'топ', 'бесплатно', 'самых', '11', '9', 'экстрасенсев', 'все', 'серии', 'подряд', 'comedy', 'woman', 'камеди', 'вумен', 'выжить', 'в', 'дубае', 'дубаи', 'часть', 'от', 'тнт', 'шоу', 'знаю', 'дубай', 'июля', 'черный', 'двор', 'звезды', 'вдубае', 'большое', 'версия', 'как', 'на', 'воля', 'каникулы', '1сезон', 'я', 'тебе', 'не', 'верю', 'выжившие', '2серия', '14', '01', 'однажды', 'пять', 'минут', 'тишины', '.', '2выпуск', 'дубаях', 'выжать', 

In [None]:
q_unigrams = {k: v if v >= min_unigram else 321 for k, v in q_unigrams.items()}
q_bigrams = {k: v if v >= min_bigram else 62 for k, v in q_bigrams.items()}

# Merge and collect final dicts

In [None]:
for k, v in q_unigrams.items():
    unigrams[k] = unigrams.get(k, 0) + v
del q_unigrams

for k, v in q_bigrams.items():
    bigrams[k] = bigrams.get(k, 0) + v
del q_bigrams

In [None]:
len(unigrams), len(bigrams)

(5420366, 52496323)

In [None]:
with open("spelling_checker/mount_files/symspell/ru.txt", "r", encoding="utf-8") as f:
    rus = f.readlines()

with open("spelling_checker/mount_files/symspell/en.txt", "r", encoding="utf-8") as f:
    eng = f.readlines()

In [None]:
rus = dict([i.strip().split() for i in rus])
eng = dict([i.strip().split() for i in eng])

In [None]:
vocab = defaultdict(int)

In [None]:
for k, v in unigrams.items():
    vocab[k] = int(rus.get(k, 0)) + int(eng.get(k, 0)) + v

In [None]:
filename = os.path.join("/content/drive/MyDrive/hack_2023_11/spelling_checker/mount_files/symspell", "unigrams.txt")

with open(filename, "w", encoding="utf-8") as f:
    for k, v in sorted(vocab.items(), key=lambda item: item[1], reverse=True):
        f.write(f"{k}${v}\n")

In [14]:
filename = os.path.join("/content/drive/MyDrive/hack_2023_11/spelling_checker/mount_files/symspell", "bigrams.txt")

with open(filename, "w", encoding="utf-8") as f:
    for k, v in sorted(bigrams.items(), key=lambda item: item[1], reverse=True):
        f.write(f"{k}${v}\n")