In [1]:
import pandas as pd
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from sklearn.cluster import KMeans
import helpers
import pickle
import pathlib

Using Russia server backend.


In [2]:
# Load the wish-list export and keep only the identifier and title columns.
wish_list_df = pd.read_csv('data/db/wish_lists.csv')[['id', 'title']]
print('wish lists:', len(wish_list_df))

wish_list_df.head()

wish lists: 2405


Unnamed: 0,id,title
0,d461b35a519d8af010947904,For the soul
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞


In [3]:
# Raw titles as a plain Python list (reused later for language detection).
w_titles = wish_list_df['title'].to_list()
# One cleaned title per raw title, produced by the project helper.
wish_list_df['new_title'] = list(map(helpers.clear_text, w_titles))
wish_list_df.head()

Unnamed: 0,id,title,new_title
0,d461b35a519d8af010947904,For the soul,for the soul
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞,–¥–ª—è –¥–æ–º–∞
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π,–¥–ª—è –¥–µ—Ç–µ–π
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏,—à–º–æ—Ç–∫–∏
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞,—Ç–µ—Ö–Ω–∏–∫–∞


In [4]:
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory registered with spaCy under the name ``language_detector``.

    ``nlp`` and ``name`` are accepted because spaCy passes them to component
    factories; neither is used. Returns a new ``LanguageDetector``.
    """
    return LanguageDetector()


# English pipeline with the language detector appended as the last component.
lang_nlp = spacy.load("en_core_web_sm")
lang_nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7f7be4ff6d90>

In [5]:
# Run every raw title through the detection pipeline and keep the language code.
langs = [lang_nlp(raw_title)._.language['language'] for raw_title in w_titles]

wish_list_df['lang'] = langs
wish_list_df.head()

Unnamed: 0,id,title,new_title,lang
0,d461b35a519d8af010947904,For the soul,for the soul,en
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞,–¥–ª—è –¥–æ–º–∞,uk
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π,–¥–ª—è –¥–µ—Ç–µ–π,ru
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏,—à–º–æ—Ç–∫–∏,mk
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞,—Ç–µ—Ö–Ω–∏–∫–∞,ru


In [6]:
# Drop rows whose cleaned title is empty.
print('all wishes:', len(wish_list_df))
# .copy() detaches the filtered frame from the original, so the column
# assignments in later cells operate on an independent DataFrame and do not
# trigger pandas' SettingWithCopyWarning.
wish_list_df = wish_list_df[wish_list_df['new_title'] != ''].copy()
print('filtered:', len(wish_list_df))

all wishes: 2405
filtered: 2347


In [7]:
# Build the {cleaned title -> Russian text} mapping used by the next cell.
# Previously saved translations are reused; only titles that are not yet in the
# cache are sent to helpers.ru_text, and the cache file is rewritten only when
# something new was translated.
translations_file = 'data/result/translations.pkl'
my_file = pathlib.Path(translations_file)

if my_file.is_file():
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # cache file that this notebook produced itself.
    with open(translations_file, 'rb') as f:
        translations = pickle.load(f)
    print(f'translations are loaded from {translations_file}')
else:
    translations = {}

# Titles whose detected language is neither 'ru' nor 'UNKNOWN' ...
candidates = wish_list_df.loc[~wish_list_df['lang'].isin(['ru', 'UNKNOWN']), 'new_title']
# ... minus the ones the cache already covers.
to_translate = candidates[~candidates.isin(list(translations))].unique()
need_translation = len(to_translate) > 0

if need_translation:
    print('Titles to translate:', len(to_translate))
    print(to_translate[:3])

    for i, title in enumerate(to_translate):
        translations[title] = helpers.ru_text(title)
        if i % 50 == 0:
            print(f'{i} wishlist translated')

    with open(translations_file, 'wb') as f:
        pickle.dump(translations, f)
    print(f'translations are saved to {translations_file}')

Titles to translate: 646
['for the soul' '–¥–ª—è –¥–æ–º–∞' '—à–º–æ—Ç–∫–∏']
0 wishlist translated
50 wishlist translated
100 wishlist translated
150 wishlist translated
200 wishlist translated
250 wishlist translated
300 wishlist translated
350 wishlist translated
400 wishlist translated
450 wishlist translated
500 wishlist translated
550 wishlist translated
600 wishlist translated
translations are saved to data/result/translations.pkl


In [8]:
def get_ru_text(row):
    """Return the Russian title to use for one wish-list row.

    Rows whose ``lang`` is ``'ru'`` or ``'UNKNOWN'`` keep their ``new_title``
    as is. For any other language the ``new_title`` is looked up in the
    module-level ``translations`` mapping and the result is stripped and
    lower-cased.

    Raises:
        KeyError: if ``new_title`` has no entry in ``translations``.
    """
    if row['lang'] not in ('ru', 'UNKNOWN'):
        return translations[row['new_title']].strip().lower()
    return row['new_title']

# Row-wise: choose the Russian title for every wish list.
wish_list_df['ru_title'] = wish_list_df.apply(get_ru_text, axis=1)
wish_list_df.head()

Unnamed: 0,id,title,new_title,lang,ru_title
0,d461b35a519d8af010947904,For the soul,for the soul,en,–¥–ª—è –¥—É—à–∏
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞,–¥–ª—è –¥–æ–º–∞,uk,–¥–ª—è –¥–æ–º–∞
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π,–¥–ª—è –¥–µ—Ç–µ–π,ru,–¥–ª—è –¥–µ—Ç–µ–π
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏,—à–º–æ—Ç–∫–∏,mk,—à–º–æ—Ç–∫–∏
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞,—Ç–µ—Ö–Ω–∏–∫–∞,ru,—Ç–µ—Ö–Ω–∏–∫–∞


In [9]:
# Russian spaCy pipeline; later cells pass it to helpers.wishlist_text and use
# it to compute title vectors.
nlp = spacy.load('ru_core_news_md')

# Extra stop words specific to wish-list titles.
# NOTE(review): one literal is listed twice (harmless in a set), and two entries
# contain a space. Presumably the consumer compares whole phrases, because a
# single-token stop-word check would never match a two-word entry -- TODO
# confirm against helpers.wishlist_text.
stop_words = {'–ø–æ–¥–∞—Ä–∫–∏', '–ø–æ–¥–∞—Ä–æ–∫', '–ø–æ–¥–∞—Ä–∏—Ç—å', '–ø–æ–¥–∞—Ä–æ—á–∫–∏', '—Å–ø–∏—Å–æ–∫',
              '–≤—Å—è–∫–∏–π', '–¥—Ä—É–≥–∏–º', '–¥—Ä—É–≥–∏—Ö', '–∂–µ–ª–∞–Ω–∏–µ', '–ø–æ–∂–µ–ª–∞–Ω–∏–µ', '–Ω–∞–¥–æ',
              '–æ—á–µ–Ω—å', '—Ö–æ—á—É', '—Ö–æ—Ç–µ—Ç—å', '—Ö–æ—Ç–µ—Ç—å—Å—è', '–Ω–≥', '–¥—Ä',
              '–¥–µ–Ω—å —Ä–æ–∂–¥–µ–Ω–∏–µ', '–ª—é–±–æ–π', '–∏–¥–µ—è', '–≤–∏—à', '–ø—Ä–æ—á–∏–µ', '–ø—Ä–æ—á–∞—è',
              '–≤–∏—à–ª–∏—Å—Ç', '–≤–∏—à –ª–∏—Å—Ç', '—Ä–∞–∑–Ω–æ–µ', '—Ö–æ—Ä–æ—à–∏–π', '–∂–µ–ª–∞–Ω–∏—è', '–∏–¥–µ—è',
              '–Ω–æ–≤–æ–≥–æ–¥–Ω–∏–π', '—Ç–µ—Å—Ç', 'test', 'zz', 'zzz', 'yyy', '—ã', 'a',
              '–∫—É–ø–∏—Ç—å', '–ø–æ–∫—É–ø–∫–∞', '–ø—Ä–æ–¥–∞—Ç—å', '–ø–æ–∫—É–ø–∞—Ç—å', '–≤–∏—à–ª–∏—Å—Ç–∏–∫',
              '–ø–æ–ª—É—á–∏—Ç—å', '–≤—Å–µ–≥–¥–∞', 'tmp'}
# Merge the extra words into the pipeline's default stop-word set.
nlp.Defaults.stop_words |= stop_words

# Name list for the title normaliser: the 'text' column is sorted in its
# original casing first and lower-cased afterwards.
ru_names = pd.read_json('data/names_table.jsonl', lines=True)
names = [raw_name.lower() for raw_name in sorted(ru_names['text'])]

print('ru model is loaded')

ru model is loaded


In [10]:
def _normalized_title(row):
    """Pass one row's Russian title, the pipeline and the name list to the helper."""
    return helpers.wishlist_text(nlp, row['ru_title'], names)


wish_list_df['new'] = wish_list_df.apply(_normalized_title, axis=1)
wish_list_df.head()

Unnamed: 0,id,title,new_title,lang,ru_title,new
0,d461b35a519d8af010947904,For the soul,for the soul,en,–¥–ª—è –¥—É—à–∏,–¥—É—à–∞
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞,–¥–ª—è –¥–æ–º–∞,uk,–¥–ª—è –¥–æ–º–∞,–¥–æ–º
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π,–¥–ª—è –¥–µ—Ç–µ–π,ru,–¥–ª—è –¥–µ—Ç–µ–π,—Ä–µ–±—ë–Ω–æ–∫
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏,—à–º–æ—Ç–∫–∏,mk,—à–º–æ—Ç–∫–∏,—à–º–æ—Ç–∫–∏
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞,—Ç–µ—Ö–Ω–∏–∫–∞,ru,—Ç–µ—Ö–Ω–∏–∫–∞,—Ç–µ—Ö–Ω–∏–∫–∞


In [12]:
file_to_save = 'data/result/new_wishlists.csv'

# Keep only the columns that go into the exported result, then write them out.
wish_list_df = wish_list_df[['id', 'title', 'lang', 'new']]
wish_list_df.to_csv(file_to_save)
print(f'wishlists are saved to {file_to_save}')

wish_list_df.head()

wishlists are saved to data/result/new_wishlists.csv


Unnamed: 0,id,title,lang,new
0,d461b35a519d8af010947904,For the soul,en,–¥—É—à–∞
1,cd5fae468ca89c6478416068,–î–ª—è –¥–æ–º–∞,uk,–¥–æ–º
2,5c5fc79ce81a1a1030838078,üë∂üèª –î–ª—è –¥–µ—Ç–µ–π,ru,—Ä–µ–±—ë–Ω–æ–∫
3,945fdb57abc3e52529717750,–®–º–æ—Ç–∫–∏,mk,—à–º–æ—Ç–∫–∏
4,505fc79cdff2233891049727,üì±–¢–µ—Ö–Ω–∏–∫–∞,ru,—Ç–µ—Ö–Ω–∏–∫–∞


In [13]:
# Unique, non-empty normalised titles.
has_text = wish_list_df['new'] != ''
titles = wish_list_df.loc[has_text, 'new'].unique()
# One document vector per unique title, in the same order as `titles`.
wish_list_vecs = [nlp(text).vector for text in titles]
print('vectors are ready')

vectors are ready


In [14]:
cluster_count = 100

# Fixed random_state so the KMeans initialisation is the same on every run.
kmeans = KMeans(n_clusters=cluster_count, random_state=0)
kmeans.fit(wish_list_vecs)

# Pair every unique title with the cluster label it was assigned.
cluster_wishlists = pd.DataFrame({'title': titles, 'label': kmeans.labels_})

cluster_wishlists.head()

Unnamed: 0,title,label
0,–¥—É—à–∞,1
1,–¥–æ–º,1
2,—Ä–µ–±—ë–Ω–æ–∫,42
3,—à–º–æ—Ç–∫–∏,4
4,—Ç–µ—Ö–Ω–∏–∫–∞,87


In [15]:
# Dump every cluster: a header with the member count, then one title per line.
for label in range(cluster_count):
    in_cluster = cluster_wishlists['label'] == label
    members = cluster_wishlists.loc[in_cluster, 'title'].to_list()
    print(f'Label {label}: {len(members)} wish lists')
    for member in members:
        print(member)
    print('\n\n')

Label 0: 2 wish lists
—ç–∫–æ
–∏–∫



Label 1: 40 wish lists
–¥—É—à–∞
–¥–æ–º
–Ω–æ–≤—ã–π
—Ö–æ–¥–∏—Ç—å
–¥–µ–ª–∞—Ç—å
–¥—Ä—É–≥–æ–µ
–±–æ–ª—å—à–æ–π
—Ü–µ–ª—å
–ª—é–±–∏—Ç—å –ª—é–±–∏—Ç—å
–¥–µ–ª–æ
–º–æ–º–µ–Ω—Ç
–º–µ—Å—Ç–æ
–º—É–∂
–ø–µ—Ä–≤—ã–π
–∫–æ–º–Ω–∞—Ç–∞
—Å–µ–º—å—è
–¥—Ä—É–≥
–Ω–æ–≤—ã–π –¥–æ–º
–ø–æ—á—Ç–∏ —Ç—Ä–∏–¥—Ü–∞—Ç—å
–¥–æ–º–æ–π
—Å–ª—É—á–∞–π
—Å–µ–º—å—è –¥–æ–º
–º–µ—á—Ç–∞—Ç—å
–¥–≤–∏–≥–∞—Ç—å—Å—è
—Å–∫–æ—Ä–µ–µ —Ü–µ–ª—å
–∞
–∑–Ω–∞—Ç—å
—á–∞—Å
–±—É–¥—É—â–µ–µ
–∏–º–µ—Ç—å
–ø—Ä–æ
–ø—Ä–æ—Å—Ç–æ
–º–µ—Å—Ç–æ –≥–¥–µ –ø–æ–±—ã–≤–∞—Ç—å
–º–æ–∂–Ω–æ
–ø—Ä–æ—á–µ–µ
–≤–∏–¥–µ—Ç—å
—Å–µ–º—å—è –¥—Ä—É–≥
–ø–µ—Ä–≤—ã–π –¥–µ–ª–æ
–ª–∏—Ü–æ
–Ω–æ–≤—ã–π –∂–∏–∑–Ω—å



Label 2: 204 wish lists
—Ö–∏—Ä—É—Ä–≥–∏—á–∫–∞
3000 ‚ÇΩ
–¥–µ—Ä–µ–≤–æ–æ–±—Ä–∞–±–∞—Ç—ã–≤–∞–Ω–∏–µ
< 1500
—Ç–µ—Å—Ç–æ–≤—ã–π –∑–∞–¥–∞–Ω–∏–µ
–Ω–∞—Å—Ç–æ–ª–∫–∏
–Ω–µ—Ö—É–¥–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–∞
–ø–∞–ª–∏—Ç—Ç–∞
–ø–ª–∞–∫–∞—Ç—å —Ç–µ—Ö–Ω–æ
–º–µ–¥—Å–µ—Å—Ç—Ä–∞
—Ö–æ—Ç–µ–ª–∫–∏
—É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ
pandora
—á–∞—à–∫–∞
–∂–∏–≤–æ—Ç–Ω–æ–µ
g.a.s
–∞–Ω–¥—Ä–µ—à–∏
$ $ $
—Ö–æ—á—É—à–∫–∏
