In [1]:
import numpy as np
import pandas as pd
import tqdm
import re
from os.path import join as pathjoin

In [2]:
ru_topics = {
    'music': ['музыка', 'рок-группа', 'альбом', 'песня', 'звук', 'рок', 'гитара', 'джаз', 'рэп', 'музыкант'],
    'education': ['студент', 'обучение', 'курсы', 'изучение', 'навыки', 'образование', 'учитель', 'школа', 'университет'],
    'politics': ['выборы', 'голосование', 'ассамблея', 'кандидат', 'демократия', 
                 'консул', 'министр', 'парламент', 'политик', 'законодательный', 'республиканский', 'собрание'],
    'sport': ['футбол', 'тренер', 'баскетбол', 'турнир', 'расписание', 'игры', 'лига', 'дивизион', 'команда', 'полузащитник'],
    'business': ['технология', 'платформа', 'компания', 'производство', 'продукт', 'дизайн',
                 'обновление', 'автоматизация', 'пользователь', 'эксплуатация'],
    'literature': ['книга', 'литература', 'художественная', 'толстой', 'шекспир', 'достоевский', 'роман', 'фанфин',
                   'приключение'],
    'crime': ['полиция', 'суд', 'преступник', 'инцидент', 'штраф', 'преступление', 'тюрьма', 'расследование', 
              'расследовать', 'жертва'],
    'travel': ['турист', 'туризм', 'путешествие', 'море', 'каникулы', 'пляж', 'отель', 'хостел', 'отпуск'],
    'games': ['игры', 'xbox', 'игрок', 'steam', 'карты', 'игроки', 'урон', 'switch', 'дракон', 'персонаж', 
              'геймплей', 'консоль'],
    'arhitecture': ['здание', 'построить', 'церковь', 'строение', 'мост', 
                    'крыша', 'улица', 'стиль', 'башня', 'спроектирован', 'камень',
                    'архитектура']
}

In [3]:
big_df_ru = pd.read_csv('/home/mlepekhin/data/ru_train')
big_df_ru.head()

Unnamed: 0.1,Unnamed: 0,target,text
0,1532,A8,ОАО « Нижнекамскнефтехим » ( НКНХ ) не отказыв...
1,389,A11,... в ходе написания ходатайства : сделать его...
2,207,A14,3.2 . Т опливо и его характеристики . 3.3 . М ...
3,1574,A8,Президент России Дмитрий Медведев в субботу на...
4,196,A16,Что заставляло человечество меняться к лучшему...


In [4]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)


segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [17]:
def get_texts_for_topic_ru(topic_words, df, sent_for_topic):
    result = []
    
    for text in df.text.values[:10000]:
        prefix = ' '.join(text.split()[:100]).lower()
        doc = Doc(prefix)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            if token.lemma in topic_words:
                result.append(text)
                break
        if len(result) == sent_for_topic:
            return result
    return result


def make_topic_sentences_df(topic_dict, df, sent_for_topic=100, get_texts_for_topic=None):
    result_df = pd.DataFrame()
    
    for target in np.unique(df.target.values):
        genre_df = df[df.target == target]
        for topic, topic_words in tqdm.tqdm(topic_dict.items()):
            for sentence in get_texts_for_topic(set(topic_words), genre_df, sent_for_topic):
                result_df = result_df.append(
                    {'target': target, 'topic': topic, 'text': sentence},
                    ignore_index=True
                )
    return result_df

In [18]:
sent_df_ru = make_topic_sentences_df(ru_topics, big_df_ru, get_texts_for_topic=get_texts_for_topic_ru)

100%|██████████| 10/10 [00:14<00:00,  1.42s/it]
100%|██████████| 10/10 [00:09<00:00,  1.09it/s]
100%|██████████| 10/10 [00:14<00:00,  1.40s/it]
100%|██████████| 10/10 [00:11<00:00,  1.10s/it]
100%|██████████| 10/10 [00:05<00:00,  1.96it/s]
100%|██████████| 10/10 [00:06<00:00,  1.54it/s]
100%|██████████| 10/10 [00:04<00:00,  2.32it/s]
100%|██████████| 10/10 [00:04<00:00,  2.49it/s]
100%|██████████| 10/10 [00:38<00:00,  3.82s/it]
100%|██████████| 10/10 [00:04<00:00,  2.04it/s]


In [19]:
print(sent_df_ru.shape)
#print(sent_df_ru.sentence.values[:5])
sent_df_ru.head()

(864, 3)


Unnamed: 0,target,text,topic
0,A1,"Mar 1 , 2014 at 11:34 pm Отчизна катится к фаш...",music
1,A1,"На 50 % тише Работая над тем , чтобы сделать с...",music
2,A1,"ярмарка в марте несмотря на то , что в предыду...",music
3,A1,"<p> Говорят , что правое полушарие отвечает за...",music
4,A1,« Окрашивание » африканских технологий * * * *...,education


In [20]:
!mkdir '/home/mlepekhin/data/smart_genre_accurate_labels'
!mkdir -p '/home/mlepekhin/data/smart_genre_accurate_labels/ru'
!mkdir -p '/home/mlepekhin/data/smart_genre_accurate_labels/en'

mkdir: cannot create directory ‘/home/mlepekhin/data/smart_genre_accurate_labels’: File exists


In [21]:
def split_by_genres(df, result_dir):
    result_dict = {}
    
    for text, target in zip(df['text'].values, df['target'].values):
        if target not in result_dict:
            result_dict[target] = []
        result_dict[target].append(text)
        
    for key, value in result_dict.items():
        with open(pathjoin(result_dir, f'{key}.txt'), 'w') as fout:
            for text in value:
                fout.write(f'{text}\n')

In [22]:
#split_by_genres(sent_df_ru, '/home/mlepekhin/data/smart_genre/ru')

In [23]:
!ls '/home/mlepekhin/data/smart_genre_accurate_labels/ru'
!wc -l '/home/mlepekhin/data/smart_genre/ru/A1.txt'

all.csv
2000 /home/mlepekhin/data/smart_genre/ru/A1.txt


In [24]:
sent_df_ru.to_csv('/home/mlepekhin/data/smart_genre_accurate_labels/ru/all.csv')

## English

In [25]:
big_df_en = pd.read_csv('/home/mlepekhin/data/en_train')
big_df_en.head()

Unnamed: 0.1,Unnamed: 0,target,text
0,1605,A8,"( INDIANAPOLIS – APRIL 16 , 2010 ) – Ash conti..."
1,296,A1,""" Apache vs Yaws · Spoof signs "" A succinct in..."
2,664,A12,ENQUIRY AND ADVICE : Usually arrive via teleph...
3,560,A22,Pay Someone To Write College Essay ADI special...
4,1168,A1,Moral Difference Between Hitting a Computer an...


In [26]:
en_topics = {
    'music': ['music', 'band', 'album', 'songs', 'sound', 'love', 'rock', 'playing', 'guitar', 'jazz'],
    'education': ['student', 'learning', 'courses', 'teaching', 'skills', 'education', 'study', 'college'],
    'politics': ['election', 'votes', 'assembly', 'candidate', 'democratic', 
                 'council', 'minister', 'parliament', 'politician', 'legislative', 'seats', 'vote'],
    'sport': ['football', 'coach', 'basketball', 'tournament', 'schedule', 'games', 'league', 'division', 'team', 'teams'],
    'business': ['technology', 'platform', 'companies', 'industry', 'product', 'design',
                 'upgrade', 'automation', 'users', 'ideas'],
    'literature': ['books', 'literature', 'fiction', 'tolstoy', 'shakespeare', 'dostoevsky', 'romanism', 'fanfics', 'adventure'],
    'crime': ['police', 'court', 'officer', 'incident', 'charges', 'crime', 'prison', 'investigation', 'accused', 'victim'],
    'travel': ['tourist', 'tourism', 'travel', 'seaside', 'vacation', 'beach', 'hotel', 'hostel'],
    'games': ['games', 'xbox', 'players', 'steam', 'cards', 'player', 'damage', 'switch', 'dragon', 'character', 
              'reload', 'console'],
    'arhitecture': ['building', 'historic', 'church', 'buildings', 'brick', 
                    'roof', 'street', 'style', 'tower', 'designed', 'stone', 'architecture']
}

In [27]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 

rocks : rock
corpora : corpus


In [28]:
def get_texts_for_topic_en(topic_words, df, sent_for_topic):
    result = []
    
    for text in df.text.values:
        prefix_tokens = text.lower().split()[:100]
        for token in prefix_tokens:
            if lemmatizer.lemmatize(token) in topic_words:
                result.append(text)
                break
        if len(result) == sent_for_topic:
            return result
    return result

In [29]:
sent_df_en = make_topic_sentences_df(en_topics, big_df_en, get_texts_for_topic=get_texts_for_topic_en)

100%|██████████| 10/10 [00:01<00:00,  8.35it/s]
100%|██████████| 10/10 [00:00<00:00, 32.33it/s]
100%|██████████| 10/10 [00:00<00:00, 11.90it/s]
100%|██████████| 10/10 [00:00<00:00, 23.66it/s]
100%|██████████| 10/10 [00:00<00:00, 17.08it/s]
100%|██████████| 10/10 [00:00<00:00, 50.00it/s]
100%|██████████| 10/10 [00:00<00:00, 28.94it/s]
100%|██████████| 10/10 [00:00<00:00, 34.59it/s]
100%|██████████| 10/10 [00:00<00:00, 19.90it/s]
100%|██████████| 10/10 [00:00<00:00, 21.77it/s]
100%|██████████| 10/10 [00:00<00:00, 40.05it/s]


In [30]:
print(sent_df_en.shape)
sent_df_en.head()

(717, 3)


Unnamed: 0,target,text,topic
0,A1,Thank you so much . It 's really scary to be h...,music
1,A1,The End of the Russia-China Debate Ten years a...,music
2,A1,School Choice – An Educational Custom Fit Imag...,music
3,A1,Tackling human organ donation dilemma By Andy ...,music
4,A1,Such changes are especially likely because the...,music


In [31]:
sent_df_ru.to_csv('/home/mlepekhin/data/smart_genre_accurate_labels/en/all.csv')

In [32]:
#split_by_genres(sent_df_en, '/home/mlepekhin/data/smart_genre/en')

In [33]:
!ls '/home/mlepekhin/data/smart_genre/en'
!wc -l '/home/mlepekhin/data/smart_genre/en/A1.txt'

A11.txt  A14.txt  A17.txt  A22.txt  A7.txt  A9.txt
A12.txt  A16.txt  A1.txt   A4.txt   A8.txt  all.csv
1994 /home/mlepekhin/data/smart_genre/en/A1.txt


In [34]:
!ls '/home/mlepekhin/data/smart_genre/ru/all.csv'

/home/mlepekhin/data/smart_genre/ru/all.csv
