In [10]:
import os 
import pickle
import msgpack
import time
import pandas as pd
import gensim

# Train model without compounds

In [2]:
class GeneratorSentences:
    """Re-iterable stream of sentences stored as msgpack files in a directory.

    Every regular file directly inside ``dir_path`` is decoded with
    ``msgpack.unpackb`` and walked as ``documents -> sentences``; each
    sentence is yielded unchanged. A new pass over the directory starts on
    every ``iter()`` call, so the object can be consumed more than once.

    Parameters
    ----------
    dir_path : str
        Directory containing the msgpack-encoded annotation files.
    """

    def __init__(self, dir_path):
        self._dir_path = dir_path

    def __iter__(self):
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sentence stream (and therefore training input) reproducible.
        for fname in sorted(os.listdir(self._dir_path)):
            file_path = os.path.join(self._dir_path, fname)

            # Skip subdirectories (e.g. .ipynb_checkpoints) instead of
            # crashing in open().
            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'rb') as f:
                annots = msgpack.unpackb(f.read(), raw=False)

            # Empty / None payloads contribute no sentences.
            if not annots:
                continue

            for doc_annots in annots:
                yield from doc_annots

In [None]:
import gensim

# Corpus directory for this run. An alternative directory that was used
# earlier is '/notebook/projects/compounds/workdir/parse1/'.
parse_path = '/notebook/projects/compounds/workdir/parse5/'

# Baseline model: plain sentences, no merged compound tokens.
model = gensim.models.Word2Vec(
    sentences=GeneratorSentences(parse_path),
    size=100,
    min_count=5,
    workers=10,
)
model.save('./model_word2vec_no_compounds')

In [41]:
# Sanity check: nearest neighbours of the verb 'мыть' ("to wash") by embedding similarity.
# NOTE(review): `model` is whatever the kernel last bound to that name; this cell's
# execution count is out of order, so confirm which model produced the saved output.
model.wv.most_similar('мыть')

[('чистить', 0.9070878624916077),
 ('помыть', 0.8470351696014404),
 ('мочить', 0.8346104621887207),
 ('протирать', 0.8309078216552734),
 ('намазывать', 0.8274841904640198),
 ('мыться', 0.8230193257331848),
 ('полоскать', 0.8215987682342529),
 ('вытирать', 0.8136080503463745),
 ('мазать', 0.8098058700561523),
 ('замачивать', 0.8080874681472778)]

# Train model with compounds

In [2]:
# Load the compound annotation table; the 'Часть 1' / 'Часть 2' columns hold the
# two parts of each candidate compound and are read when building the compound set.
# NOTE(review): the remaining columns look like per-annotator labels — confirm.
compounds_path = './workdir/annotation_katya_ref.csv'
df_compounds = pd.read_csv(compounds_path)
# Preview the first rows only.
df_compounds.head()

Unnamed: 0,Часть 1,Часть 2,Ответ Елены,Ответ Дмитрия,Катя (модератор)
0,автор,программы,2,2,2
1,атмосфера,городов,2,2,2
2,новый,год,2,2,2
3,тонна,грязи,2,2,2
4,второй,эшелон,1,2,2


In [3]:
from isanlp.processor_sentence_splitter import ProcessorSentenceSplitter
from isanlp.ru.processor_tokenizer_ru import ProcessorTokenizerRu
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp import PipelineCommon
from isanlp.wrapper_multi_process_document import WrapperMultiProcessDocument


# Text-processing pipeline used to lemmatise the annotated compounds.
# NOTE(review): each entry appears to be (processor, names of input annotations,
# mapping of processor outputs to result names) — confirm against the isanlp docs.
ppl = PipelineCommon([
    # Raw text -> 'tokens'.
    (ProcessorTokenizerRu(), ['text'], {0 : 'tokens'}),
    # 'tokens' -> 'sentences'.
    (ProcessorSentenceSplitter(), ['tokens'], {0 : 'sentences'}),
    # 'tokens' + 'sentences' -> 'lemma' (the only Mystem output kept here).
    (ProcessorMystem(), ['tokens', 'sentences'], {'lemma' : 'lemma'})
])

In [4]:
# Collect every annotated compound in lemmatised "lemma1_lemma2" form.
compound_set = set()
for first_part, second_part in zip(df_compounds['Часть 1'], df_compounds['Часть 2']):
    phrase = '{} {}'.format(first_part, second_part)
    # Lemmas of the first sentence returned by the pipeline; only the first
    # two lemmas are used to form the compound key.
    phrase_lemmas = ppl(phrase)['lemma'][0]
    compound_set.add('{}_{}'.format(phrase_lemmas[0], phrase_lemmas[1]))

# Number of distinct compounds, followed by a small sample of them.
print(len(compound_set))
list(compound_set)[:5]

996


['год_обучение',
 'электролитный_обмен',
 'сведение_полиция',
 'световой_импульс',
 'духовный_культура']

In [5]:
class GeneratorSentencesCompounds:
    """Re-iterable sentence stream that also emits compound-merged variants.

    Every regular file directly inside ``dir_path`` is decoded with
    ``msgpack.unpackb`` and walked as ``documents -> sentences``. Each
    sentence is yielded as is; then, for every adjacent token pair whose
    ``"left_right"`` join is a known compound, one extra copy of the sentence
    is yielded in which that pair is replaced by the single joined token.

    Parameters
    ----------
    dir_path : str
        Directory containing the msgpack-encoded annotation files.
    compounds : container of str, optional
        Known compounds in ``"left_right"`` form. When omitted, the
        notebook-level ``compound_set`` is looked up at iteration time,
        which is what the original implementation always did.
    """

    def __init__(self, dir_path, compounds=None):
        self._dir_path = dir_path
        self._compounds = compounds

    def __iter__(self):
        # Resolve lazily so the global fallback reflects the kernel state at
        # the moment iteration starts.
        compounds = compound_set if self._compounds is None else self._compounds

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sentence stream reproducible.
        for fname in sorted(os.listdir(self._dir_path)):
            file_path = os.path.join(self._dir_path, fname)

            # Skip subdirectories (e.g. .ipynb_checkpoints) instead of
            # crashing in open().
            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'rb') as f:
                annots = msgpack.unpackb(f.read(), raw=False)

            # Empty / None payloads contribute no sentences.
            if not annots:
                continue

            for doc_annots in annots:
                for sent in doc_annots:
                    # The untouched sentence always comes first.
                    yield sent
                    # One variant per matching adjacent pair; each variant
                    # merges exactly one pair, others stay split.
                    for i in range(len(sent) - 1):
                        compound = '{}_{}'.format(sent[i], sent[i + 1])
                        if compound in compounds:
                            yield sent[:i] + [compound] + sent[i + 2:]
                                

In [None]:
import gensim

# Corpus location and output artifact for the compound-aware Word2Vec run.
parse_path = '/notebook/projects/compounds/workdir/parse5/'
save_path = './workdir/models/model_word2vec_compounds5'

# The generator yields each sentence plus variants with compounds merged
# into single tokens.
model = gensim.models.Word2Vec(
    sentences=GeneratorSentencesCompounds(parse_path),
    size=500,
    min_count=5,
    workers=5,
)
model.save(save_path)

In [10]:
# Nearest neighbours of the merged compound token 'автор_программа'
# (lemmas of "author" + "programme") in the current `model`.
model.wv.most_similar('автор_программа')

  if np.issubdtype(vec.dtype, np.int):


[('соведущий', 0.6568776369094849),
 ('модератор', 0.6474907398223877),
 ('колумнист', 0.6105226278305054),
 ('фрилансер', 0.6065789461135864),
 ('соорганизатор', 0.6028417348861694),
 ('руководитель_компания', 0.6026309728622437),
 ('редактор', 0.6008354425430298),
 ('сегоднячко', 0.6001319289207458),
 ('сооснователь', 0.5991875529289246),
 ('медиапроект', 0.599036693572998)]

In [12]:
# Materialise the whole compound-augmented sentence stream in memory so later
# training cells can reuse it without re-reading the files.
# NOTE(review): this holds every sentence (plus variants) in RAM at once.
parse_path = '/notebook/projects/compounds/workdir/parse5'
full_data = list(GeneratorSentencesCompounds(parse_path))

In [14]:
# Word2Vec trained on the in-memory sentence list; rebinds `model`.
model = gensim.models.Word2Vec(
    sentences=full_data,
    size=300,
    min_count=2,
    workers=6,
)

In [16]:
# Persist whichever model is currently bound to `model`.
# NOTE(review): presumably the one trained on `full_data` — confirm cell order.
save_path = './workdir/models/model_word2vec_compounds6'
model.save(save_path)

In [15]:
# Same compound query as before, against the current `model`.
# NOTE(review): execution counts suggest this ran right after training on `full_data` — confirm.
model.wv.most_similar('автор_программа')

  if np.issubdtype(vec.dtype, np.int):


[('программа', 0.5615918636322021),
 ('телепрограмма', 0.5530967712402344),
 ('соведущий', 0.5281647443771362),
 ('телепередача', 0.5098356604576111),
 ('колумнист', 0.5057438611984253),
 ('телеигра', 0.4916815757751465),
 ('модератор', 0.478797972202301),
 ('телекритика', 0.47347599267959595),
 ('рубрика', 0.4733657240867615),
 ('медиапроект', 0.47168201208114624)]

In [None]:
# NOTE(review): 'compounts' in the file name looks like a typo for 'compounds';
# it is kept as is because renaming would change the artifact path.
save_path = './workdir/models/model_fasttext_compounts_1'

# FastText model trained on the same in-memory sentence list.
model_fasttext = gensim.models.FastText(
    sentences=full_data,
    min_count=2,
    window=5,
    size=300,
    workers=6,
)
model_fasttext.save(save_path)