In [1]:
!pip install ufal.udpipe
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ufal.udpipe
  Downloading ufal.udpipe-1.2.0.3.tar.gz (304 kB)
[K     |████████████████████████████████| 304 kB 13.2 MB/s 
[?25hBuilding wheels for collected packages: ufal.udpipe
  Building wheel for ufal.udpipe (setup.py) ... [?25l[?25hdone
  Created wheel for ufal.udpipe: filename=ufal.udpipe-1.2.0.3-cp37-cp37m-linux_x86_64.whl size=5626656 sha256=24dacf749fe76116b74f8428aa0c28b293e3ed77ddd3c6b4fd21d5be22483224
  Stored in directory: /root/.cache/pip/wheels/b8/b5/8e/3da091629a21ce2d10bf90759d0cb034ba10a5cf7a01e83d64
Successfully built ufal.udpipe
Installing collected packages: ufal.udpipe
Successfully installed ufal.udpipe-1.2.0.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?

In [2]:
import wget
import sys

udpipe_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
text_url = 'https://rusvectores.org/static/henry_sobolya.txt'

modelfile = wget.download(udpipe_url)
textfile = wget.download(text_url)

In [3]:
def process(pipeline, text='Строка', keep_pos=True, keep_punct=False):
    entities = {'PROPN'}
    named = False
    memory = []
    mem_case = None
    mem_number = None
    tagged_propn = []

    # обрабатываем текст, получаем результат в формате conllu:
    processed = pipeline.process(text)

    # пропускаем строки со служебной информацией:
    content = [l for l in processed.split('\n') if not l.startswith('#')]

    # извлекаем из обработанного текста леммы, тэги и морфологические характеристики
    tagged = [w.split('\t') for w in content if w]

    for t in tagged:
        if len(t) != 10:
            continue
        (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
        if not lemma or not token:
            continue
        if pos in entities:
            if '|' not in feats:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')}
            if 'Case' not in morph or 'Number' not in morph:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            if not named:
                named = True
                mem_case = morph['Case']
                mem_number = morph['Number']
            if morph['Case'] == mem_case and morph['Number'] == mem_number:
                memory.append(lemma)
                if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\s\\n' in misc:
                    named = False
                    past_lemma = '::'.join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + '_PROPN ')
            else:
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN ')
                tagged_propn.append('%s_%s' % (lemma, pos))
        else:
            if not named:
                if pos == 'NUM' and token.isdigit():  # Заменяем числа на xxxxx той же длины
                    lemma = num_replace(token)
                tagged_propn.append('%s_%s' % (lemma, pos))
            else:
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN ')
                tagged_propn.append('%s_%s' % (lemma, pos))

    if not keep_punct:
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn

In [4]:
from ufal.udpipe import Model, Pipeline
import os
import re

def tag_ud(text='Текст нужно передать функции в виде строки!', modelfile='udpipe_syntagrus.model'):
    udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
    udpipe_filename = udpipe_model_url.split('/')[-1]

    if not os.path.isfile(modelfile):
        print('UDPipe model not found. Downloading...', file=sys.stderr)
        wget.download(udpipe_model_url)

    print('\nLoading the model...', file=sys.stderr)
    model = Model.load(modelfile)
    process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

    print('Processing input...', file=sys.stderr)
    lines = text.split('\n')
    tagged = []
    for line in lines:
        # line = unify_sym(line.strip()) # здесь могла бы быть ваша функция очистки текста
        output = process(process_pipeline, text=line)
        tagged_line = ' '.join(output)
        tagged.append(tagged_line)
    return '\n'.join(tagged)

In [11]:
text = open(textfile, 'r', encoding='utf-8').read()
processed_text = tag_ud(text=text, modelfile=modelfile)
print(processed_text[:350])
with open('my_text.txt', 'w', encoding='utf-8') as out:
    out.write(processed_text)


Loading the model...
Processing input...


русский_PROPN  соболь_NOUN о.::генри_PROPN 
когда_SCONJ синий_ADJ как_SCONJ ночь_NOUN глаз_NOUN Молли_VERB Мак-Кивер_PROPN  класть_VERB малыш::Брэди_PROPN  на_ADP оба_NUM лопатка_NOUN он_PRON вынужденный_ADJ быть_AUX покидать_VERB ряд_NOUN банда_NOUN «Дымовый_ADJ труба»_NOUN таков_ADJ власть_NOUN нежный_ADJ укор_NOUN подружка_NOUN и_CCONJ она_PRON 


In [12]:
!pip install gensim
!pip install gensim --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import sys
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [19]:
f = 'my_text.txt'
data = gensim.models.word2vec.LineSentence(f)

In [20]:
#model = gensim.models.Word2Vec(data, size=500, window=10, min_count=2, sg=0)
model = gensim.models.Word2Vec(data,vector_size=100, window=5, min_count=1, workers=4)

In [21]:
model.init_sims(replace=True)

  """Entry point for launching an IPython kernel.


In [22]:
print(len(model.wv.vocab))

AttributeError: ignored

In [23]:
model.save('my.model')

In [24]:
import zipfile
model_url = 'http://vectors.nlpl.eu/repository/20/180.zip'
m = wget.download(model_url)
model_file = model_url.split('/')[-1]
with zipfile.ZipFile(model_file, 'r') as archive:
   stream = archive.open('model.bin')
   model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [25]:
words = ['день_NOUN', 'ночь_NOUN', 'человек_NOUN', 'семантика_NOUN', 'студент_NOUN', 'студент_ADJ']

In [26]:
for word in words:
   # есть ли слово в модели? Может быть, и нет
   if word in model:
       print(word)
       # выдаем 10 ближайших соседей слова:
       for i in model.most_similar(positive=[word], topn=10):
           # слово + коэффициент косинусной близости
           print(i[0], i[1])
       print('\n')
   else:
       # Увы!
       print(word + ' is not present in the model')

день_NOUN
неделя_NOUN 0.7375996112823486
день_PROPN 0.706766664981842
месяц_NOUN 0.7037326097488403
час_NOUN 0.6643950939178467
утро_NOUN 0.6526744961738586
вечер_NOUN 0.6038411259651184
сутки_NOUN 0.5923080444335938
воскресенье_NOUN 0.5842781066894531
полдень_NOUN 0.5743688344955444
суббота_NOUN 0.5345946550369263


ночь_NOUN
ночь_PROPN 0.8310787081718445
вечер_NOUN 0.7183678150177002
рассвет_NOUN 0.6965947151184082
ночи_NOUN 0.692021906375885
полночь_NOUN 0.6704976558685303
ночь_VERB 0.6615265011787415
утро_NOUN 0.6263936161994934
ночной_ADJ 0.6024709343910217
полдень_NOUN 0.5835085511207581
сумерки_NOUN 0.5671443939208984


человек_NOUN
человек_PROPN 0.7850059270858765
человеческий_ADJ 0.5915265679359436
существо_NOUN 0.5736929774284363
народ_NOUN 0.5354466438293457
личность_NOUN 0.5296981930732727
человечество_NOUN 0.5282931327819824
человкъ_PROPN 0.5047001242637634
индивидуум_NOUN 0.5000404119491577
нравственный_ADJ 0.4972919225692749
потому_ADV 0.49293625354766846


семантика_NOU

In [27]:
model.most_similar(positive=['студенческий_ADJ'], topn=10)

[('университетский_ADJ', 0.6642225980758667),
 ('студент_NOUN', 0.6486333012580872),
 ('студенчество_NOUN', 0.6344770789146423),
 ('институтский_ADJ', 0.6142880320549011),
 ('гимназический_ADJ', 0.5510081648826599),
 ('аспирантский_ADJ', 0.5403808951377869),
 ('школьный_ADJ', 0.5198260545730591),
 ('студентский_ADJ', 0.500437319278717),
 ('ифли_PROPN', 0.48894086480140686),
 ('молодежный_ADJ', 0.4792458415031433)]

In [28]:
print(model.similarity('искусство_NOUN', 'театр_NOUN'))

0.41525292


In [29]:
print(model.similarity('сосна_NOUN', 'тайга_NOUN'))

0.4646158


In [30]:
print(model.similarity('готовить_VERB', 'рисовать_VERB'))

0.22633684


In [31]:
print(model.similarity('программировать_VERB', 'учиться_VERB'))

0.12188321


In [32]:
print(model.similarity('смотреть_VERB', 'слушать_VERB'))

0.4513976


In [33]:
print(model.similarity('смотреть_VERB', 'анализировать_VERB'))

0.11082313


In [34]:
print(model.similarity('комментировать_VERB', 'анализировать_VERB'))

0.42432678


In [35]:
my_obj = [
  {
    "name": "Наука",
    "en": "science",
    "ru": "наука_NOUN",
    "id" : 1
  },
  {
    "name" : "Физика",
    "en": "physics",
    "ru": "физика_NOUN",
    "id" : 2
  },
  {
    "name" : "Механика",
    "en": "mechanics",
    "ru": "механика_NOUN",
    "id" : 3
  },
  {
    "name" : "Оптика",
    "en": "optics",
    "ru": "оптика_NOUN",
    "id" : 4
  },
  {
    "name" : "Атомная физика",
    "en": "nuclear",
    "ru": "атом_NOUN",
    "id" : 5
  },
  {
    "name" : "Химия",
    "en": "chemistry",
    "ru": "химия_NOUN",
    "id" : 6
  },
  {
    "name" : "IT",
    "en": "computer",
    "ru": "компьютер_NOUN",
    "id" : 7
  },
  {
    "name" : "Техника",
    "en": "technics",
    "ru": "техника_NOUN",
    "id" : 8
  },
  {
    "name" : "Транспорт",
    "en": "vehicles",
    "ru": "транспорт_NOUN",
    "id" : 10
  },
  {
    "name" : "Биология",
    "en": "biology",
    "ru": "биология_NOUN",
    "id" : 9
  },
  {
    "name" : "Культура",
    "en": "culture",
    "ru": "культура_NOUN",
    "id" : 11
  },
  {
    "name" : "Путешествия",
    "en": "travel",
    "ru": "путешествие_NOUN",
    "id" : 12
  },
  {
    "name" : "Красота и стиль",
    "en": "beauty",
    "ru": "красота_NOUN",
    "id" : 13
  },
  {
    "name" : "Литература",
    "en": "literature",
    "ru": "литература_NOUN",
    "id" : 14
  },
  {
    "name" : "История",
    "en": "history",
    "ru": "история_NOUN",
    "id" : 15
  },
  {
    "name" : "Спорт",
    "en": "sport",
    "ru": "спорт_NOUN",
    "id" : 16
  },
  {
    "name" : "Игровые виды спорта",
    "en": "game",
    "ru": "игра_NOUN",
    "id" : 17
  },
  {
    "name" : "Водные виды спорта",
    "en": "watter",
    "ru": "вода_NOUN",
    "id" : 18
  },
  {
    "name" : "Силовые виды спорта",
    "en": "strong",
    "ru": "сила_NOUN",
    "id" : 19
  },
  {
    "name" : "Скоростные виды спорта",
    "en": "speed",
    "ru": "скорость_NOUN",
    "id" : 20
  },
  {
    "name" : "Экстримальные виды спорта",
    "en": "extreme",
    "ru": "экстрим_NOUN",
    "id" : 21
  },
  {
    "name" : "Природа",
    "en": "nature",
    "ru": "природа_NOUN",
    "id" : 22
  },
  {
    "name" : "Растения",
    "en": "plants",
    "ru": "растение_NOUN",
    "id" : 23
  },
  {
    "name" : "Животные",
    "en": "animals",
    "ru": "животное_NOUN",
    "id" : 24
  },
  {
    "name" : "Промышленность",
    "en": "industry",
    "ru": "промышленность_NOUN",
    "id" : 25
  },
  {
    "name" : "Театр",
    "en": "theatre",
    "ru": "театр_NOUN",
    "id" : 26
  },
  {
    "name" : "Изобразительное искусство",
    "en": "drawing",
    "ru": "живопись_NOUN",
    "id" : 27
  },
  {
    "name" : "Экология",
    "en": "ecology",
    "ru": "экология_NOUN",
    "id" : 28
  },
  {
    "name" : "Кулинария",
    "en": "cooking",
    "ru": "кулинария_NOUN",
    "id" : 29
  },
  {
    "name" : "Сельское хозяйство",
    "en": "agriculture",
    "ru": "ферма_NOUN",
    "id" : 30
  }
]

In [36]:
sims = {}
for i in range(len(my_obj)):
  sims[ my_obj[i]["id"] ] = {}
  for j in range(len(my_obj)):
    #if (i!=j):
    similarity = model.similarity(my_obj[i]["ru"],my_obj[j]["ru"])
      #print(my_obj[i]["id"]," ",my_obj[j]["id"]," similarity: ",similarity)
    sims[ my_obj[i]["id"] ][ my_obj[j]["id"] ] = similarity

print(sims)

{1: {1: 1.0, 2: 0.58764815, 3: 0.38770643, 4: 0.31560177, 5: 0.28661126, 6: 0.5934481, 7: 0.12585382, 8: 0.41234085, 10: 0.060883194, 9: 0.5835238, 11: 0.51078004, 12: 0.06746661, 13: 0.16097689, 14: 0.5458134, 15: 0.45124763, 16: 0.29877037, 17: 0.1378778, 18: -0.07412772, 19: 0.3168469, 20: 0.086228736, 21: -0.010374344, 22: 0.45379546, 23: 0.27813253, 24: 0.28132042, 25: 0.36196136, 26: 0.121654354, 27: 0.36600143, 28: 0.35091722, 29: 0.24323474, 30: -0.048139486}, 2: {1: 0.58764815, 2: 1.0, 3: 0.5678631, 4: 0.5018071, 5: 0.45946503, 6: 0.67668563, 7: 0.24392152, 8: 0.31593117, 10: 0.003546161, 9: 0.61333483, 11: 0.24443063, 12: 0.10159264, 13: 0.021619715, 14: 0.33596995, 15: 0.2983287, 16: 0.2277565, 17: 0.105771676, 18: -6.667815e-05, 19: 0.14366087, 20: 0.28285974, 21: -0.016764108, 22: 0.26229766, 23: 0.18367285, 24: 0.08925478, 25: 0.19684327, 26: 0.08992757, 27: 0.2613998, 28: 0.27357346, 29: 0.18545361, 30: 0.01831445}, 3: {1: 0.38770643, 2: 0.5678631, 3: 1.0, 4: 0.5220153, 