## Preparing UDPipe

In [0]:
!wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2998/russian-syntagrus-ud-2.4-190531.udpipe?sequence=74&amp;isAllowed=y
!mv russian-syntagrus-ud-2.4-190531.udpipe?sequence=74 russian-syntagrus-ud-2.4-190531.udpipe

/bin/bash: amp: command not found
--2019-12-24 23:02:05--  https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2998/russian-syntagrus-ud-2.4-190531.udpipe?sequence=74
Resolving lindat.mff.cuni.cz (lindat.mff.cuni.cz)... 195.113.20.140
Connecting to lindat.mff.cuni.cz (lindat.mff.cuni.cz)|195.113.20.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 45859472 (44M) [application/octet-stream]
Saving to: ‘russian-syntagrus-ud-2.4-190531.udpipe?sequence=74’


2019-12-24 23:02:14 (6.72 MB/s) - ‘russian-syntagrus-ud-2.4-190531.udpipe?sequence=74’ saved [45859472/45859472]



In [0]:
!pip install ufal.udpipe

Collecting ufal.udpipe
[?25l  Downloading https://files.pythonhosted.org/packages/e5/72/2b8b9dc7c80017c790bb3308bbad34b57accfed2ac2f1f4ab252ff4e9cb2/ufal.udpipe-1.2.0.3.tar.gz (304kB)
[K     |█                               | 10kB 16.0MB/s eta 0:00:01[K     |██▏                             | 20kB 1.8MB/s eta 0:00:01[K     |███▎                            | 30kB 2.4MB/s eta 0:00:01[K     |████▎                           | 40kB 1.7MB/s eta 0:00:01[K     |█████▍                          | 51kB 1.9MB/s eta 0:00:01[K     |██████▌                         | 61kB 2.3MB/s eta 0:00:01[K     |███████▌                        | 71kB 2.5MB/s eta 0:00:01[K     |████████▋                       | 81kB 2.7MB/s eta 0:00:01[K     |█████████▊                      | 92kB 3.0MB/s eta 0:00:01[K     |██████████▊                     | 102kB 2.8MB/s eta 0:00:01[K     |███████████▉                    | 112kB 2.8MB/s eta 0:00:01[K     |█████████████                   | 122kB 2.8MB/s eta 0:

In [0]:
import ufal.udpipe

class UDPipeModel:
    """Thin wrapper around a loaded ``ufal.udpipe.Model``.

    Provides tokenizing/reading of text into sentences, in-place tagging and
    parsing of sentences, and serialization of sentences to a text format.
    """

    def __init__(self, path):
        """Load the model stored at ``path``.

        Raises:
            Exception: if the model file cannot be loaded.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model has no tokenizer, or reading fails.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the input format cannot be created, or reading fails.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        Raises:
            Exception: with the reader's message if a processing error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # A fresh Sentence object is needed per iteration because the reader
        # fills the object it is given and the filled object is kept in the list.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The serialized sentences followed by the format's document footer.

        Raises:
            Exception: if the output format cannot be created.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        # Mirror the check done in read(): fail with a clear message instead of
        # an AttributeError on a missing format object.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        pieces = [output_format.writeSentence(sentence) for sentence in sentences]
        pieces.append(output_format.finishDocument())
        return ''.join(pieces)

In [0]:
class Word:
  """One token row of a CoNLL-U parse, exposed as attributes.

  Attributes set by the constructor:
    position       integer token ID (column 1)
    token          surface form (column 2)
    lower          bound ``str.lower`` method of ``token``
    lemma          lemma (column 3)
    POS / pos      universal POS tag (column 4)
    parse          dict of morphological features (column 6) or None
    head           integer head ID (column 7), None for the root ("0")
    dep            dependency relation (column 8)
    space_after    False when MISC contains SpaceAfter=No, else True
    spaces_before  unescaped SpacesBefore value from MISC, else ""
  """

  # Escapes UDPipe uses inside SpacesBefore/SpacesAfter values.
  _SPACE_ESCAPES = {"s": " ", "t": "\t", "n": "\n", "r": "\r", "p": "|", "\\": "\\"}

  def __init__(self, conllu_str):
    """Parse a single tab-separated CoNLL-U token line.

    Raises:
      ValueError: if the line is empty, does not start with a digit, has fewer
        than 10 columns, or has a non-integer ID/HEAD column.
    """
    if not conllu_str:
      raise ValueError("A valid word parse in conllu format should be passed.")
    if conllu_str[0] not in "0123456789":
      raise ValueError("A valid word parse in conllu format should be passed.")
    columns = conllu_str.split("\t")
    if len(columns) < 10:
      raise ValueError("A valid word parse in conllu format should be passed.")

    self.position = int(columns[0])
    self.token = columns[1]
    # NOTE(review): this stores the bound method, not the lowercased string;
    # kept unchanged so existing callers of `word.lower()` keep working.
    self.lower = self.token.lower
    self.lemma = columns[2]
    self.POS = columns[3]
    self.pos = self.POS

    # FEATS is a "|"-separated list of Name=Value pairs ("_" when empty).
    self.parse = None
    if "=" in columns[5]:
      self.parse = self._key_values(columns[5])

    if columns[6] == "0":
      self.head = None
    else:
      self.head = int(columns[6])
    self.dep = columns[7]

    # MISC uses the same "|"-separated Name=Value layout as FEATS.
    self.space_after = True
    self.spaces_before = ""
    if columns[9] != "_":
      misc = self._key_values(columns[9])
      if misc.get("SpaceAfter") == "No":
        self.space_after = False
      if "SpacesBefore" in misc:
        self.spaces_before = self._unescape_spaces(misc["SpacesBefore"])

  @staticmethod
  def _key_values(field):
    """Split a "|"-separated list of Name=Value items into a dict."""
    pairs = {}
    for item in field.split("|"):
      key, sep, value = item.partition("=")
      if sep:  # ignore items that are not Name=Value
        pairs[key] = value
    return pairs

  @classmethod
  def _unescape_spaces(cls, value):
    """Replace the backslash escapes listed in _SPACE_ESCAPES by their characters."""
    chars = []
    i = 0
    while i < len(value):
      if value[i] == "\\" and i + 1 < len(value) and value[i + 1] in cls._SPACE_ESCAPES:
        chars.append(cls._SPACE_ESCAPES[value[i + 1]])
        i += 2
      else:
        chars.append(value[i])
        i += 1
    return "".join(chars)

  def __repr__(self):
    return self.token

In [0]:
from collections import OrderedDict

class Model:
  """Runs a UDPipe wrapper over raw text and exposes the parse as Word objects.

  ``udpipemodel`` must provide ``tokenize(text)``, ``tag(sentence)``,
  ``parse(sentence)`` and ``write(sentences, out_format)``.
  """

  def __init__(self, udpipemodel):
    self.model = udpipemodel

  @staticmethod
  def _is_word_line(line):
    """True for a tab-separated row whose ID column is a plain integer.

    Comment lines and rows with range or decimal IDs (e.g. "1-2", "1.1") are
    rejected, since Word can only parse integer IDs.
    """
    word_id, sep, _ = line.partition("\t")
    return bool(sep) and bool(word_id) and all(ch in "0123456789" for ch in word_id)

  def make_conllu(self, text):
    """Tokenize, tag and parse ``text``; return the result as a CoNLL-U string."""
    sentences = self.model.tokenize(text)
    for sentence in sentences:
      self.model.tag(sentence)
      self.model.parse(sentence)
    return self.model.write(sentences, "conllu")

  def parse_words(self, text):
    """Return a flat list of Word objects for every token row of the parse."""
    conllu = self.make_conllu(text)
    return [Word(line) for line in conllu.split("\n") if self._is_word_line(line)]

  def parse_text(self, text):
    """Return an OrderedDict: sentence id -> OrderedDict(word position - 1 -> Word).

    Sentences are found by splitting the CoNLL-U output on its
    "# sent_id = " comment lines.
    """
    conllu = self.make_conllu(text)
    sent_dict = OrderedDict()
    for chunk in conllu.split("# sent_id = "):
      # Skip empty chunks (e.g. empty output) and the preamble before the
      # first sentence, which does not start with a sentence number.
      if not chunk or chunk[0] not in "0123456789":
        continue
      lines = [line for line in chunk.split("\n") if line]
      sent_num = int(lines[0])
      words = OrderedDict()
      # lines[0] is the sentence id itself; the token rows follow it.
      for line in lines[1:]:
        if self._is_word_line(line):
          word = Word(line)
          words[word.position - 1] = word
      sent_dict[sent_num] = words
    return sent_dict

In [0]:
# Load the downloaded UDPipe model file and wrap it in the CoNLL-U parsing helper.
ud_model = Model(UDPipeModel('russian-syntagrus-ud-2.4-190531.udpipe'))

## Preparing spelling

In [0]:
# Install swig via apt, then install jamspell with easy_install.
!apt-get install swig
!easy_install-3.6 jamspell
# Copy the installed egg's metadata and contents to egg-info / package paths.
# NOTE(review): presumably needed so that `from jamspell import jamspell`
# resolves on this runtime; the paths are tied to Python 3.6 and
# jamspell 0.0.11 and must be updated if either changes.
!cp -r /usr/local/lib/python3.6/dist-packages/jamspell-0.0.11-py3.6-linux-x86_64.egg/EGG-INFO /usr/local/lib/python3.6/dist-packages/jamspell-0.0.11-py3.6-linux-x86_64.egg-info
!cp -r /usr/local/lib/python3.6/dist-packages/jamspell-0.0.11-py3.6-linux-x86_64.egg /usr/local/lib/python3.6/dist-packages/jamspell
# Download the Russian JamSpell model archive.
!wget https://github.com/bakwc/JamSpell-models/raw/master/ru.tar.gz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 3s (419 kB/s)
Selecting previously unselected package swig3.0.
(Reading database ... 135004 files and directories currently installed.)
Preparing to unpack .../swig3.0_3.0.12-1_amd64.deb ...
Unpacki

In [0]:
import tarfile

# Unpack the JamSpell model archive into the working directory. The context
# manager closes the archive handle once extraction is done.
with tarfile.open("ru.tar.gz") as tar:
    tar.extractall()

In [0]:
from jamspell import jamspell

class Jamspell:
  """Wrapper around a ``jamspell.TSpellCorrector`` with a loaded language model."""

  def __init__(self, path):
    """Create a corrector and load the language model stored at ``path``.

    Raises:
      Exception: if the corrector reports that the model could not be loaded.
    """
    self.instance = jamspell.TSpellCorrector()
    loaded = self.instance.LoadLangModel(path)
    # Only an explicit False is treated as failure, so a binding that returns
    # nothing keeps working as before.
    if loaded is False:
      raise Exception("Cannot load JamSpell model from file '%s'" % path)

  def correct(self, text):
    """Return ``text`` with spelling corrected by the underlying corrector."""
    return self.instance.FixFragment(text)

In [0]:
# Build the spell checker from the model file 'ru_small.bin'.
# NOTE(review): presumably this file comes from the ru.tar.gz archive extracted earlier — confirm.
spellchecker = Jamspell('ru_small.bin')

## Preparing vectors

In [0]:
# Download the word-vector archive and unpack it; the captured output shows it
# contains README, meta.json, model.bin and model.txt.
!wget 'http://vectors.nlpl.eu/repository/11/185.zip'
!unzip '185.zip'

--2019-12-24 23:05:07--  http://vectors.nlpl.eu/repository/11/185.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 639268530 (610M) [application/zip]
Saving to: ‘185.zip’


2019-12-24 23:06:19 (8.61 MB/s) - ‘185.zip’ saved [639268530/639268530]

Archive:  185.zip
  inflating: README                  
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               


In [0]:
!wget https://www.dropbox.com/s/q1w4ftavn69383c/mean_vectors.pk?dl=0
!mv mean_vectors.pk?dl=0 mean_vectors.pk

--2019-12-24 23:06:37--  https://www.dropbox.com/s/q1w4ftavn69383c/mean_vectors.pk?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.1, 2620:100:6032:1::a27d:5201
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/q1w4ftavn69383c/mean_vectors.pk [following]
--2019-12-24 23:06:37--  https://www.dropbox.com/s/raw/q1w4ftavn69383c/mean_vectors.pk
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc0cb645cc2bd8a883d6af9b8b99.dl.dropboxusercontent.com/cd/0/inline/Au3vs6uHNXM3AQVA9NIy18KeD2Xj6Bxqe0910MhnYk6JOEOrvFxehySjPnRsMkM9sy9FFzPHKw9P7vaZPn6XrRhKLHq7xaicAeRR_ALj6MMm2rjz3G7t8IrqYHrSLh1NOvU/file# [following]
--2019-12-24 23:06:37--  https://uc0cb645cc2bd8a883d6af9b8b99.dl.dropboxusercontent.com/cd/0/inline/Au3vs6uHNXM3AQVA9NIy18KeD2Xj6Bxqe0910MhnYk6JOEOrvFxehySjPnRsMkM9sy9FFzPHKw9P7vaZPn6XrRhKLHq7xaicA

In [0]:
from gensim.models.keyedvectors import KeyedVectors

# Load the binary word2vec-format vectors from the unpacked 'model.bin'.
w2v_model = KeyedVectors.load_word2vec_format('model.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Downloading wordlists

In [0]:
!wget https://www.dropbox.com/s/x6yiaveef5snf94/lexicon_by_parts.zip?dl=0
!mv lexicon_by_parts.zip?dl=0 lexicon_by_parts.zip
!unzip lexicon_by_parts.zip

--2019-12-24 23:06:55--  https://www.dropbox.com/s/x6yiaveef5snf94/lexicon_by_parts.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.1, 2620:100:6032:1::a27d:5201
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/x6yiaveef5snf94/lexicon_by_parts.zip [following]
--2019-12-24 23:06:56--  https://www.dropbox.com/s/raw/x6yiaveef5snf94/lexicon_by_parts.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc60886dd47cfc5fb06002622b40.dl.dropboxusercontent.com/cd/0/inline/Au2ovrhM5fzmN4kxz_cGJfLeFHkxYHlX3C4_wfnCcbqyVwV0gxGr-K_bkbsnFg1-yPXrn_I1UJpZn1gnNfqvTcbUzJ8ZMxPxvP0qCX0Y6u7aqSNCjPAQhBRQgno9yilG1w4/file# [following]
--2019-12-24 23:06:56--  https://uc60886dd47cfc5fb06002622b40.dl.dropboxusercontent.com/cd/0/inline/Au2ovrhM5fzmN4kxz_cGJfLeFHkxYHlX3C4_wfnCcbqyVwV0gxGr-K_bkbsnFg1-yPXrn_I1UJpZn1gnNf

## Building evaluator pipeline

In [0]:
import os
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
class Evaluator:
  """Labels words of a text with an aspect (Food/Service) and a polarity (0/1).

  A word is labelled when its "lemma_POS" key is in the matching unigram
  lexicon, when it is close enough to the matching mean vector, or when it is
  part of an adjacent word pair found in the matching bigram lexicon.

  Internally each result is a tab-separated string whose sentence and word
  numbers are encoded as single characters (``chr(self._upch + number)``), so
  that de-duplicated results can be sorted as plain strings; ``printstr``
  decodes them back to decimal numbers.
  """

  # Thresholds used by vector_comparison.
  _MIN_SIMILARITY = 0.5  # minimum cosine similarity to the target mean vector
  _MIN_MARGIN = 0.2      # minimum lead over the competing mean vector

  def load_wordlist_uni(self, path):
    """Return the set of second tab-separated fields of every non-empty line."""
    with open(path, "r", encoding="utf-8") as inf:
      return {line.split("\t")[1] for line in inf.read().split("\n") if line}

  def load_wordlist_bi(self, path):
    """Return the set of evaluated second tab-separated fields of every non-empty line.

    process_bi looks these entries up with 2-tuples of "lemma_POS" strings.
    """
    # SECURITY(review): eval() executes arbitrary code from the file. Only use
    # with trusted lexicon files; consider ast.literal_eval once the file
    # format is confirmed to contain plain literals.
    with open(path, "r", encoding="utf-8") as inf:
      return {eval(line.split("\t")[1]) for line in inf.read().split("\n") if line}

  def __init__(self, ud_model, w2v_model, spellchecker, lists_path, vectors_path):
    """Load lexicons from ``lists_path`` and mean vectors from ``vectors_path``.

    Args:
      ud_model: object with ``parse_text(text)`` returning
        {sentence id: {index: word}}.
      w2v_model: mapping from "lemma_POS" key to vector; a missing key must
        raise KeyError.
      spellchecker: object with ``correct(text)``.
      lists_path: directory holding the eight lexicon files.
      vectors_path: pickle file with keys 'pos_food', 'neg_food',
        'pos_service', 'neg_service'.
    """
    # Offset added to sentence/word numbers before encoding them with chr().
    self._upch = 200
    self.food_0_uni = self.load_wordlist_uni(os.path.join(lists_path, "unigrams_food_negative.txt"))
    self.food_1_uni = self.load_wordlist_uni(os.path.join(lists_path, "unigrams_food_positive.txt"))
    self.food_0_bi = self.load_wordlist_bi(os.path.join(lists_path, "bigrams_food_negative.txt"))
    self.food_1_bi = self.load_wordlist_bi(os.path.join(lists_path, "bigrams_food_positive.txt"))
    self.service_0_uni = self.load_wordlist_uni(os.path.join(lists_path, "unigrams_service_negative.txt"))
    self.service_1_uni = self.load_wordlist_uni(os.path.join(lists_path, "unigrams_service_positive.txt"))
    self.service_0_bi = self.load_wordlist_bi(os.path.join(lists_path, "bigrams_service_negative.txt"))
    self.service_1_bi = self.load_wordlist_bi(os.path.join(lists_path, "bigrams_service_positive.txt"))
    self.model = ud_model
    self.spellchecker = spellchecker
    self.w2v_model = w2v_model
    # SECURITY(review): pickle.load can execute arbitrary code; only load
    # vector files from a trusted source.
    with open(vectors_path, 'rb') as f:
      mean_vectors_dict = pickle.load(f)

    self.pos_food = mean_vectors_dict['pos_food']
    self.neg_food = mean_vectors_dict['neg_food']
    self.pos_service = mean_vectors_dict['pos_service']
    self.neg_service = mean_vectors_dict['neg_service']

  def printstr(self, coded_str):
    """Decode the first two (chr-encoded) fields of a result line to decimal numbers."""
    fields = coded_str.split("\t")
    sent_id = ord(fields[0]) - self._upch
    word_id = ord(fields[1]) - self._upch
    return "\t".join([str(sent_id), str(word_id), "\t".join(fields[2:])])

  def vector_comparison(self, mean_vec_spec, mean_vec_comp, word_lemma):
    """Tell whether ``word_lemma`` is clearly closer to ``mean_vec_spec``.

    Returns True when the cosine similarity to ``mean_vec_spec`` exceeds
    _MIN_SIMILARITY and beats the similarity to ``mean_vec_comp`` by more than
    _MIN_MARGIN. Words missing from the vector model never match.
    """
    try:
      grammeme_vec = self.w2v_model[word_lemma]
    except KeyError:
      # An unknown word used to be replaced by a zero vector, whose cosine
      # similarity is 0 and therefore never passes the threshold.
      return False
    grammeme_vec = grammeme_vec.reshape(1, -1)

    cos_sim_spec = cosine_similarity(mean_vec_spec.reshape(1, -1), grammeme_vec)[0][0]
    cos_sim_comp = cosine_similarity(mean_vec_comp.reshape(1, -1), grammeme_vec)[0][0]

    return bool(cos_sim_spec > self._MIN_SIMILARITY
                and (cos_sim_spec - cos_sim_comp) > self._MIN_MARGIN)

  def process_uni(self, word):
    """Return encoded "<word>\\t<aspect>\\t<polarity>" lines for a single word."""
    grammeme = word.lemma + "_" + word.pos
    marker = chr(self._upch + word.position)
    # (aspect, polarity, lexicon, target mean vector, competing mean vector)
    checks = (
      ("Food", "0", self.food_0_uni, self.neg_food, self.pos_food),
      ("Food", "1", self.food_1_uni, self.pos_food, self.neg_food),
      ("Service", "0", self.service_0_uni, self.neg_service, self.pos_service),
      ("Service", "1", self.service_1_uni, self.pos_service, self.neg_service),
    )
    out = []
    for aspect, polarity, lexicon, target_vec, other_vec in checks:
      # The lexicon lookup comes first so the vector comparison only runs for
      # words that are not listed.
      if grammeme in lexicon or self.vector_comparison(target_vec, other_vec, grammeme):
        out.append(marker + "\t" + aspect + "\t" + polarity)
    return out

  def process_bi(self, bigram):
    """Return encoded lines for both words of a pair found in a bigram lexicon."""
    first, second = bigram[0], bigram[1]
    gramm_pair = (first.lemma + "_" + first.pos, second.lemma + "_" + second.pos)
    checks = (
      ("Food", "0", self.food_0_bi),
      ("Food", "1", self.food_1_bi),
      ("Service", "0", self.service_0_bi),
      ("Service", "1", self.service_1_bi),
    )
    out = []
    for aspect, polarity, lexicon in checks:
      if gramm_pair in lexicon:
        label = "\t" + aspect + "\t" + polarity
        out.append(chr(self._upch + first.position) + label)
        out.append(chr(self._upch + second.position) + label)
    return out

  def process_text(self, text):
    """Spell-correct and parse ``text``; return its sorted, de-duplicated labels.

    Returns:
      Newline-joined lines "sentence\\tword\\taspect\\tpolarity".
    """
    text = self.spellchecker.correct(text)
    text_parse = self.model.parse_text(text)
    out = []
    for sent_id in text_parse:
      # Take the words in stored order rather than indexing by 0..n-1, and
      # skip sentences that contain no words.
      words = list(text_parse[sent_id].values())
      if not words:
        continue
      SID = chr(self._upch + sent_id)
      out += [SID + "\t" + res for res in self.process_uni(words[0])]
      for bigram in zip(words, words[1:]):
        out += [SID + "\t" + res for res in self.process_bi(bigram)]
        out += [SID + "\t" + res for res in self.process_uni(bigram[1])]
    out = sorted(set(out))
    return "\n".join(self.printstr(s) for s in out)

In [0]:
# Assemble the evaluator from the parser, word vectors and spell checker built
# above, plus the lexicon directory and mean-vector pickle paths.
evaluator = Evaluator(ud_model, w2v_model, spellchecker, './lexicon_by_parts', 'mean_vectors.pk')

## Predicting!

In [0]:
# Sample input for the evaluator: a short Russian dialogue transcript.
sample_text = """— Тут был хороший бо-орщь, с капусткой, но не красный. Так... Сасисачки. Ну, ещё есть какой-то непонятный салаД, куда крошаД морковку, капусту и яблоки с ананасами (смеётся). Вообще он меня бесит. Вот... Эщо чоо... Вкусный чай, он так уталяет жажду (машет руками), я чувствую себя человеком! Вот. Всё.
— (голос за кадром) Как фамилия ваша?
— Я Никита Литвинков. (реверанс)
— (голос за кадром) Спасибо..."""

In [0]:
# Each printed line is: sentence number, word number, aspect, polarity (tab-separated).
print(evaluator.process_text(sample_text))

1	3	Service	1
1	4	Service	1
8	1	Food	1
11	15	Service	1
