# Установка зависимостей spacy
В Google Colab *spacy* предустановлен, но нам нужно обновить его до версии >= 3.0.

Также тут краткая проверка работоспособности установленных моделей

In [1]:
!pip install --upgrade spacy
!python -m spacy download ru_core_news_sm
#!python -m spacy download ru_core_news_md
!python -m spacy download en_core_web_sm

Collecting ru-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0-py3-none-any.whl (16.4 MB)
[K     |████████████████████████████████| 16.4 MB 33 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
!git clone -b v2.1 https://github.com/buriy/spacy-ru.git
!cp -r ./spacy-ru/ru2/ ./ru2/

fatal: destination path 'spacy-ru' already exists and is not an empty directory.


In [3]:
import spacy
#import ru2
from spacy import displacy

# Russian pipeline used by the rest of the notebook
# ('ru_core_news_sm' is the alternative package that can be loaded instead).
nlp_ru = spacy.load('ru_core_news_md')

# Show the names of the components the loaded pipeline runs.
print(f"Russian pipeline: {nlp_ru.pipe_names}")

Russian pipeline: ['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
import spacy
from spacy import displacy

# English pipeline used for the FrameNet source sentences.
nlp_en = spacy.load("en_core_web_sm")

# Show the names of the components the loaded pipeline runs.
print(f"English pipeline: {nlp_en.pipe_names}")

English pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [5]:
example_en = "Some of the strongest critics of our welfare system are the people who have become dependent on it ."
doc_en = nlp_en(example_en)

# One output line per token with, in order: text, lemma_, pos_, tag_, dep_,
# shape_, is_alpha, is_stop and the list of direct children.
for token in doc_en:
    token_fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        list(token.children),
    )
    print(*token_fields)

Some some PRON DT nsubj Xxxx True True [of]
of of ADP IN prep xx True True [critics]
the the DET DT det xxx True True []
strongest strong ADJ JJS amod xxxx True False []
critics critic NOUN NNS pobj xxxx True False [the, strongest, of]
of of ADP IN prep xx True True [system]
our our PRON PRP$ poss xxx True True []
welfare welfare NOUN NN compound xxxx True False []
system system NOUN NN pobj xxxx True False [our, welfare]
are be AUX VBP ROOT xxx True True [Some, people, .]
the the DET DT det xxx True True []
people people NOUN NNS attr xxxx True False [the, become]
who who PRON WP nsubj xxx True True []
have have AUX VBP aux xxxx True True []
become become VERB VBN relcl xxxx True True [who, have, dependent]
dependent dependent ADJ JJ acomp xxxx True False [on]
on on ADP IN prep xx True True [it]
it it PRON PRP pobj xx True True []
. . PUNCT . punct . False False []


In [6]:
import pandas as pd

# Column headers of the per-token overview table.
cols = (
    "text", "lemma", "POS", "explain_pos", "tag",
    "dep", "explain_dep", "shape", "stopword", "children",
)

# One row per token of the English example; spacy.explain() supplies the
# human-readable description of the POS and dependency labels.
rows = [
    [
        t.text,
        t.lemma_,
        t.pos_,
        spacy.explain(t.pos_),
        t.tag_,
        t.dep_,
        spacy.explain(t.dep_),
        t.shape_,
        t.is_stop,
        list(t.children),
    ]
    for t in doc_en
]

df = pd.DataFrame(rows, columns=cols)

# Last expression of the cell, so the notebook displays the table.
df

Unnamed: 0,text,lemma,POS,explain_pos,tag,dep,explain_dep,shape,stopword,children
0,Some,some,PRON,pronoun,DT,nsubj,nominal subject,Xxxx,True,[of]
1,of,of,ADP,adposition,IN,prep,prepositional modifier,xx,True,[critics]
2,the,the,DET,determiner,DT,det,determiner,xxx,True,[]
3,strongest,strong,ADJ,adjective,JJS,amod,adjectival modifier,xxxx,False,[]
4,critics,critic,NOUN,noun,NNS,pobj,object of preposition,xxxx,False,"[the, strongest, of]"
5,of,of,ADP,adposition,IN,prep,prepositional modifier,xx,True,[system]
6,our,our,PRON,pronoun,PRP$,poss,possession modifier,xxx,True,[]
7,welfare,welfare,NOUN,noun,NN,compound,compound,xxxx,False,[]
8,system,system,NOUN,noun,NN,pobj,object of preposition,xxxx,False,"[our, welfare]"
9,are,be,AUX,auxiliary,VBP,ROOT,,xxx,True,"[Some, people, .]"


Тест chunk'ов

Сразу стоит отметить, что chunk'и реализованы для английского языка, а для русского языка их реализации нет.

In [7]:
chunks_cols = ['Chunk', '.root', 'root.dep_', '.root.head']

# For every noun chunk of the English example keep, in order: the chunk span,
# its root token, the root's dependency label and the root's head token.
chunks_rows = [
    [np_chunk, np_chunk.root, np_chunk.root.dep_, np_chunk.root.head]
    for np_chunk in doc_en.noun_chunks
]

chunk_df = pd.DataFrame(chunks_rows, columns=chunks_cols)

# Last expression of the cell, so the notebook displays the table.
chunk_df

Unnamed: 0,Chunk,.root,root.dep_,.root.head
0,(Some),Some,nsubj,are
1,"(the, strongest, critics)",critics,pobj,of
2,"(our, welfare, system)",system,pobj,of
3,"(the, people)",people,attr,are
4,(who),who,nsubj,become
5,(it),it,pobj,on


In [8]:
# Draw the dependency parse (style="dep") of the English example in the notebook.
displacy.render(doc_en, style="dep", jupyter=True)

In [9]:
example_ru = "Некоторые из самых сильных критиков нашей системы социального обеспечения - это люди, которые стали зависимыми от нее ."
doc_ru = nlp_ru(example_ru)

# One output line per token with, in order: text, lemma_, pos_, tag_, dep_,
# shape_, is_alpha, is_stop and the list of direct children.
for token in doc_ru:
    token_fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        list(token.children),
    )
    print(*token_fields)

Некоторые некоторый DET DET nsubj Xxxxx True False [критиков]
из из ADP ADP case xx True True []
самых самых ADJ ADJ amod xxxx True False []
сильных сильный ADJ ADJ amod xxxx True False [самых]
критиков критик NOUN NOUN nmod xxxx True False [из, сильных, системы]
нашей наш DET DET det xxxx True True []
системы система NOUN NOUN nmod xxxx True False [нашей, обеспечения]
социального социальный ADJ ADJ amod xxxx True False []
обеспечения обеспечение NOUN NOUN nmod xxxx True False [социального]
- - PUNCT PUNCT punct - False False []
это это PART PART nsubj xxx True True []
люди человек NOUN NOUN ROOT xxxx True False [Некоторые, -, это, стали, .]
, , PUNCT PUNCT punct , False False []
которые которые PRON PRON nsubj xxxx True True []
стали стать VERB VERB acl:relcl xxxx True False [,, которые, зависимыми]
зависимыми зависимый ADJ ADJ xcomp xxxx True False [нее]
от от ADP ADP case xx True True []
нее нее PRON PRON obl xxx True True [от]
. . PUNCT PUNCT punct . False False []


In [10]:
# Draw the dependency parse (style="dep") of the Russian example in the notebook.
displacy.render(doc_ru, style="dep", jupyter=True)

# Установка зависимостей Google Translate

В этом скрипте используется Google Translate, так как к нему легче получить доступ: в отличие от Yandex Translate API, скрипты не обязательно запускать из окружения Yandex Cloud.

In [11]:
# Install deep-translator, upgrading it if it is already present (-U).
!pip install -U deep-translator



In [12]:
from deep_translator import GoogleTranslator

# NOTE(review): the proxy address is hard-coded; presumably a public proxy that
# may stop working at any time -- confirm, or make it configurable.
proxy_address = "45.42.177.37:3128"
proxies = {"https": proxy_address, "http": proxy_address}

translator = GoogleTranslator(source="en", target="ru", proxies=proxies)

# Print the machine translation of the English example, followed by the
# Russian example sentence, so the two can be compared by eye.
machine_ru = translator.translate(example_en)
print(machine_ru)
print(example_ru)

Одними из самых сильных критиков нашей системы социального обеспечения являются люди, которые стали от нее зависеть.
Некоторые из самых сильных критиков нашей системы социального обеспечения - это люди, которые стали зависимыми от нее .


# Загрузка данных
Сначала загрузим следующие 2 датасета:
1. Датасет Framenet 1.7 в формате Conll-09
2. Датасет переводов предложений Framenet, составленный с помощью Yandex Translate API

Также мы установим пакет nltk и загрузим в него данные framenet, так как nltk дает нам удобную программную оболочку для работы с Framenet.

In [13]:
import os
from google.colab import drive
drive.mount('/content/drive')

# Загрузка файлов, если их еще нет в текущем каталоге
if not os.path.exists('./fn1.7/'):
    if not os.path.exists('./fn.1.7.zip'):
        !cp "/content/drive/My Drive/Framenet/fn1.7.zip" .
    !unzip fn1.7.zip

if not os.path.exists('./fn1-7-yandex-translated/'):
    if not os.path.exists('./fn1-7-yandex-translated.zip'):
        !cp "/content/drive/My Drive/Framenet/fn1-7-yandex-translated.zip" .
    !unzip fn1-7-yandex-translated.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Парсинг данных
Здесь мы объединяем данные из Framenet с данными из файлов с переводами предложений

In [14]:
import json

# Corpus splits to process: each entry names the split and pairs its CoNLL
# file with the JSON file holding the translated sentences.
framenet_parts = [
    dict(
        name="dev",
        conll_filename="/content/fn1.7/fn1.7.dev.syntaxnet.conll",
        translated_filename="/content/fn1-7-yandex-translated/fn1-7-dev-yandex-translated-russian-sents.json",
    ),
]

def _build_frame_example(translated, example_number, sentence, lu_labels,
                         frame_labels, fe_labels):
    """Assemble the record for one sentence read from the CoNLL file."""
    entry = translated[str(example_number)]
    return {
        "en_tokenized": sentence,
        "en": entry["en"],
        "ru": entry["ru"],
        "lu_labels": lu_labels,
        "frame_labels": frame_labels,
        "fe_labels": fe_labels,
    }


def parse_framenet(conll_filename, translated_filename, conll_sep="\t"):
    """Merge a FrameNet CoNLL file with the file of translated sentences.

    The CoNLL file is read line by line; blank lines separate sentences.
    From every token line the following columns (0-based) are used:

    * 1  -- the token; its first two and its last character are dropped
            (presumably a ``b'...'`` wrapper -- TODO confirm against the data),
    * 6  -- the example number, used as the key into the translations,
    * 12 -- lexical-unit label, 13 -- frame label, 14 -- frame-element label.

    Args:
        conll_filename: path to the CoNLL file (UTF-8).
        translated_filename: path to a JSON file mapping the example number
            (as a string) to an object with ``"en"`` and ``"ru"`` entries.
        conll_sep: column separator of the CoNLL file.

    Returns:
        A list with one dict per sentence, holding the keys ``en_tokenized``,
        ``en``, ``ru``, ``lu_labels``, ``frame_labels`` and ``fe_labels``.

    Raises:
        AssertionError: if a token line has fewer than 15 columns or the
            example number changes inside a sentence.
        KeyError: if an example number is missing from the translations.
    """
    with open(conll_filename, "r", encoding="utf-8") as f_conll, \
            open(translated_filename, "r", encoding="utf-8") as f_translated:
        translated = json.load(f_translated)

        result = []
        sentence, lu_labels, frame_labels, fe_labels = [], [], [], []
        frame_example_number = None  # not known until the first token line

        for line in f_conll:
            if not line.strip():
                # Sentence separator. Always skip the line itself, even when
                # no sentence is pending (leading or repeated blank lines).
                if sentence:
                    result.append(_build_frame_example(
                        translated, frame_example_number, sentence,
                        lu_labels, frame_labels, fe_labels))
                    sentence, lu_labels, frame_labels, fe_labels = [], [], [], []
                    frame_example_number = None
                continue

            cols = line.strip().split(conll_sep)
            assert len(cols) >= 15, \
                f"expected at least 15 columns, got {len(cols)}: {line!r}"

            sentence.append(cols[1][2:-1])

            if frame_example_number is None:
                frame_example_number = cols[6]
            else:
                assert cols[6] == frame_example_number, \
                    f"example number changed inside a sentence: {line!r}"

            lu_labels.append(cols[12])
            frame_labels.append(cols[13])
            fe_labels.append(cols[14])

        # The file may end without a trailing blank line; keep the last sentence.
        if sentence:
            result.append(_build_frame_example(
                translated, frame_example_number, sentence,
                lu_labels, frame_labels, fe_labels))

    return result

# Parse every corpus split and store the resulting sentence list on its entry.
for part in framenet_parts:
    parsed = parse_framenet(
        conll_filename=part["conll_filename"],
        translated_filename=part["translated_filename"],
    )
    part["parsed"] = parsed

#framenet_parts[0]["parsed"][0]

# Вспомогательные функции для преобразования корпуса

In [20]:
def localize_frame_indexes(parsed, inplace=False):
    """Return the token positions that carry a frame annotation.

    A position is kept when its frame-element label differs from "O" or,
    failing that, when its frame label differs from "_".

    Args:
        parsed: dict with the parallel lists "frame_labels" and "fe_labels".
        inplace: when true, the result is also stored under
            parsed["local_indexes"].

    Returns:
        The list of matching positions in ascending order.
    """
    frame_labels = parsed["frame_labels"]
    fe_labels = parsed["fe_labels"]

    marked = [
        pos
        for pos, fe_label in enumerate(fe_labels)
        if fe_label != "O" or frame_labels[pos] != "_"
    ]

    if inplace:
        parsed["local_indexes"] = marked
    return marked

def list_indexes(token):
    """Return the sorted indexes of `token` and all of its descendants.

    The traversal starts at `token` and follows `.children` repeatedly;
    every visited token contributes its `.i`.

    Args:
        token: an object exposing `.i` and an iterable `.children` of
            objects of the same kind.

    Returns:
        A sorted list of unique indexes.
    """
    # A set gives O(1) "already visited" checks; each index is expanded at
    # most once, so the walk terminates even if the same index shows up again.
    seen = {token.i}
    pending = [token]

    while pending:
        current = pending.pop()
        for child in current.children:
            if child.i not in seen:
                seen.add(child.i)
                pending.append(child)

    return sorted(seen)

def find_root(syntax):
    """Return the first token of `syntax` whose `dep_` equals "ROOT".

    Args:
        syntax: an iterable of tokens exposing `.dep_`.

    Returns:
        The first token labelled "ROOT".

    Raises:
        IndexError: if `syntax` contains no such token (this includes an
            empty `syntax`).
    """
    for tok in syntax:
        if tok.dep_ == "ROOT":
            return tok
    # Same exception type as an out-of-range lookup, but with an explanation.
    raise IndexError('no token with dep_ == "ROOT" found')

def search_group_localization_by_syntax(parsed, group_indexes, syntax, root=None):
    """Find the lowest subtree of the parse that contains all `group_indexes`.

    Starting at the root, the search repeatedly steps into the first child
    whose subtree (as reported by `list_indexes`) covers every index of the
    group, and stops when no child does.

    Args:
        parsed: not used by this function; kept so existing calls keep working.
        group_indexes: iterable of token indexes that must be covered.
        syntax: parsed sentence; only consulted to find the root when `root`
            is not given.
        root: token to start from; defaults to `find_root(syntax)`.

    Returns:
        A tuple `(transitions, indexes)`: the `dep_` labels of the tokens
        stepped into, in order, and the sorted indexes of the subtree of the
        token where the search stopped.
    """
    # Compare with None explicitly: a token object may define its own
    # truthiness, so `not root` could discard a root that was passed in.
    if root is None:
        root = find_root(syntax)

    wanted = set(group_indexes)
    transitions = []
    cur = root

    while True:
        for child in cur.children:
            if wanted <= set(list_indexes(child)):
                # This child's subtree still holds the whole group: descend.
                cur = child
                transitions.append(child.dep_)
                break
        else:
            # No child covers the group, so `cur` is the lowest such token.
            break

    return transitions, list_indexes(cur)

example_parsed = {'en': 'Some of the strongest critics of our welfare system are the people who have become dependent on it .',
                  'en_tokenized': ['Some','of','the','strongest','critics','of','our','welfare','system','are','the','people','who','have','become','dependent','on','it','.'],
                  'fe_labels': ['S-Subset','O','B-Group','I-Group','I-Group','I-Group','I-Group','I-Group','I-Group','O','O','O','O','O','O','O','O','O','O'],
                  'frame_labels': ['_','Partitive','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_'],
                  'lu_labels': ['_','of.prep','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_','_'],
                  'ru': 'Некоторые из самых сильных критиков нашей системы социального обеспечения - это люди, которые стали зависимыми от нее.'}

# Positions of the tokens of the example that carry a frame annotation.
local_indexes = localize_frame_indexes(example_parsed)
# Pass the example record itself: `parsed` is the sentence list of the whole
# corpus split, not this example.
search_group_localization_by_syntax(example_parsed, local_indexes, doc_en)

of Some
critics of
the critics
strongest critics
of critics
system of
our system
welfare system
critics of
the critics
strongest critics
of critics
system of
our system
welfare system
of Some
critics of
the critics
strongest critics
of critics
system of
our system
welfare system


(['nsubj'], [0, 1, 2, 3, 4, 5, 6, 7, 8])