# Теги и словарь

In [0]:
# WordNet corpus reader; referenced as `wn` by the cells below to look up synsets.
from nltk.corpus import wordnet as wn

import nltk
# Fetch the 'wordnet' package into the local nltk_data directory (reported as already up-to-date if present).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
!pip install pptree



In [0]:
from pptree import *

# Model

In [0]:
!python -m deeppavlov install ner_ontonotes

2018-12-10 09:16:51.373 INFO in 'deeppavlov.core.common.file'['file'] at line 31: Interpreting 'ner_ontonotes' as '/usr/local/lib/python3.6/dist-packages/deeppavlov/configs/ner/ner_ontonotes.json'


In [0]:
# Allowed tag names, one inner list per tree depth (index 0 is the root).
# Hierarchy.build keeps a hyponym at depth d only when one of its lemma names
# (with '_' replaced by ' ') appears in levels[d].
levels = [
    ['entity'],
    ['physical entity', 'abstract entity'],
    ['physical object', 'physical process', 'psychological feature', 'attribute', 'measure'],
    [
        'geological formation', 'land', 'location', 'living thing', 'natural object', 'artifact',
        'economic process', 'human process', 'industrial process', 'natural process', 'organic process',
        'cognition', 'motivation', 'event', 'time', 'property', 'quality',
        'value', 'volume', 'time unit', 'time interval',
    ],
]

In [0]:
import random

In [0]:
class Node:
    """Inner node of the tag tree; wraps one synset.

    Attributes are plain instance fields:
      synset   -- the wrapped synset object (must provide ``lemma_names()``)
      parent   -- the parent node, or None for a root
      children -- list of child nodes, filled in as children are created
    """

    def __init__(self, synset, parent=None):
        self.synset = synset
        self.parent = parent
        self.children = []
        # Register with the parent so the tree can be walked top-down.
        if parent:
            parent.children.append(self)

    def __str__(self):
        """Return all lemma names of the synset joined by ', '."""
        lemma_names = self.synset.lemma_names()
        return ', '.join(lemma_names)
    
# Instead of a synset, a leaf stores a set ("dict") with the names of its
# parent, of all the parent's children, of their children, and so on.
class Leave:
    """Leaf of the tag tree holding the word set of its parent tag."""

    def __init__(self, tag_dict, parent):
        self.dict = tag_dict
        self.parent = parent
        self.children = []
        # The leaf becomes the parent's only child: any existing children are replaced.
        parent.children = [self]

    def __str__(self):
        """Return a short marker with the number of stored names."""
        return f'**leave: {len(self.dict)}'
    
def hypo_closure(synset):
    """Return the transitive closure of ``synset`` over the hyponym relation.

    The relation is spelled out here instead of relying on a module-level
    ``hypo`` helper, so the function works no matter which cells have been run.

    Parameters:
        synset: object providing ``closure(rel)`` and ``hyponyms()``
            (a WordNet synset in this notebook).

    Returns:
        Whatever ``synset.closure`` returns for the hyponym relation.
    """
    return synset.closure(lambda s: s.hyponyms())

def add_child(parent, synset):
    """Create a Node for ``synset`` attached under ``parent`` and return it."""
    return Node(synset, parent)

def add_dict_child(parent):
    """Attach a Leave to ``parent`` holding every word that belongs to its tag.

    The word set contains the lemma names of the parent's own synset plus the
    lemma names of every synset in its hyponym closure.

    Returns:
        The newly created Leave.
    """
    # Start with the parent's own names: the set holds all words of the parent tag.
    tag_dict = set(parent.synset.lemma_names())

    # Then add the names of every (transitive) hyponym.
    for hyponym in hypo_closure(parent.synset):
        tag_dict.update(hyponym.lemma_names())

    return Leave(tag_dict, parent)

In [0]:
def hypo(s):
    """Return the direct hyponyms of synset ``s``."""
    return s.hyponyms()


def hyper(s):
    """Return the direct hypernyms of synset ``s``."""
    return s.hypernyms()

class Hierarchy:
    """Tree of tags grown from a root synset along hyponym links.

    ``levels[d]`` lists the tag names accepted at depth ``d``. Below the
    deepest accepted tags hang Leave objects whose ``dict`` is the set of words
    mapped to that tag.

    Instance fields:
      levels      -- the per-depth lists of accepted names
      leaves      -- the current Leave objects (the active tag level)
      max_depth   -- depth of the tags the leaves currently hang from
      root_synset -- synset the tree was started from
      root        -- Node wrapping ``root_synset``
    """

    def __init__(self, levels, root_synset):
        self.levels = levels
        self.leaves = []
        self.max_depth = len(levels) - 1
        self.root_synset = root_synset
        self.root = Node(root_synset)

    def build(self, parent=None, depth=1):
        """Recursively grow the tree below ``parent``.

        A hyponym of ``parent`` is kept when one of its lemma names (with '_'
        replaced by ' ') is listed in ``levels[depth]``. Once ``depth`` exceeds
        ``max_depth`` a Leave is attached to ``parent`` instead.

        Returns:
            ``parent`` (the root when called without arguments), or None for
            the call that attaches a leaf.
        """
        if parent is None:
            parent = self.root

        if depth > self.max_depth:
            self.leaves.append(add_dict_child(parent))
            return

        allowed_names = set(self.levels[depth])
        for synset in hypo(parent.synset):
            names = {name.replace('_', ' ') for name in synset.lemma_names()}
            if allowed_names & names:
                self.build(add_child(parent, synset), depth + 1)

        return parent

    def get_tag(self, string):
        """Return a tag for ``string``, or 'O' when no leaf contains it.

        Spaces are replaced by '_' before the lookup. The tag is the first
        lemma name of the synset of the matching leaf's parent; when several
        leaves match, one of them is picked at random.
        """
        key = string.replace(' ', '_')
        candidates = [
            leave.parent.synset.lemma_names()[0]
            for leave in self.leaves
            if key in leave.dict
        ]
        return random.choice(candidates) if candidates else 'O'

    def decrease_level(self):
        """Move the active tag level one step towards the root.

        Every grandparent of a current leaf becomes a tag: it receives a new
        Leave whose word set is the union of the word sets found below it plus
        its own lemma names. The new Leave replaces the tag's children.

        Raises:
            ValueError: if the leaves already hang from the root level.
        """
        if self.max_depth <= 0:
            # The leaves' parents have no parent to merge into.
            raise ValueError('hierarchy is already at the root level')

        # dict.fromkeys drops duplicates; on Python 3.7+ (and CPython 3.6) it also
        # keeps first-seen order, unlike the arbitrary order of a set of nodes.
        new_tags = list(dict.fromkeys(leave.parent.parent for leave in self.leaves))

        new_leaves = []
        for new_tag in new_tags:
            new_tag_dict = set()
            for tag_child in new_tag.children:
                if not tag_child.children:
                    # This tag never received a leaf, so it has no words to contribute.
                    continue
                # A tag has exactly one child: its leaf.
                new_tag_dict |= tag_child.children[0].dict

            new_tag_dict |= set(new_tag.synset.lemma_names())
            new_leaves.append(Leave(new_tag_dict, new_tag))

        self.max_depth -= 1
        self.leaves = new_leaves
    

In [0]:
hierarchy = Hierarchy(levels, wn.synsets('entity')[0])
entity = hierarchy.build()

# whole = wn.synset('whole.n.02')
# Manually add the artifact, living_thing and natural_object synsets.
artifact = wn.synset('artifact.n.01')
living_thing = wn.synset('living_thing.n.01')
natural_object = wn.synset('natural_object.n.01')

# Hang the three synsets above under entity -> 'physical_entity' -> 'object, physical_object',
# each with its own leaf.
for top_node in entity.children:
    if str(top_node) != 'physical_entity':
        continue
    for mid_node in top_node.children:
        if str(mid_node) != 'object, physical_object':
            continue
        for extra_synset in (artifact, living_thing, natural_object):
            extra_node = add_child(mid_node, extra_synset)
            hierarchy.leaves.append(add_dict_child(extra_node))

# Hang time_period under entity -> 'abstraction, abstract_entity' -> 'measure, quantity, amount'.
for top_node in entity.children:
    if str(top_node) != 'abstraction, abstract_entity':
        continue
    for mid_node in top_node.children:
        if str(mid_node) != 'measure, quantity, amount':
            continue
        extra_node = add_child(mid_node, wn.synset('time_period.n.01'))
        hierarchy.leaves.append(add_dict_child(extra_node))
                    



In [0]:
#hierarchy.decrease_level()
#print_tree(hierarchy.root)

In [0]:
#hierarchy.decrease_level()
#hierarchy.get_tag('pure mathematics')

In [0]:
!wget https://github.com/Intelligent-Systems-Phystech/2018-Project-19/raw/master/X.txt

--2018-12-10 09:17:01--  https://github.com/Intelligent-Systems-Phystech/2018-Project-19/raw/master/X.txt
Resolving github.com (github.com)... 192.30.253.113, 192.30.253.112
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Intelligent-Systems-Phystech/2018-Project-19/master/X.txt [following]
--2018-12-10 09:17:02--  https://raw.githubusercontent.com/Intelligent-Systems-Phystech/2018-Project-19/master/X.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7965488 (7.6M) [text/plain]
Saving to: ‘X.txt.2’


2018-12-10 09:17:02 (71.1 MB/s) - ‘X.txt.2’ saved [7965488/7965488]



In [0]:

# Read the downloaded corpus; each line of X.txt is kept as one raw text.
# The encoding is given explicitly so decoding does not depend on the platform locale
# (assumes the file is UTF-8 -- TODO confirm).
with open('X.txt', encoding='utf-8') as f:
    content = f.readlines()

In [0]:
import re

# Lower-case every raw text.
texts = [one_text.lower() for one_text in content]

# Split each text into tokens: runs of word characters/apostrophes,
# or a single punctuation or quote character.
token_pattern = re.compile(r"[\w']+|[.,!?;\'\"]")
texts_array = [token_pattern.findall(lowered) for lowered in texts]

In [0]:
def tag_text(hierarchy, text):
    """Render a token list as 'token TAG' lines.

    Each '.' token becomes '. O' followed by an empty line. A pair of adjacent
    tokens that ``hierarchy.get_tag`` recognises as a phrase is written as
    'first B-<tag>' / 'second I-<tag>' using the tag of the phrase itself.
    Any other token is written with 'B-<tag>' when it has a tag and with 'O'
    otherwise. The result always ends with a closing '. O' line.

    ``get_tag`` is called once per lookup and the result reused, because it may
    pick randomly among several candidates.

    Parameters:
        hierarchy: object providing ``get_tag(str) -> str`` ('O' means no tag).
        text: list of tokens of one text.

    Returns:
        The markup as a single string.
    """
    token_count = len(text)
    # A trailing '.' is not tagged in the loop: the closing '. O' line stands for it.
    # Any other last token is tagged like the rest instead of being dropped.
    if token_count and text[-1] == '.':
        stop = token_count - 1
    else:
        stop = token_count

    lines = []
    position = 0
    while position < stop:
        word = text[position]

        if word == '.':
            lines.append('. O\n\n')
            position += 1
            continue

        # Try the two-word phrase first, if there is a next token at all.
        if position + 1 < token_count:
            phrase_tag = hierarchy.get_tag(word + ' ' + text[position + 1])
        else:
            phrase_tag = 'O'

        if phrase_tag != 'O':
            lines.append(word + ' B-' + phrase_tag + '\n')
            lines.append(text[position + 1] + ' I-' + phrase_tag + '\n')
            position += 2  # the second word of the phrase is already written
            continue

        word_tag = hierarchy.get_tag(word)
        if word_tag == 'O':
            lines.append(word + ' O\n')
        else:
            lines.append(word + ' B-' + word_tag + '\n')
        position += 1

    lines.append('. O\n\n')
    return ''.join(lines)


In [0]:
# For a quick check only the first 5 texts are tagged; the slice can be removed later.
markup_third_lvl = [tag_text(hierarchy, tokens) for tokens in texts_array[:5]]

In [0]:
#print(markup_third_lvl[0])


In [0]:
markup_second_lvl = []
# Note: decrease_level changes `hierarchy` in place, so this cell must run exactly once per level.
hierarchy.decrease_level()
markup_second_lvl.extend(tag_text(hierarchy, tokens) for tokens in texts_array[:5])

In [0]:
markup_first_lvl = []
# Note: decrease_level changes `hierarchy` in place, so this cell must run exactly once per level.
hierarchy.decrease_level()
markup_first_lvl.extend(tag_text(hierarchy, tokens) for tokens in texts_array[:5])

In [0]:
def splitter(markup, train_size=3, test_size=1):
    """Split ``markup`` into consecutive train, test and validation parts.

    Parameters:
        markup: sliceable sequence of tagged texts.
        train_size: number of leading items for the train part (default 3).
        test_size: number of following items for the test part (default 1).

    Returns:
        A ``(train, test, validation)`` tuple; validation is everything left
        after the train and test items.
    """
    test_end = train_size + test_size
    return markup[:train_size], markup[train_size:test_end], markup[test_end:]

In [0]:
import os

# Coarsest tag level first, so dataset_1 holds the first-level markup.
markups = [markup_first_lvl, markup_second_lvl, markup_third_lvl]

for i, markup in enumerate(markups):
    train, test, validation = splitter(markup)

    dest_dir = "dataset_" + str(i + 1)
    # exist_ok only tolerates an already existing directory; other errors still surface.
    os.makedirs(dest_dir, exist_ok=True)

    parts = (("train.txt", train), ("test.txt", test), ("validation.txt", validation))
    for file_name, part in parts:
        with open(os.path.join(dest_dir, file_name), "w", encoding="utf-8") as file:
            for j in part:
                # print appends a newline after every tagged text
                print(j, file=file)