In [None]:
# код из isanlp

class Segment:
    """An elementary unit rendered as an rs3 ``<segment>`` element.

    The constructor stores its arguments unchanged:

    * ``id``      -- identifier written to the ``id`` attribute
    * ``parent``  -- identifier of the node this segment attaches to, or
      ``None`` when the segment has no parent
    * ``relname`` -- relation label written to the ``relname`` attribute
    * ``text``    -- text placed between the opening and closing tags
    """

    def __init__(self, _id, parent, relname, text):
        self.id = _id
        self.parent = parent
        self.relname = relname
        self.text = text

    def __str__(self):
        """Return the ``<segment>`` tag; ``parent`` is left out only when it is ``None``."""
        # Compare with None explicitly: a parent id of 0 is falsy, and a plain
        # truthiness test would silently drop the link to that parent.
        if self.parent is not None:
            return f'<segment id="{self.id}" parent="{self.parent}" relname="{self.relname}">{self.text}</segment>'

        return f'<segment id="{self.id}" relname="{self.relname}">{self.text}</segment>'

class GroupCreator:
    """Callable factory producing ``Group`` objects with consecutive ids."""

    def __init__(self, _id):
        # Last id handed out so far; the first created group receives ``_id + 1``.
        self._id = _id

    def __call__(self, type, parent, relname):
        """Advance the id counter by one and return a new ``Group`` carrying that id."""
        next_id = self._id + 1
        self._id = next_id
        return Group(next_id, type, parent, relname)


class Group:
    """A non-leaf node rendered as a self-closing rs3 ``<group>`` element.

    Stores ``id``, ``type``, ``parent`` and ``relname`` exactly as passed in.
    """

    def __init__(self, _id, type, parent, relname):
        self.id, self.type = _id, type
        self.parent, self.relname = parent, relname

    def __str__(self):
        """Return the ``<group .../>`` tag with all four attributes."""
        attributes = (
            f'id="{self.id}"',
            f'type="{self.type}"',
            f'parent="{self.parent}"',
            f'relname="{self.relname}"',
        )
        return '<group ' + ' '.join(attributes) + '/>'


class Root(Group):
    """Top-level group: type ``"span"``, parent ``-1``, relname ``"span"``.

    Its string form carries only ``id`` and ``type``.
    """

    def __init__(self, _id):
        super().__init__(_id, "span", -1, "span")

    def __str__(self):
        """Return the ``<group>`` tag without ``parent``/``relname`` attributes."""
        return '<group id="{}" type="{}"/>'.format(self.id, self.type)


class Exporter:
    """Write a single binary discourse tree to an rs3 file.

    Tree nodes are read through the attributes ``id``, ``text``, ``relation``,
    ``nuclearity``, ``left`` and ``right``. A node whose ``left`` is falsy is
    handled as a leaf (elementary unit).
    """

    def __init__(self, encoding='utf-8'):
        self._encoding = encoding

    def __call__(self, tree, filename):
        """Write ``tree`` to ``filename``: ``<rst>``, header, body, ``</rst>``."""
        with open(filename, 'w', encoding=self._encoding) as out:
            out.write('<rst>\n')
            for render in (self.make_header, self.make_body):
                out.write(render(tree))
            out.write('</rst>')

    def compile_relation_set(self, tree):
        """Return ``"<relation>_<nuclearity>"`` labels for ``tree`` and its non-leaf descendants.

        The label of ``tree`` itself always comes first; children contribute
        only when they have a left child of their own. Duplicates are kept.
        """
        labels = ['_'.join((tree.relation, tree.nuclearity))]
        if tree.left:
            for child in (tree.left, tree.right):
                if child.left:
                    labels.extend(self.compile_relation_set(child))
        return labels

    def make_header(self, tree):
        """Build the ``<header>`` block declaring every relation label used in ``tree``."""
        # The label of a bare leaf ("elementary__") is declared as a
        # multinuclear "antithesis" relation instead.
        relations = [
            "antithesis_NN" if label == "elementary__" else label
            for label in set(self.compile_relation_set(tree))
        ]

        lines = ['\t<header>\n', '\t\t<relations>\n']
        for rel in relations:
            _relname, _type = rel.split('_')[:2]
            kind = 'multinuc' if _type == 'NN' else 'rst'
            lines.append(f'\t\t\t<rel name="{_relname}" type="{kind}" />\n')
        lines.append('\t\t</relations>\n')
        lines.append('\t</header>\n')

        return ''.join(lines)

    def _collect_child(self, child, parent, relname, groups, edus):
        """Record ``child`` as a segment (leaf) or as a group plus its whole subtree."""
        if not child.left:
            edus.append(Segment(child.id, parent, relname, child.text))
            return

        # Non-leaf children are always emitted with type 'span'.
        groups.append(Group(child.id, 'span', parent, relname))
        sub_groups, sub_edus = self.get_groups_and_edus(child)
        groups.extend(sub_groups)
        edus.extend(sub_edus)

    def get_groups_and_edus(self, tree):
        """Return ``(groups, edus)`` describing everything below ``tree``.

        ``tree`` itself is not emitted unless it is a leaf, in which case the
        result is a single parentless ``'antithesis'`` segment.
        """
        groups, edus = [], []

        if not tree.left:
            edus.append(Segment(tree.id, parent=None, relname='antithesis', text=tree.text))
            return groups, edus

        # Left child: with "SN" it hangs off its right sibling under the
        # tree's relation; with "NS" it is a 'span' of the tree; with any
        # other nuclearity it hangs off the tree under the tree's relation.
        if tree.nuclearity == "SN":
            parent, relname = tree.right.id, tree.relation
        elif tree.nuclearity == "NS":
            parent, relname = tree.id, 'span'
        else:
            parent, relname = tree.id, tree.relation
        self._collect_child(tree.left, parent, relname, groups, edus)

        # Right child: the mirror image of the left-child rules.
        if tree.nuclearity == "SN":
            parent, relname = tree.id, 'span'
        elif tree.nuclearity == "NS":
            parent, relname = tree.left.id, tree.relation
        else:
            parent, relname = tree.id, tree.relation
        self._collect_child(tree.right, parent, relname, groups, edus)

        return groups, edus

    def make_body(self, tree):
        """Build the ``<body>`` block: all segments first, then all groups."""
        groups, edus = self.get_groups_and_edus(tree)
        if len(edus) > 1:
            # More than one segment: add the root group for the tree itself.
            groups.append(Root(tree.id))

        rows = ['\t\t' + str(node) + '\n' for node in edus + groups]
        return '\t<body>\n' + ''.join(rows) + '\t</body>\n'

class ForestExporter:
    """Write several discourse trees into one rs3 file.

    Per-tree work is delegated to an ``Exporter`` instance created with the
    same encoding; this class merges the per-tree results into one header and
    one body.
    """

    def __init__(self, encoding='utf-8'):
        self._encoding = encoding
        self._tree_exporter = Exporter(self._encoding)

    def __call__(self, trees, filename):
        """Write ``trees`` (an iterable of trees) to ``filename``."""
        with open(filename, 'w', encoding=self._encoding) as fo:
            fo.write('<rst>\n')
            fo.write(self.make_header(trees))
            fo.write(self.make_body(trees))
            fo.write('</rst>')

    def compile_relation_set(self, trees):
        """Return the relation labels of all ``trees``, de-duplicated per tree.

        The leaf label ``"elementary__"`` is replaced by ``"antithesis_NN"``.
        Labels shared by different trees may still repeat in the result.
        """
        result = []

        for tree in trees:
            result += list(set(self._tree_exporter.compile_relation_set(tree)))

        return [value if value != "elementary__" else "antithesis_NN" for value in result]

    def make_header(self, trees):
        """Build the ``<header>`` block declaring each distinct relation label once."""
        relations = set(self.compile_relation_set(trees))

        lines = ['\t<header>\n', '\t\t<relations>\n']
        for rel in relations:
            # Keep only the first two '_'-separated fields, as
            # Exporter.make_header does; unpacking the full split raised
            # ValueError for labels containing more than one underscore.
            _relname, _type = rel.split('_')[:2]
            _type = 'multinuc' if _type == 'NN' else 'rst'
            lines.append(f'\t\t\t<rel name="{_relname}" type="{_type}" />\n')
        lines.append('\t\t</relations>\n')
        lines.append('\t</header>\n')

        return ''.join(lines)

    def make_body(self, trees):
        """Build the ``<body>`` block: all segments of all trees, then all groups."""
        groups, edus = [], []

        for tree in trees:
            _groups, _edus = self._tree_exporter.get_groups_and_edus(tree)
            if len(_edus) > 1:
                # A tree with more than one segment also gets its root group.
                _groups.append(Root(tree.id))
            groups += _groups
            edus += _edus

        rows = ['\t\t' + str(node) + '\n' for node in edus + groups]
        result = '\t<body>\n' + ''.join(rows) + '\t</body>\n'

        # Horizontal bar (U+2015) is written out as a plain hyphen.
        return result.replace('\u2015', '-')

In [None]:
import copy
import os
import pickle

import pandas as pd

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

In [None]:
# NOTE(review): unfinished cell -- the `with` statement below has no body, so
# running it fails with a syntax error. Presumably the pickled object was meant
# to be read from `f` for inspection; complete or delete this cell -- TODO confirm.
with open(r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде\tree rule_3_ar_result_micro_k017', 'br') as f:
    

In [None]:
# Unpickle every file in the data directory and sort the objects by file name:
# names containing 'tree' go to `trees`, otherwise names containing 'result'
# go to `results`, everything else goes to `texts`.
# NOTE: pickle.load executes arbitrary code from the file; only use trusted data.
texts, trees, results = {}, {}, {}
names = os.listdir(r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде')
for name in names:
    with open(r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде' + '\\' + name, 'br') as f:
        obj = pickle.load(f)
    if 'tree' in name:
        trees[name] = obj
    elif 'result' in name:
        results[name] = obj
    else:
        texts[name] = obj

In [None]:
# Verb lemmas that rule_3 looks for inside a span:
# want, think (perfective), think, say, decide.
c_verbs = {"хотеть", "подумать", "думать", "сказать", "решить"}

In [None]:
def match_lemmas(tokens, lemmas):
    """Pair each token's ``text`` with a lemma and return the pairs as a dict.

    ``lemmas`` is a sequence of per-sentence lemma lists; it is flattened and
    zipped positionally with ``tokens``. Pairing stops at the shorter of the
    two, and a token text that occurs several times keeps its last lemma.
    """
    flat_lemmas = [lemma for sentence in lemmas for lemma in sentence]
    surface_forms = [token.text for token in tokens]
    return dict(zip(surface_forms, flat_lemmas))

In [None]:
def lemmatize(tree, tokens, lemmas):
    """Return the lemmas of all tokens lying inside ``tree``'s span.

    A token is inside when ``tree.start <= token.begin`` and
    ``token.end <= tree.end``. ``lemmas`` maps token text to lemma; a token
    text missing from it raises ``KeyError``.
    """
    return [
        lemmas[token.text]
        for token in tokens
        if tree.start <= token.begin and token.end <= tree.end
    ]

In [None]:
def rule_1(tree, tokens, lemmas):
    """Return True when ``tree.relation`` is ``'attribution'``.

    ``tokens`` and ``lemmas`` are unused; they keep the signature uniform
    with the other rules.
    """
    is_attribution = tree.relation == 'attribution'
    return is_attribution


def rule_2(tree, tokens, lemmas):
    """Return True for an ``'elaboration'`` node whose span contains the lemma 'который'.

    ``lemmas`` is the token-text -> lemma mapping passed on to ``lemmatize``.
    """
    if tree.relation != 'elaboration':
        return False
    return 'который' in lemmatize(tree, tokens, lemmas)


def rule_3(tree, tokens, lemmas):
    """Return True for a non-elementary node whose span contains a lemma from ``c_verbs``.

    ``lemmas`` is the token-text -> lemma mapping passed on to ``lemmatize``.
    """
    if tree.relation == 'elementary':
        return False
    span_lemmas = lemmatize(tree, tokens, lemmas)
    return bool(c_verbs & set(span_lemmas))

# All available rules, in the order used when subsets are enumerated.
rules = [
    rule_1,
    rule_2,
    rule_3,
]

In [None]:
def conditions_failed(tree, rules, tokens, lemmas):
    """Return True as soon as any rule in ``rules`` fires for ``tree``.

    Each rule is called as ``rule(tree, tokens, lemmas)``; rules after the
    first one that fires are not evaluated. An empty ``rules`` gives False.
    """
    return any(rule(tree, tokens, lemmas) for rule in rules)

In [None]:
def corrected(tree, rules, tokens, lemmas):
    """Return ``tree`` unchanged, or ``delete_relation(tree)`` if any rule fires for it."""
    if not conditions_failed(tree, rules, tokens, lemmas):
        return tree
    return delete_relation(tree)

In [None]:
def delete_relation(tree):
    """Dissolve the relation at ``tree`` and return the child that replaces it.

    With nuclearity ``'SN'`` the left child is the unit folded away (``attr``)
    and ``tree.right`` is returned; with any other nuclearity the right child
    is folded away and ``tree.left`` is returned. ``attr`` is relabelled as
    ``'elementary'`` and combined with the nearest leaf ``n`` of the kept
    child (``left_n_leaf`` / ``right_n_leaf``):

    * gap between the two spans larger than one character: ``n`` becomes a
      ``'same-unit'`` / ``'NN'`` node whose children are ``attr`` and a deep
      copy of the former leaf;
    * otherwise ``n`` absorbs ``attr``: its span is extended and the texts
      are concatenated.

    The nodes are modified in place.
    """
    if tree.nuclearity == 'SN':
        attr = tree.left
        attr.relation = 'elementary'
        attr.nuclearity = 'SN'
        n = left_n_leaf(tree.right)
        if (n.start - attr.end) > 1:
            # NOTE(review): n and n_copy keep the same id, and n's own
            # start/end/text are not widened to cover attr -- confirm intended.
            n_copy = copy.deepcopy(n)
            n.right = n_copy
            n.left = attr
            n.relation = 'same-unit'
            n.nuclearity = 'NN'
        else:
            n.start = attr.start
            n.text = attr.text + n.text
        return tree.right

    attr = tree.right
    attr.relation = 'elementary'
    attr.nuclearity = 'NS'
    n = right_n_leaf(tree.left)
    if (attr.start - n.end) > 1:
        n_copy = copy.deepcopy(n)
        n.left = n_copy
        n.right = attr
        # Same labelling as the 'SN' branch. Previously the relation was set
        # to 'same-unit_NN' and the nuclearity was left untouched, so the
        # exported relname did not match the declared 'same-unit' relation.
        n.relation = 'same-unit'
        n.nuclearity = 'NN'
    else:
        n.end = attr.end
        n.text += attr.text
    return tree.left

In [None]:
def left_n_leaf(tree):
    """Descend from ``tree`` to a leaf and return it.

    At each node that has a left child, step to the right child when the
    nuclearity is ``'SN'`` and to the left child otherwise. A node without a
    left child is returned as is.
    """
    node = tree
    while node.left:
        node = node.right if node.nuclearity == 'SN' else node.left
    return node

In [None]:
def right_n_leaf(tree):
    """Descend from ``tree`` to a leaf and return it.

    At each node that has a right child, step to the left child when the
    nuclearity is ``'NS'`` and to the right child otherwise. A node without a
    right child is returned as is.
    """
    node = tree
    while node.right:
        node = node.left if node.nuclearity == 'NS' else node.right
    return node

In [None]:
def to_rs3(tree, filename):
    """Write ``tree`` to ``filename`` using a default-configured ``Exporter``."""
    Exporter()(tree, filename)

In [None]:
def segmentation(tree, text, segments, rules, tokens, lemmas):
    """Walk ``tree`` and append the text of every elementary node to ``segments``.

    Before descending, both children of a non-elementary node are passed
    through ``corrected`` and reassigned, so the tree is modified in place.
    Segment text is the slice ``text[tree.start:tree.end]``.
    """
    if tree.relation == 'elementary':
        segments.append(text[tree.start:tree.end])
        return

    # Replace both children first, then descend left before right.
    tree.left = corrected(tree.left, rules, tokens, lemmas)
    tree.right = corrected(tree.right, rules, tokens, lemmas)
    segmentation(tree.left, text, segments, rules, tokens, lemmas)
    segmentation(tree.right, text, segments, rules, tokens, lemmas)

In [None]:
def texts_segmentation(rules, filename):
    """Segment every entry of the module-level ``results`` with ``rules``.

    For each key the first tree is exported unchanged as
    ``raw_tree <key>.rs3``, rewritten by ``segmentation`` and exported again
    as ``tree <filename>_<key>.rs3``. The collected segment texts are returned
    as one flat list, with a ``'\\n'`` entry closing each text.

    Args:
        rules: rule functions passed on to ``segmentation``.
        filename: label inserted into the name of the rewritten tree file.

    Returns:
        List of segment strings for all texts.
    """
    segments = []
    for key in results:
        record = results[key]
        stem = key.split('.')[0]
        # segmentation() rewrites the tree in place, and this function is
        # called once per rule subset. Work on a private copy so every call
        # starts from the original parse instead of the output of the
        # previous one (which also kept the "raw" export from being raw).
        tree = copy.deepcopy(record['rst'][0])
        to_rs3(tree, r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде деревья\raw_tree ' + stem + '.rs3')
        text = record['text']
        tokens = record['tokens']
        lemmas = match_lemmas(tokens, record['lemma'])
        segmentation(tree, text, segments, rules, tokens, lemmas)
        to_rs3(tree, r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде деревья\tree ' + filename + '_' + stem + '.rs3')
        # HACK: for texts b021, b023 and b050 the auto-segmenter for some
        # reason returned two trees instead of one, so the second tree (a
        # single EDU) is appended here. Presumably a glitch that should not
        # occur for other texts.
        if len(record['rst']) > 1:
            segments.append(record['rst'][1].text)
        segments.append('\n')
    return segments

In [None]:
# Enumerate every non-empty subset of rule indices. Each index doubles the
# list by appending itself to a copy of every subset built so far; the
# leading empty subset is dropped at the end.
new_array = range(len(rules))
power_set = [[]]
for x in new_array:
    power_set += [subset + [x] for subset in power_set]
power_set = power_set[1:]

In [None]:
%%time
# Run the segmentation once per rule subset and save each result to its own
# Excel file named after the rules involved.
for s in power_set:
    current_rules = [rules[i] for i in s]
    print(current_rules)
    # Build the file stem from the function names directly instead of parsing
    # the repr of the list ("<function rule_1 at 0x...>") for words that
    # contain 'rule'.
    filename = '__'.join(rule.__name__ for rule in current_rules)
    segments = texts_segmentation(current_rules, filename)
    filename += '.xlsx'
    print(filename + ' сегментирован')
    exls = pd.DataFrame(segments)
    exls.to_excel(filename, index = False)
    print(filename + ' записан')

In [None]:
stop

## Дальше идёт черновик

In [None]:
from isanlp.annotation_rst import ForestExporter
exporter = ForestExporter(encoding='utf8')
exporter(results[list(results.keys())[0]]['rst'][0], 'testtesttest.rs3')

In [None]:
results[list(results.keys())[0]]['rst'][0].to_rst('testtesttest.rs3')

In [None]:
results['ar_result_micro_b001.pickle']['rst'][0]

In [None]:
e = Exporter()

In [None]:
e.get_groups_and_edus(results['ar_result_micro_b001.pickle']['rst'][0])

In [None]:
#b4,b15,,b21,b23,b50

In [None]:
results['ar_result_micro_b004.pickle']['text']

In [None]:
print(results['ar_result_micro_b015.pickle']['rst'][0].right.left)

In [None]:
# d = match_lemmas(results['ar_result_micro_k031.pickle']['tokens'], results['ar_result_micro_k031.pickle']['lemma'])

In [None]:
results['ar_result_micro_k031.pickle']

In [None]:
t = results['ar_result_micro_k031.pickle']['rst'][0]

In [None]:
print(t.right)

In [None]:
lemmatize(t.left, results['ar_result_micro_k031.pickle']['tokens'], d)

In [None]:
# NOTE(review): unfinished draft -- this function does not parse as written:
# the innermost `else:` has no body and the final `for` line has no colon or
# body. Presumably it was meant to align two columns of segments by inserting
# 'None' placeholders where they differ -- TODO confirm intent or delete.
def nones(data, col1, col2):
    l1 = list(data[col1])
    l2 = list(data[col2])
    l1_new = []
    l2_new = []
    # NOTE(review): len() is called without an argument.
    for i in range(len()):
        current_1 = l1[i]
        current_2 = l2[i]
        if current_1 == current_2:
            l1_new.append(current_1)
            l2_new.append(current_2)
        else:
            # NOTE(review): `l1 in l2` tests whether the whole list l1 is an
            # element of l2; presumably current_1 / current_2 were meant.
            if l1 in l2:
                l1_new.append(current_1)
                l2_new.append(current_2)
                l2_new.append('None')
            elif l2 in l1:
                l1_new.append(current_1)
                l1_new.append('None')
                l2_new.append(current_2)
            else:
                
            
    # NOTE(review): `df` is not defined inside this function.
    df.head()
    for segment in segment 

In [None]:
erer

In [None]:
%%time
m = Mystem()
segments = texts_segmentation(rules[:2])

In [None]:
exls = pd.DataFrame(segments)
exls.to_excel('правила_v2_attr и elab.xlsx', index = False)

In [None]:
exls.head(20)

In [None]:
all_variants = []
for i in range(1, len(rules)+1):
    for j in range(i):
        for 

In [None]:
erer

In [None]:
#segments = []
#segmentation(results['ar_result_micro_b006.pickle']['rst'][0], results['ar_result_micro_b006.pickle']['text'], segments)

In [None]:
segments

In [None]:
texts['ar_micro_b006.pickle']

In [None]:
a = trees['ar_tree_micro_b006.pickle'].right.start
b = trees['ar_tree_micro_b006.pickle'].left.end
print(a, b)

In [None]:
t = trees['ar_tree_micro_b006.pickle']
print(left_n_leaf(t.left))

In [None]:
trees['ar_tree_micro_b006.pickle'].end

In [None]:
res = results['ar_result_micro_b006.pickle']
res['syntax_dep_tree'][2][5].link_name
res

In [None]:
lemmas = ['']
for i in range(len(res['sentenses']))

In [None]:
res['tokens'][0]

In [None]:
df = pd.DataFrame(res)

In [None]:
# Draft copy of the loading cell: unpickle every file in the data directory
# and sort the objects by file name ('tree' -> trees, else 'result' ->
# results, else texts).
# NOTE: pickle.load executes arbitrary code from the file; only use trusted data.
texts, trees, results = {}, {}, {}
names = os.listdir(r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде')
for name in names:
    with open(r'C:\Users\Иннокентий\Documents\Проект_НИС_магистратура_первый курс\русский авторазметчик эде' + '\\' + name, 'br') as f:
        obj = pickle.load(f)
    if 'tree' in name:
        trees[name] = obj
    elif 'result' in name:
        results[name] = obj
    else:
        texts[name] = obj

In [None]:
def segmentation(tree):
    """Draft version: append the ``text`` of every elementary node to the global ``segments``.

    Non-elementary nodes are traversed left child first, then right child.
    """
    if tree.relation == 'elementary':
        segments.append(tree.text)
        return
    segmentation(tree.left)
    segmentation(tree.right)

In [None]:
for key in results:
    

In [None]:
res = results[list(results.keys())[0]]

In [None]:
segmentation(res['rst'][0])

In [None]:
segments