In [20]:
import os
import pickle
import sklearn_crfsuite
import collections
import glob
from deepmerge import always_merger
from IPython.display import HTML, display
import tabulate
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm
import io
import pandas as pd

In [2]:
%load_ext autoreload

In [3]:
%pwd

'C:\\Users\\User\\ML\\IBS'

In [4]:
os.chdir('d:/Projects/IT/IBS/ibs_web')

In [5]:
%pwd

'd:\\Projects\\IT\\IBS\\ibs_web'

In [6]:
from parsers.CrfPredictor import CrfPredictor
from parsers.DocxParser import DocxParser
from parsers.PullentiParser import PullentiParser
from parsers.FeaturesLabelsIterator import FeaturesLabelsIterator
from extractors.IbsAttrExtractor import IbsAttrExtractor
from utils import flatten

In [7]:
%autoreload 2

### Извлекаем факты как они есть

In [50]:
# Root folder with the source .docx corpus and the target Excel workbook.
input_path = 'd:/Projects/IT/IBS/data/input-599'
output_file = 'd:/Projects/IT/IBS/data/facts_export/facts-599.xlsx'
# glob yields matches in filesystem-dependent order; sorting keeps the file
# list (and therefore the row order of the exported sheets) reproducible.
docx_files = sorted(glob.glob(os.path.join(input_path, '**', '*.docx'), recursive=True))
print('Files: {}'.format(len(docx_files)))

Files: 602


In [49]:
docxParser = DocxParser()

# Collect plain records per tag and build every DataFrame exactly once.
# Growing a frame with DataFrame.append inside the loop copies the whole frame
# on each call (quadratic) and the method no longer exists in pandas >= 2.0.
records_by_tag = collections.defaultdict(list)

for docx_file in docx_files:
    docxParser.parse(docx_file)
    true_facts = docxParser.get_fact_values()
    # Path relative to the corpus root (the leading separator is kept).
    rel_path = docx_file[len(input_path):]
    for true_fact in true_facts:
        for tag, value in true_fact.items():
            records_by_tag[tag].append({
                'file': rel_path,
                'value': value,
            })

# One frame per tag, with columns 'file' and 'value' and a fresh 0..n-1 index.
datasets = {tag: pd.DataFrame(records) for tag, records in records_by_tag.items()}

In [51]:
# Write each per-tag dataframe to its own worksheet of a single workbook.
# The context manager saves and closes the file on exit (also when a sheet
# fails to write); the explicit writer.save() was removed in pandas 2.0.
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    for tag, dataset in datasets.items():
        dataset.to_excel(writer, sheet_name=tag)

### Функция извлечения фактов, модифицированная, чтобы доставать именованные сущности pullenti

In [62]:
def _join_span(src_words, morphs, start, end):
    """Join words ``[start, end)`` into one string.

    A word is replaced by its ``morphs`` entry when that entry contains both
    '(' and ')' (e.g. '(geo)', '(date)'); otherwise the source word is used.
    """
    return ' '.join(
        morphs[i] if '(' in morphs[i] and ')' in morphs[i] else src_words[i]
        for i in range(start, end)
    )


def _iter_fact_spans(word_count, labels, offset, marginals):
    """Yield ``(label, start, end, prob)`` for every labelled span.

    Inspects ``labels[offset : offset + word_count]``. A span opens on a label
    starting with 'B-' and stays open while the following labels equal the
    bare tag (the label without the 'B-' prefix). ``start``/``end`` are word
    positions relative to the sequence, ``end`` exclusive.
    """
    label = None
    start_idx = None
    prob = None

    for word_idx in range(word_count):
        current = labels[offset + word_idx]

        # Close the open span first, so that a 'B-' label directly following
        # another span does not silently discard the previous fact.
        if label is not None and label != current:
            yield label, start_idx, word_idx, prob
            label = None
            start_idx = None

        # Start tag (B-) opens a new span.
        if current.startswith('B-'):
            start_idx = word_idx
            label = current[2:]
            if marginals is not None:
                prob = marginals(current, offset + word_idx)

    # Reached the end of the sequence with a span still open.
    if label is not None:
        yield label, start_idx, word_count, prob


def get_fact_values(pullentiParser, labels, marginals, scan_text, scan_tables):
    """Extract fact values, substituting pullenti entity placeholders.

    Args:
        pullentiParser: object exposing ``src_text`` / ``morphs`` (parallel
            word sequences of the running text) and ``tables_src_text`` /
            ``tables_morphs`` (the same, nested as table -> row -> cell).
            Each morphs sequence is assumed to be as long as its source
            sequence.
        labels: flat sequence of labels, consumed one per word: first the
            running text (if ``scan_text``), then every non-empty table cell
            in table/row/cell order (if ``scan_tables``).
        marginals: optional callable ``marginals(label, label_idx)``; its
            result, taken at the span's 'B-' label, is passed on as the
            fact's probability. ``None`` leaves the probability as ``None``.
        scan_text: process the running text.
        scan_tables: process table cells.

    Returns:
        dict filled by ``FeaturesLabelsIterator.append_fact_value`` (imported
        from the project; its storage format is defined there).
    """
    fact_values = {}
    label_idx = 0

    if scan_text:
        # process text
        text_words = pullentiParser.src_text
        for label, start, end, prob in _iter_fact_spans(len(text_words), labels, label_idx, marginals):
            # Every span, including one that runs to the end of the text,
            # goes through the same placeholder substitution.
            value = _join_span(text_words, pullentiParser.morphs, start, end)
            FeaturesLabelsIterator.append_fact_value(fact_values, label, value, prob)
        label_idx += len(text_words)

    if scan_tables:
        # process tables
        for tbl_idx, tbl_src_text in enumerate(pullentiParser.tables_src_text):
            for row_idx, row_src_text in enumerate(tbl_src_text):
                for cell_idx, cell_src_text in enumerate(row_src_text):
                    # Empty cells carry no labels and do not advance label_idx.
                    if cell_src_text is None or len(cell_src_text) == 0:
                        continue

                    # Spans never cross a cell boundary: state restarts per cell.
                    for label, start, end, prob in _iter_fact_spans(len(cell_src_text), labels, label_idx, marginals):
                        morphs = pullentiParser.tables_morphs[tbl_idx][row_idx][cell_idx]
                        value = _join_span(cell_src_text, morphs, start, end)
                        FeaturesLabelsIterator.append_fact_value(fact_values, label, value, prob)
                    label_idx += len(cell_src_text)

    return fact_values

### Работаем с файлами

In [11]:
# Folder with the pre-parsed (pickled) documents.
input_path = 'd:/Projects/IT/IBS/data/pickle-input-data-599'
# glob order depends on the filesystem; sort so that slices such as
# pickle_files[:1] and the exported fact order are reproducible.
pickle_files = sorted(glob.glob(os.path.join(input_path, '**', '*.pickle'), recursive=True))
print('Files: {}'.format(len(pickle_files)))

Files: 599


In [63]:
pullentiParser = PullentiParser()

ks, goals, tasks = [], [], []

# Smoke test: run the extractor on the first pickled document only and show
# the raw result.
for pickle_file in tqdm(pickle_files[:1]):
    pullentiParser.load_from_pickle_file(pickle_file)
    # Text labels first, then the flattened table labels.
    all_labels = pullentiParser.labels + list(flatten(pullentiParser.tables_labels))
    facts = get_fact_values(
        pullentiParser,
        all_labels,
        marginals=None,
        scan_text=True,
        scan_tables=True,
    )
    print(facts)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

{'O1': [{'value': 'Администрации (geo)', 'prob': None}], 'D': [{'value': '(date)', 'prob': None}], 'R': [{'value': '(number)', 'prob': None}], 'O2': [{'value': 'отдел по делам молодежи и спорту Администрации (geo)', 'prob': None}], 'ND1': [{'value': 'муниципальная программа « Развитие физической культуры и спорта в (geo) » на (daterange)', 'prob': None}], 'G': [{'value': 'создание условий для укрепления здоровья населения Зонального района путем развития инфраструктуры спорта и приобщения различных слоев населения к регулярным занятиям физической культурой и спортом', 'prob': None}], 'T': [{'value': 'создание правовых , экономических , социальных и организационных условий для развития массовой физической культуры и спорта в (geo) ;', 'prob': None}, {'value': 'создание оптимальных условий для развития в (geo) детско - юношеского и массового спорта ;', 'prob': None}, {'value': 'формирование у населения навыков здорового образа жизни , воспитание осознанной потребности в физическом соверш

In [64]:
def get_fact_text(facts, tag):
    """Return the distinct fact texts stored under ``tag``.

    Args:
        facts: mapping ``tag -> list of dicts``, each dict having a 'value' key.
        tag: fact tag to look up.

    Returns:
        List of unique 'value' entries in first-seen order, or an empty list
        when ``tag`` is not present in ``facts``.
    """
    if tag not in facts:
        return []

    # dict.fromkeys removes duplicates while keeping insertion order; going
    # through set() gave a different string order from run to run, so the
    # exported files were not reproducible.
    return list(dict.fromkeys(fv['value'] for fv in facts[tag]))

In [73]:
pullentiParser = PullentiParser()

ks, goals, tasks = [], [], []
# Fact tag -> list that accumulates its texts across the whole corpus.
buckets = (('K', ks), ('G', goals), ('T', tasks))

for pickle_file in tqdm(pickle_files):
    pullentiParser.load_from_pickle_file(pickle_file)
    # Text labels first, then the flattened table labels.
    all_labels = pullentiParser.labels + list(flatten(pullentiParser.tables_labels))
    facts = get_fact_values(
        pullentiParser,
        all_labels,
        marginals=None,
        scan_text=True,
        scan_tables=True,
    )

    for fact_tag, bucket in buckets:
        bucket.extend(get_fact_text(facts, fact_tag))

HBox(children=(IntProgress(value=0, max=599), HTML(value='')))




### Пишем в файлы

In [74]:
out_path = 'd:/Projects/IT/IBS/data/facts_export'

# One UTF-8 text file per fact tag, one fact per line.
for file_name, lines in (('K.txt', ks), ('G.txt', goals), ('T.txt', tasks)):
    with io.open(os.path.join(out_path, file_name), 'w', encoding='utf8') as f:
        f.write("\n".join(lines))