### Вопросы на будущее

Насколько сложной может быть структура (насколько сложны и разнообразны документы)?

Как оценивать вместе с номером уровня заголовка?


## Извлечение текста из pdf
https://www.severcart.ru/blog/all/tesseract_ocr_python/

In [None]:
from pdf2text import pdf2text

## Классификация заголовков

In [1]:
import re
# Numbered-list marker such as "1 ", "1." or "2.3.1)": digits, optional
# ".digits" groups, then exactly one non-digit character.
RE_LIST = re.compile(r'\d+(\.\d+)*\D') # for a separate list type
# Any of the Russian structural keywords (section, subsection, chapter,
# paragraph, part, article).
RE_HEADER = re.compile(r'Раздел|Подраздел|Глава|Параграф|Секция|Часть|Статья')

In [2]:
import pytesseract

def mean_bbox_size(img):
    """
    Average size of the text-line bounding boxes Tesseract finds in an image.

    img - image object accepted by pytesseract.image_to_data
          (OCR runs with lang='rus+eng')

    returns (mean_height, mean_width) in the units Tesseract reports
    (presumably pixels - confirm); (0.0, 0.0) when no text line is detected
    """
    d = pytesseract.image_to_data(img, lang='rus+eng',
                                  output_type=pytesseract.Output.DICT)

    # level 4 marks the bounding box of a text line
    line_sizes = [(height, width)
                  for level, height, width
                  in zip(d['level'], d['height'], d['width'])
                  if level == 4]
    if not line_sizes:
        # a page without recognised text used to raise ZeroDivisionError
        return (0.0, 0.0)

    num_lines = len(line_sizes)
    total_height = sum(height for height, _ in line_sizes)
    total_width = sum(width for _, width in line_sizes)
    return (total_height / num_lines, total_width / num_lines)

In [11]:
import cv2

class AddImgFeatures:
    """
    Stateless transformer that turns line bounding boxes into geometric
    features normalized by the size of the page image.

    path - directory holding the page images referenced by elem['name']
           (defaults to 'docs', the location that used to be hard-coded)
    """
    def __init__(self, path='docs'):
        self.path = path

    def fit(self, X=None, y=None):
        """
        Nothing is learned. Accepts (X, y) and returns self so that the
        scikit-learn fit(X, y) calling convention works.
        """
        return self

    def transform(self, X):
        """
        X - iterable of dicts {"text": "", "bbox": [], "name": ""};
            bbox is read as [left, top, width, height] - TODO confirm

        returns one row per element:
        [left / img_width, width / img_width, height / img_height,
         mean line height / img_height, mean line width / img_width]

        raises FileNotFoundError when an image cannot be read
        """
        # per image name: [norm. mean height, norm. mean width,
        #                  image height, image width]
        # OCR is expensive, so every image is processed only once per call
        page_stats = {}
        features = []
        for elem in X:
            name = elem['name']
            if name not in page_stats:
                img_path = self.path + '/' + name
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None
                    raise FileNotFoundError('cannot read image: ' + img_path)
                img_height, img_width = img.shape[0], img.shape[1]
                mean_height, mean_width = mean_bbox_size(img)
                page_stats[name] = [mean_height / img_height,
                                    mean_width / img_width,
                                    img_height, img_width]
            norm_mean_height, norm_mean_width, h0, w0 = page_stats[name]
            bbox = elem['bbox']
            # the normalized top (bbox[1] / h0) is deliberately left out
            features.append([bbox[0] / w0,
                             bbox[2] / w0, bbox[3] / h0,
                             norm_mean_height,
                             norm_mean_width])
        return features

    def fit_transform(self, X, y=None):
        """Equivalent to transform(X); y is ignored."""
        return self.transform(X)

In [4]:
class AddFeatures:
    """
    Stateless transformer flagging lines that start with a numbered-list
    marker (RE_LIST) or with a structural keyword (RE_HEADER).
    """
    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """
        Nothing is learned. Accepts (X, y) and returns self so that the
        scikit-learn fit(X, y) calling convention works.
        """
        return self

    def transform(self, X):
        """
        X - iterable of dicts {"text": "", "bbox": [], "name": ""}

        returns one [is_list, is_header] row per element; the list check
        takes precedence, so a row is never [1, 1]
        """
        features = []
        for elem in X:
            line = elem['text']
            # match() anchors at position 0: same result as search() followed
            # by a start() == 0 check, without scanning the rest of the line
            if RE_LIST.match(line):
                features.append([1, 0])
            elif RE_HEADER.match(line):
                features.append([0, 1])
            else:
                features.append([0, 0])
        return features

    def fit_transform(self, X, y=None):
        """Equivalent to transform(X); y is ignored."""
        return self.transform(X)

In [5]:
class string2features:
    """
    Stateless transformer that keeps only the leading one or two words of
    each line, producing the strings a text vectorizer can consume.
    """
    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """
        Nothing is learned. Accepts (X, y) and returns self so that the
        scikit-learn fit(X, y) calling convention works.
        """
        return self

    def predict(self):
        """Placeholder; does nothing."""
        pass

    def fit_transform(self, X, y=None):
        """Equivalent to transform(X); y is optional and ignored."""
        return self.transform(X)

    def transform(self, X):
        """
        X - iterable of dicts with a "text" key (the other keys are not read)

        returns a list of strings: the first two whitespace-separated words
        joined by a single space, the only word if there is one, or ''
        for a blank line
        """
        return [' '.join(elem['text'].split()[:2]) for elem in X]

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [6]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
#from string2features import string2features

# Training pipeline: three feature branches are concatenated by FeatureUnion
# and passed to a logistic-regression classifier.
#   'aif' - AddImgFeatures
#   'af'  - AddFeatures
#   'cv'  - string2features output counted by CountVectorizer; the token
#           pattern also keeps one-character tokens
ppl = make_pipeline(FeatureUnion([('aif', AddImgFeatures()), 
                                  ('af', AddFeatures()),
                                  ('cv', make_pipeline(
                                      string2features(),
                                      CountVectorizer(token_pattern=r'(?u)\b\w+\b')))]), 
                     LogisticRegression())

Обучаем модель:
логистическая регрессия, для каждой строки - тип строки + уровень???

как работать с уровнем? строка-название + строка-уровень

In [None]:
import json
# JSON text is UTF-8 by specification; the explicit encoding keeps Cyrillic
# content readable regardless of the platform's default locale encoding.
with open("file_train2.json", "r", encoding="utf-8") as read_file:
    doc_with_labels = json.load(read_file)

In [None]:
# Smoke test: run both hand-written transformers on the first labelled line only.
x1 = [doc_with_labels[0]]
aif = AddImgFeatures()
print(aif.transform(x1))
af = AddFeatures()
print(af.transform(x1))

Обучаем и сохраняем обученную модель

In [None]:
import pickle as pkl
# NOTE(review): X and y are only assigned in the cross-validation cell further
# down (from file_with_labels2.json) - confirm which data this fit should use.
clf = ppl.fit(X, y)
# the context manager closes the file even if pickling fails
with open("model.pkl", "wb") as model_file:
    pkl.dump(clf, model_file)

Кросс-валидация

In [8]:
import json
# JSON text is UTF-8 by specification; the explicit encoding keeps Cyrillic
# content readable regardless of the platform's default locale encoding.
with open("file_with_labels2.json", "r", encoding="utf-8") as read_file:
    doc_test = json.load(read_file)

# the file is only needed for parsing; split the records into labels and inputs
y = [x["label"] for x in doc_test]
X = [{"text": x["text"], "bbox": x["bbox"], "name": x["name"]} for x in doc_test]

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
# Same feature union as the training pipeline, classified with gradient boosting.
clf = make_pipeline(FeatureUnion([('aif', AddImgFeatures()),
                                  ('af', AddFeatures()),
                                  ('cv', make_pipeline(
                                      string2features(),
                                      CountVectorizer(token_pattern=r'(?u)\b\w+\b')))]), 
                     GradientBoostingClassifier())
# 3-fold cross-validation scored with macro-averaged F1
scores = cross_validate(clf, X, y, cv=3, scoring='f1_macro')
# bare expression: displays the result dict as the notebook cell output
scores

{'fit_time': array([28.79997182, 30.72871709, 26.89271808]),
 'score_time': array([16.05080819, 22.01762104, 13.94769192]),
 'test_score': array([0.65693608, 0.85328133, 0.77057057])}

без aif 0.37956220744365304

с aif 0.37134152944319837

с нормализованной aif 0.3825286683816205

GradientBoostingClassifier с нормализованной aif: 0.7602626622815247 (было 0.3825286683816205)

без aif 0.41892482779902956

In [13]:
import numpy as np
# mean of the per-fold test scores returned by cross_validate
score = np.mean(scores['test_score'])
# bare expression: displays the value as the notebook cell output
score

0.7602626622815247

1) размечалка + манифест +

2) признаки на основе предыдущих (следующих) строк ?

3) CountVectorizer по первому и второму словам +

4) признак - средний размер ограничивающей рамки (bounding box) +

5) gboost +, randomforest (1000 trees) ?