### Вопросы на будущее

Как оценивать вместе с номером уровня заголовка?

1) размечалка + манифест +

2) признаки на основе предыдущих (следующих) строк ?

3) CountVectorizer по первому и второму словам строки +

4) признак - средний размер bounding box'а +

5) gboost +, randomforest (1000 trees) ?

6) жирность шрифта

7) кросс-валидация по группам +

In [1]:
import re
import pytesseract
import cv2
import json
import numpy as np
from functools import cmp_to_key
from functools import reduce
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, cohen_kappa_score

## Классификация заголовков

In [2]:
def mean_bbox_size(bboxes):
    """
    Average bounding-box size over the given lines.

    bboxes - list [{"text": "", "bbox": [x, y, width, height]}, {} ...]
    returns (mean_height, mean_width); (0.0, 0.0) when bboxes is empty
    """
    if not bboxes:
        # Nothing to average; avoid dividing by zero.
        return (0.0, 0.0)

    # bbox is laid out as [x, y, width, height]:
    # index 3 holds the height, index 2 holds the width.
    mean_height = np.mean([entry['bbox'][3] for entry in bboxes])
    mean_width = np.mean([entry['bbox'][2] for entry in bboxes])

    return (mean_height, mean_width)

In [3]:
class AddImgFeatures:
    """
    Stateless transformer that turns every line of every document into
    geometric features derived from its bounding box and the page image.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """
        Nothing is learned; X and y are accepted only so the object can be
        called as fit(X, y) and chained.
        returns self
        """
        return self

    def transform(self, X):
        """
        X - [{"name": "doc_name", "entities": [{"text": "", "bbox": []}]}]
        returns one feature row per line, in document order:
        [normalized left, normalized top,
        normalized width, normalized height,
        mean height, mean width]
        The first four values are divided by the image width / height read
        from 'docs/<doc_name>'; the two means are passed through exactly as
        mean_bbox_size returns them (not divided by the image size).
        raises FileNotFoundError if the page image cannot be read
        """
        features = []
        for doc in X:
            doc_name = doc['name']
            doc_info = doc['entities']
            if not doc_info:
                # No lines -> no rows; skip the image read entirely.
                continue
            mean_heigth, mean_width = mean_bbox_size(doc_info)
            img_path = 'docs/' + doc_name
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                raise FileNotFoundError('cannot read image: ' + img_path)
            heigth, width = img.shape[0], img.shape[1]
            for line_info in doc_info:
                left, top, box_width, box_heigth = line_info['bbox'][:4]
                features.append([left / width,
                                 top / heigth,
                                 box_width / width,
                                 box_heigth / heigth,
                                 mean_heigth, mean_width])
        return features

    def fit_transform(self, X, y=None):
        """Same as transform(X); y is ignored."""
        return self.transform(X)

In [4]:
# Numbered-list marker: digits, optional ".digits" groups, then one
# non-digit character (e.g. "1." or "2.3 ").
RE_LIST = re.compile(r'\d+(\.\d+)*\D')
# Russian structural keywords: Section, Subsection, Chapter, Paragraph,
# Section (alternative word), Part, Article.
RE_HEADER = re.compile(r'Раздел|Подраздел|Глава|Параграф|Секция|Часть|Статья')

In [5]:
class AddRegFeatures:
    """
    Stateless transformer that flags lines starting with a list number or
    with a structural keyword, using the RE_LIST and RE_HEADER patterns.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """
        Nothing is learned; X and y are accepted only so the object can be
        called as fit(X, y) and chained.
        returns self
        """
        return self

    def transform(self, X):
        """
        X - [{"name": "doc_name", "entities": [{"text": "", "bbox": []}]}]
        returns features: [list, header] 1 - yes, 0 - no
        One row per line, in document order. The list check wins when both
        patterns match at the start of the line.
        """
        features = []
        for doc in X:
            for line_info in doc['entities']:
                line = line_info['text']
                # match() anchors at position 0, which is what the feature
                # means: the pattern must open the line.
                if RE_LIST.match(line):
                    features.append([1, 0])
                elif RE_HEADER.match(line):
                    features.append([0, 1])
                else:
                    features.append([0, 0])
        return features

    def fit_transform(self, X, y=None):
        """Same as transform(X); y is ignored."""
        return self.transform(X)

In [6]:
class string2features:
    """
    Stateless transformer that reduces every line to its first one or two
    whitespace-separated words, producing strings for a text vectorizer.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """
        Nothing is learned; X and y are accepted only so the object can be
        called as fit(X, y) and chained.
        returns self
        """
        return self

    def predict(self):
        """Placeholder; does nothing."""
        pass

    def fit_transform(self, X, y=None):
        """Same as transform(X); y is ignored."""
        return self.transform(X)

    def transform(self, X):
        """
        X - [{"name": "doc_name", "entities": [{"text": "", "bbox": []}]}]
        returns: list of first 1-2 words of each line
        (two words joined by a single space, one word as is,
        empty string for a blank line)
        """
        # split()[:2] yields 0, 1 or 2 words; join covers all three cases.
        return [' '.join(line_info['text'].split()[:2])
                for doc in X
                for line_info in doc['entities']]

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

https://neurohive.io/ru/osnovy-data-science/gradientyj-busting/

In [10]:
# Load the annotated dataset: a list of documents, each with a "name" and a
# list of "entities" (lines) carrying text, label and bounding-box fields.
with open("data.json", "r", encoding="utf-8") as read_file:
    docs = json.load(read_file)

# Per-document label lists, and per-document inputs with the bbox packed as
# [x, y, width, height].
labels_per_doc = [[line['label'] for line in doc['entities']] for doc in docs]
inputs_per_doc = [
    {'name': doc['name'],
     'entities': [{'text': line_info['text'],
                   'bbox': [line_info['x'], line_info['y'],
                            line_info['width'], line_info['height']]}
                  for line_info in doc['entities']]}
    for doc in docs
]

# Build 1-D object arrays element by element so that X[i] / y[i] are always
# "document i", indexable with the fold index arrays. np.array() on the label
# lists would fail on ragged input in recent NumPy, or silently produce a
# 2-D array when all documents happen to have the same number of lines.
X = np.empty(len(docs), dtype=object)
y = np.empty(len(docs), dtype=object)
for doc_idx, (doc_input, doc_labels) in enumerate(zip(inputs_per_doc,
                                                       labels_per_doc)):
    X[doc_idx] = doc_input
    y[doc_idx] = doc_labels

In [11]:
# Feature union of image geometry, regex flags and a bag of the first words
# of each line, fed into gradient boosting.
clf = make_pipeline(
    FeatureUnion([
        ('aif', AddImgFeatures()),
        ('af', AddRegFeatures()),
        ('cv', make_pipeline(
            string2features(),
            CountVectorizer(token_pattern=r'(?u)\b\w+\b'))),
    ]),
    GradientBoostingClassifier())

scores = []

# Folds are split over documents (X holds one entry per document), so lines
# of one document never end up in both train and test.
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # The transformers emit one row per line, so the per-document label
    # lists are flattened in the same document order.
    y_train = [label for doc_labels in y[train_index] for label in doc_labels]
    y_test = [label for doc_labels in y[test_index] for label in doc_labels]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))

print(scores)
print(np.mean(scores))

[0.7641779581021316, 0.7624716061411333, 0.6801341912201196]
0.7355945851544615


LogisticRegression с 10 примерами 0.3825286683816205

GradientBoostingClassifier с 10 примерами 0.7602626622815247


GradientBoostingClassifier с 32 примерами на 3 фолдах 0.6904364633429961

GradientBoostingClassifier с 46 примерами на 3 фолдах 0.6863249684821809

с 142 примерами 0.7355945851544615

cohen_kappa_score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html

In [None]:
# Load the same material labelled independently by two annotators.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("file_with_labels2.json", "r", encoding="utf-8") as read_file:
    labeled_doc = json.load(read_file)
with open("file_with_labels2_ilya.json", "r", encoding="utf-8") as read_file:
    labeled_doc_ilya = json.load(read_file)

In [None]:
def cmp(x, y):
    """
    Comparator for labelled lines, for use with functools.cmp_to_key.

    x, y - dicts with "name" (document name) and "bbox" ([x, y, ...])
    Orders by document name, then by top coordinate (bbox[1]), then by
    left coordinate (bbox[0]) so lines at the same height get the same
    order in every file.
    returns -1, 0 or 1
    """
    key_x = (x['name'], x['bbox'][1], x['bbox'][0])
    key_y = (y['name'], y['bbox'][1], y['bbox'][0])
    # bool arithmetic: True - False == 1, False - True == -1, equal -> 0
    return (key_x > key_y) - (key_x < key_y)

In [None]:
# Sort the first annotator's entries in place with the cmp comparator
# (by document name, then by bbox[1]).
labeled_doc.sort(key=cmp_to_key(cmp))

In [None]:
# Notebook display of the first annotator's entries.
labeled_doc

In [None]:
# Sort the second annotator's entries in place with the same comparator.
labeled_doc_ilya.sort(key=cmp_to_key(cmp))

In [None]:
# Notebook display of the second annotator's entries.
labeled_doc_ilya

In [None]:
# Extract the "label" value of every entry, once per annotator, keeping the
# current order of each list.
labels1, labels2 = (
    [entry["label"] for entry in annotated]
    for annotated in (labeled_doc, labeled_doc_ilya)
)

In [None]:
# Cohen's kappa between the two label lists, computed over the label
# values 1-4 passed via the labels argument.
cohen_kappa_score(labels1, labels2, labels=[1, 2, 3, 4])

In [None]:
# Map (bbox as tuple, document name) of each first-annotator entry to the
# pair [label from labels1, label from labels2] at the same position.
d = {
    (tuple(entry['bbox']), entry['name']): [labels1[pos], labels2[pos]]
    for pos, entry in enumerate(labeled_doc)
}
d

In [None]:
# Draw a green rectangle around every line whose two labels differ (or that
# has only one label) and save the result over the image in different_docs/.
# Boxes are grouped per document so each image is read and written once.
disputed_boxes = {}
for (bbox, doc_name), label_pair in d.items():
    if (len(label_pair) == 1) or (len(label_pair) == 2
                                  and label_pair[0] != label_pair[1]):
        disputed_boxes.setdefault(doc_name, []).append(bbox)

for doc_name, boxes in disputed_boxes.items():
    img_path = 'different_docs/' + doc_name
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError('cannot read image: ' + img_path)
    # Distinct local names: unpacking into x / y would overwrite the
    # notebook-level y (labels) used by the cross-validation cell.
    for (left, top, box_w, box_h) in boxes:
        cv2.rectangle(img, (left, top), (left + box_w, top + box_h),
                      (0, 255, 0), 2)
    cv2.imwrite(img_path, img)