
1) размечалка + манифест +

2) признаки на основе предыдущих (следующих) строк +

3) countvectorizer по первому второму словам +

4) признак - средний размер баундин бокса +

5) gboost +, xgboost +

6) жирность +

7) кросс-валидация по группам +

8) посмотреть, где ошибается классификатор

In [1]:
import re
import cv2
import json
import numpy as np
from functools import cmp_to_key
from functools import reduce
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, cohen_kappa_score

In [3]:
reg_expr = [re.compile(r'\d+(\.\d+)+\D'), # 1.1.1
            re.compile(r'\d\)'), # 1)
            re.compile(r'\w(\.\w)+\W'), # b.b.b
            re.compile(r'[а-яА-Я\d](\.[а-яА-Я\d])+\W'), # б.б.б
            re.compile(r'[a-zA-Z]\)'), # a)
            re.compile(r'[a-zA-Z]\.\W'), # b.
            re.compile(r'[а-яА-Я]\)'), # б)
            re.compile(r'[а-яА-Я]\.\W'), # б.
            re.compile(r'\-'), # -
            re.compile(r'\*'), # *
            re.compile(r'Раздел|Подраздел|Глава|Параграф|Секция|Часть|Статья')]

def add_reg_features(line):
    features = [0] * len(reg_expr)
    i = 0
    for expr in reg_expr:
        match = expr.search(line)
    
        if match:
            if match.start() == 0:
                features[i] = 1
                return features
        i += 1
        
    return features

In [5]:
def mean_bbox_size(bboxes):
    """
    bboxes - list [{"text": "", "bbox": []}, {} ...]
    returns (mean_height, mean_width)
    """
    heigths = np.sum(list(map(lambda x: x['bbox'][2], bboxes)))
    widths = np.sum(list(map(lambda x: x['bbox'][3], bboxes)))
    num_bboxes = len(bboxes)
    
    return (heigths / num_bboxes, widths / num_bboxes)

In [6]:
# add simple font-weight

def mean_color(bbox, img):
    x, y, w, h = bbox
    crop_img = img[y:y + h, x:x + w]
    avg_color_per_row = np.average(crop_img, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)
    return list(avg_color)

In [7]:
class AddImgFeatures:
    def __init__(self):
        pass
    
    def fit(self):
        pass
    
    def transform(self, X):
        """
        X - [{"name": "doc_name", "entities": [{"text": "", "bbox": []}]}]
        returns features [normalized bbox sizes, normalized mean size of bbox]
        [normalized left, normalized top,
        normalized width, normalized height,
        mean height, mean width]
        """
        features = []
        for doc in X:
            doc_features = []
            doc_name = doc['name']
            doc_info = doc['entities']
            mean_heigth, mean_width = mean_bbox_size(doc_info)
            img = cv2.imread('docs/' + doc_name)
            heigth = img.shape[0]
            width = img.shape[1]
            for line_info in doc_info:
                line_features = [line_info['bbox'][0] / width,
                                line_info['bbox'][1] / heigth,
                                line_info['bbox'][2] / width,
                                line_info['bbox'][3] / heigth,
                                mean_heigth, mean_width]
                line_features += add_reg_features(line_info['text'])
                line_features += mean_color(line_info['bbox'], img)
                doc_features.append(line_features)
            features.append(doc_features)
        return features
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [8]:
def add_prev_next_features(doc, line_features, num_line):
    """
    doc - list of line_features
    line_features - list of features
    """
    add_f = [0] * len(line_features)
    extended_doc = [add_f] * 4 + doc + [add_f] * 4
    return reduce(lambda x, y: x + y, extended_doc[num_line: num_line + 9])

In [13]:
class AddPrevNextFeatures:
    def __init__(self):
        pass
    
    def fit(self):
        pass
    
    def transform(self, X):
        """
        X - list of doc_features
        doc_features - list of line_features
        
        for each line 4 previous and 4 next features added
        result list of lines features
        """
        result = []
        for doc_features in X:
            i = 0
            for line_features in doc_features:
                new_features = add_prev_next_features(doc_features, line_features, i)
                i += 1
                result.append(new_features)
        return np.array(result)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

https://neurohive.io/ru/osnovy-data-science/gradientyj-busting/

https://xgboost.readthedocs.io/en/latest/tutorials/index.html

In [10]:
def str2num(string):
    if string == "text":
        return 3
    if string == "other":
        return 4
    if string == "list":
        return 2
    return 1

with open("data.json", "r") as read_file:
    docs = json.load(read_file)
    y = []
    for doc in docs:
        elem = [str2num(line['label']) for line in doc['entities']]
        y.append(elem)
    X = []
    for doc in docs:
        elem = {}
        elem['name'] = doc['name']
        elem['entities'] = [{'text': line_info['text'], 
                        'bbox': [line_info['x'], line_info['y'],
                                line_info['width'], line_info['height']]}
                      for line_info in doc['entities']]
        X.append(elem)
    X = np.array(X)
    y = np.array(y)

In [11]:
clf = make_pipeline(AddImgFeatures(),
                    AddPrevNextFeatures(),
                    GradientBoostingClassifier())

scores = []

kf = KFold(n_splits=3)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train = reduce(lambda x1, x2: x1 + x2, y[train_index])
    y_test = reduce(lambda x1, x2: x1 + x2, y[test_index])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))

print(scores)
print(np.mean(scores))

[0.7752960550486914, 0.8051448758674601, 0.7886090243312409]
0.7896833184157975


Gradient Boosting 3 folds 0.7896833184157975

In [19]:
from xgboost import XGBClassifier

clf = make_pipeline(AddImgFeatures(),
                    AddPrevNextFeatures(),
                    XGBClassifier())

scores = []

kf = KFold(n_splits=2)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train = np.array(reduce(lambda x1, x2: x1 + x2, y[train_index]))
    y_test = np.array(reduce(lambda x1, x2: x1 + x2, y[test_index]))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))

print(scores)
print(np.mean(scores))

[0.8192133684222185, 0.7794326745590715]
0.799323021490645


https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [17]:
# scores = []

# param = {
#    'max_depth': 3,
#    'eta': 0.3, 
#    'silent': 1, 
#    'objective': 'multi:softprob',
#    'num_class': 4}
# num_round = 20

# kf = KFold(n_splits=3)
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train = reduce(lambda x1, x2: x1 + x2, y[train_index])
#     y_test = reduce(lambda x1, x2: x1 + x2, y[test_index])
    
#     aif = AddImgFeatures()
#     af = AddPrevNextFeatures()
    
#     X_train = aif.fit_transform(X_train)
#     X_train = np.array(af.fit_transform(X_train))   
#     X_test = aif.fit_transform(X_test)
#     X_test = np.array(af.fit_transform(X_test))
    
#     dtrain = xgb.DMatrix(X_train, label=y_train)
#     dtest = xgb.DMatrix(X_test, label=y_test)
    
#     bst = xgb.train(param, dtrain, num_round)
#     preds = bst.predict(dtest)
    
#     y_pred = np.asarray([np.argmax(line) for line in preds])
#     scores.append(f1_score(y_test, y_pred, average='macro'))

# print(scores)
# print(np.mean(scores))