In [1]:
from tz_features_extractor import TzTextFeatures
from document_parser import DOCXParser

In [2]:
# Absolute paths of the two sample DOCX files parsed in this cell.
names = [
    "/Users/anastasiabogatenkova/DOCXParser/data/1611135278_935.docx",
    "/Users/anastasiabogatenkova/DOCXParser/data/1611135281_498.docx",
]

# Parse every file with its own DOCXParser instance and collect the
# result of get_lines_with_meta() for each document, in input order.
docs = []
for name in names:
    docx_parser = DOCXParser()
    docx_parser.parse(name)
    lines_with_meta = docx_parser.get_lines_with_meta()
    docs.append(lines_with_meta)

# Show the first line of the first document as a quick sanity check.
print(docs[0][0])

{'text': 'VIII. ТЕХНИЧЕСКОЕ ЗАДАНИЕ', 'level': (2, 1), 'uid': 'cc37aac1532ff4f4b871b61ecbb136f9', 'type': 'style_header', 'annotations': [('indentation', 0, 25, '0'), ('alignment', 0, 25, 'center'), ('bold', 0, 25, 'True'), ('size', 0, 25, '12.0'), ('style', 0, 25, 'heading 1')]}


In [3]:
# Instantiate the feature extractor and run it over `docs`, the list of
# parsed documents built in the previous cell.
# NOTE(review): the type/shape of `features` is not visible here — confirm
# against TzTextFeatures.transform.
features_extractor = TzTextFeatures()
features = features_extractor.transform(docs)


In [2]:
import os
from typing import Optional


def skip_labels(label: str) -> Optional[str]:
    """Map the labels "other" and "footer" to None and keep every other label.

    :param label: label name to filter
    :return: None when the label is "other" or "footer", otherwise the label unchanged
    """
    # NOTE(review): presumably the trainer drops samples whose label maps to None — confirm.
    return None if label in ("other", "footer") else label


In [1]:
from classifiers.tz_classifier.tz_features_extractor import TzTextFeatures
from classifiers.tz_classifier.tz_classifier_trainer import TzClassifierTrainer

In [3]:
# Input/output locations handed to the trainer below.
resources_path = "/Users/anastasiabogatenkova/DOCXParser/classifiers/tz_classifier/resources"
data_path = "/Users/anastasiabogatenkova/DOCXParser/data/labeled_tz.json"
# Output file inside the resources directory (passed to the trainer as `path_out`).
path_out = os.path.join(resources_path, "tz_classifier.pkl.gz")

feature_extractor = TzTextFeatures()

# Keyword settings passed to the trainer as `classifier_parameters`.
classifier_parameters = {
    "learning_rate": 0.2,
    "n_estimators": 600,
    "booster": "gbtree",
    "max_depth": 5,
    "colsample_bynode": 0.1,
    "colsample_bytree": 1,
}

In [4]:
# Build the trainer from the paths, feature extractor and classifier
# parameters defined in the configuration cell; `skip_labels` is supplied as
# the label transformer and the random seed is fixed at 42.
trainer = TzClassifierTrainer(
    data_path=data_path,
    feature_extractor=feature_extractor,
    path_out=path_out,
    path_log=resources_path,  # the resources directory is also used as the log location
    label_transformer=skip_labels,
    classifier_parameters=classifier_parameters,
    random_seed=42,
)

# First run with cross_val_only=True; the recorded output is a progress bar
# followed by a table of misclassification counts.
# NOTE(review): presumably this performs cross-validation only, without
# saving a final model — confirm in TzClassifierTrainer.fit.
trainer.fit(cross_val_only=True)

100%|██████████| 10/10 [00:09<00:00,  1.06it/s]


save errors in /Users/anastasiabogatenkova/DOCXParser/classifiers/tz_classifier/resources/errors
true             -> predicted       	 cnt	 percent
item             -> toc              00,074 (21.70%)
item             -> part             00,059 (17.30%)
item             -> raw_text         00,039 (11.44%)
toc              -> item             00,035 (10.26%)
title            -> raw_text         00,028 (8.21%)
raw_text         -> title            00,026 (7.62%)
part             -> raw_text         00,020 (5.87%)
part             -> item             00,018 (5.28%)
toc              -> raw_text         00,014 (4.11%)
raw_text         -> toc              00,011 (3.23%)
toc              -> title            00,010 (2.93%)
part             -> toc              00,005 (1.47%)
raw_text         -> item             00,001 (0.29%)
raw_text         -> part             00,001 (0.29%)


In [5]:
# Second run with cross_val_only=False; compared with the previous run, the
# recorded output additionally reports the training data shape and a path
# where scores are saved.
# NOTE(review): presumably this also trains the final classifier and writes
# it to `path_out` — confirm in TzClassifierTrainer.fit.
trainer.fit(cross_val_only=False)


100%|██████████| 10/10 [00:11<00:00,  1.11s/it]


save errors in /Users/anastasiabogatenkova/DOCXParser/classifiers/tz_classifier/resources/errors
true             -> predicted       	 cnt	 percent
item             -> toc              00,074 (21.70%)
item             -> part             00,059 (17.30%)
item             -> raw_text         00,039 (11.44%)
toc              -> item             00,035 (10.26%)
title            -> raw_text         00,028 (8.21%)
raw_text         -> title            00,026 (7.62%)
part             -> raw_text         00,020 (5.87%)
part             -> item             00,018 (5.28%)
toc              -> raw_text         00,014 (4.11%)
raw_text         -> toc              00,011 (3.23%)
toc              -> title            00,010 (2.93%)
part             -> toc              00,005 (1.47%)
raw_text         -> item             00,001 (0.29%)
raw_text         -> part             00,001 (0.29%)
data train shape (1376, 150)
Save scores in /Users/anastasiabogatenkova/DOCXParser/classifiers/tz_classifier/resources/s