In [None]:
!pip install python-docx
!pip install catboost

In [None]:
!gdown --id 1zJnmnqJ7uIJz2FNjWYgXbihtx6vT-DH5# Loading a dataset
!7z x DataSet_Razmetra.7z -oDataSet_Razmetra# Unpacking the dataset

In [None]:
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm

from docx import Document
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier

In [None]:
# Paths of every Edition_Text.docx located four directory levels below the
# dataset root (without recursive=True each '**' matches one path component).
file_names = list(glob.iglob('DataSet_Razmetra/**/**/**/**/Edition_Text.docx'))

In [None]:
# Third path component of every collected file; the training cells below
# compare this component against the corruption factor.
res = [path.split('/')[2] for path in file_names]

# How many files fall under each distinct value.
pd.Series(res).value_counts()

In [None]:
# We look at the metrics separately for each corruption factor:
#   * positives are the highlighted paragraphs of the marked-up
#     Edition_Text.docx files of that factor (label = the factor),
#     negatives are the highlighted paragraphs of its NC_Edition_Text.docx
#     files, i.e. the "fixed files" (label = 'None');
#   * each class is divided 75/25 into train and validation, in file order;
#   * TF-IDF vectors are built from the paragraph texts;
#   * boosting is trained on these vectors;
#   * a classification report on the validation part is printed.


def _has_highlight(paragraph):
    """Return True if at least one run of the paragraph has a highlight colour."""
    return any(run.font.highlight_color is not None for run in paragraph.runs)


def _collect_highlighted_paragraphs(paths, corr_fac):
    """Gather the highlighted, non-empty paragraphs of one factor's documents.

    Args:
        paths: document paths; the third '/'-separated component of a path is
            compared with ``corr_fac``.
        corr_fac: factor name; files belonging to other factors are skipped.

    Returns:
        (texts, marked_cnt): the texts of the highlighted non-empty paragraphs
        and the number of documents containing at least one highlighted run.
    """
    texts = []
    marked_cnt = 0
    for path in tqdm(paths):
        if path.split('/')[2] != corr_fac:
            continue
        try:
            doc = Document(path)
            marked = [p for p in doc.paragraphs if _has_highlight(p)]
        except Exception as exc:
            # Best effort: an unreadable document is reported and skipped
            # instead of being silently ignored or aborting the whole run.
            print('Skipped', path, '-', repr(exc))
            continue
        if marked:
            marked_cnt += 1
            texts.extend(p.text for p in marked if p.text != '')
    return texts, marked_cnt


def _split_head_tail(items, share=0.75):
    """Split a list into its first ``share`` fraction and the remainder."""
    cut = int(share * len(items))
    return items[:cut], items[cut:]


def _labelled_frame(pos_texts, neg_texts, corr_fac):
    """Build a de-duplicated (parag, label) frame; negatives are labelled 'None'."""
    return pd.DataFrame({
        'parag': pos_texts + neg_texts,
        'label': [corr_fac] * len(pos_texts) + ['None'] * len(neg_texts),
    }).drop_duplicates()


# The file lists do not depend on the factor, so they are built once.
marked_files = glob.glob('DataSet_Razmetra/**/**/**/**/Edition_Text.docx')
fixed_files = glob.glob('DataSet_Razmetra/**/**/**/**/NC_Edition_Text.docx')

for corr_fac in list(pd.Series(res).unique()):
    pos_parags, highlighted_cnt = _collect_highlighted_paragraphs(marked_files, corr_fac)
    print('Найдено', len(marked_files), 'файлов, из них', highlighted_cnt, 'с разметкой')

    neg_parags, highlighted_cnt = _collect_highlighted_paragraphs(fixed_files, corr_fac)
    print('Найдено', len(fixed_files), 'файлов, из них', highlighted_cnt, 'с разметкой')

    parags = pos_parags + neg_parags
    labels = [corr_fac] * len(pos_parags) + ['None'] * len(neg_parags)
    print(len(parags), len(labels))

    # Split every class on its own. Cutting the concatenated list in half only
    # works when both classes have the same size; otherwise the validation
    # part may end up without any paragraph of one class.
    pos_train, pos_test = _split_head_tail(pos_parags)
    neg_train, neg_test = _split_head_tail(neg_parags)

    data_train = _labelled_frame(pos_train, neg_train, corr_fac)
    data_test = _labelled_frame(pos_test, neg_test, corr_fac)

    # A binary classifier cannot be fitted or validated without both classes.
    if data_train.label.nunique() < 2 or data_test.empty:
        print(corr_fac, '- skipped: not enough marked-up paragraphs of both classes')
        continue

    vectorizer = TfidfVectorizer(min_df=5, max_df=100000, ngram_range=(1,5))
    try:
        features_train = vectorizer.fit_transform(data_train.parag)
    except ValueError as exc:
        # Raised when no term survives the min_df pruning (too little text).
        print(corr_fac, '- skipped:', exc)
        continue
    print(features_train.shape)

    features_test = vectorizer.transform(data_test.parag)

    # NOTE(review): the validation part is also passed as eval_set; if CatBoost
    # selects the best iteration on it, the report below is optimistic - confirm.
    clf = CatBoostClassifier(n_estimators=100,
                            eval_metric='F1')
    clf.fit(features_train, data_train.label,
            eval_set=(features_test, data_test.label))
    y_test_pred = clf.predict(features_test)

    print(corr_fac)
    print(classification_report(data_test.label, y_test_pred))

In [None]:
# Repeat the same for the selected factors and keep the fitted models:
# models[factor] = (fitted TfidfVectorizer, fitted CatBoostClassifier).
# The helpers are defined in this cell so that it can be run on its own.


def _has_highlight(paragraph):
    """Return True if at least one run of the paragraph has a highlight colour."""
    return any(run.font.highlight_color is not None for run in paragraph.runs)


def _collect_highlighted_paragraphs(paths, corr_fac):
    """Gather the highlighted, non-empty paragraphs of one factor's documents.

    Args:
        paths: document paths; the third '/'-separated component of a path is
            compared with ``corr_fac``.
        corr_fac: factor name; files belonging to other factors are skipped.

    Returns:
        (texts, marked_cnt): the texts of the highlighted non-empty paragraphs
        and the number of documents containing at least one highlighted run.
    """
    texts = []
    marked_cnt = 0
    for path in tqdm(paths):
        if path.split('/')[2] != corr_fac:
            continue
        try:
            doc = Document(path)
            marked = [p for p in doc.paragraphs if _has_highlight(p)]
        except Exception as exc:
            # Best effort: an unreadable document is reported and skipped
            # instead of being silently ignored or aborting the whole run.
            print('Skipped', path, '-', repr(exc))
            continue
        if marked:
            marked_cnt += 1
            texts.extend(p.text for p in marked if p.text != '')
    return texts, marked_cnt


def _split_head_tail(items, share=0.75):
    """Split a list into its first ``share`` fraction and the remainder."""
    cut = int(share * len(items))
    return items[:cut], items[cut:]


def _labelled_frame(pos_texts, neg_texts, corr_fac):
    """Build a de-duplicated (parag, label) frame; negatives are labelled 'None'."""
    return pd.DataFrame({
        'parag': pos_texts + neg_texts,
        'label': [corr_fac] * len(pos_texts) + ['None'] * len(neg_texts),
    }).drop_duplicates()


best_fac = ['4_1', '3_9', '3_5', '4_2', '4_3']

models = {}

# The file lists do not depend on the factor, so they are built once.
marked_files = glob.glob('DataSet_Razmetra/**/**/**/**/Edition_Text.docx')
fixed_files = glob.glob('DataSet_Razmetra/**/**/**/**/NC_Edition_Text.docx')

for corr_fac in best_fac:
    pos_parags, highlighted_cnt = _collect_highlighted_paragraphs(marked_files, corr_fac)
    print('Найдено', len(marked_files), 'файлов, из них', highlighted_cnt, 'с разметкой')

    neg_parags, highlighted_cnt = _collect_highlighted_paragraphs(fixed_files, corr_fac)
    print('Найдено', len(fixed_files), 'файлов, из них', highlighted_cnt, 'с разметкой')

    parags = pos_parags + neg_parags
    labels = [corr_fac] * len(pos_parags) + ['None'] * len(neg_parags)
    print(len(parags), len(labels))

    # Split every class on its own. Cutting the concatenated list in half only
    # works when both classes have the same size; otherwise the validation
    # part may end up without any paragraph of one class.
    pos_train, pos_test = _split_head_tail(pos_parags)
    neg_train, neg_test = _split_head_tail(neg_parags)

    data_train = _labelled_frame(pos_train, neg_train, corr_fac)
    data_test = _labelled_frame(pos_test, neg_test, corr_fac)

    # A binary classifier cannot be fitted or validated without both classes;
    # such a factor gets no entry in `models`.
    if data_train.label.nunique() < 2 or data_test.empty:
        print(corr_fac, '- skipped: not enough marked-up paragraphs of both classes')
        continue

    vectorizer = TfidfVectorizer(min_df=5, max_df=100000, ngram_range=(1,5))
    try:
        features_train = vectorizer.fit_transform(data_train.parag)
    except ValueError as exc:
        # Raised when no term survives the min_df pruning (too little text).
        print(corr_fac, '- skipped:', exc)
        continue
    print(features_train.shape)

    features_test = vectorizer.transform(data_test.parag)

    # NOTE(review): the validation part is also passed as eval_set; if CatBoost
    # selects the best iteration on it, the report below is optimistic - confirm.
    clf = CatBoostClassifier(n_estimators=100,
                            eval_metric='F1',)
    clf.fit(features_train, data_train.label,
            eval_set=(features_test, data_test.label))
    y_test_pred = clf.predict(features_test)

    models[corr_fac] = (vectorizer, clf)

    print(corr_fac)
    print(classification_report(data_test.label, y_test_pred))

In [None]:
import pickle

# Write the `models` dictionary to models.pickle in the working directory.
with open('models.pickle', mode='wb') as models_file:
    pickle.dump(models, models_file)

Try predicting with the saved models


In [None]:
import pickle

# models - dictionary of tuples (vectorizer, cat)
# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load a models.pickle that was produced by a trusted run of this notebook.
with open('models.pickle', 'rb') as f:
    models = pickle.load(f)

best_fac = ['3_9', '3_5', '4_3']
TRESHOLD = 0.75

f_name = 'DataSet_Razmetra/Республика Коми/3_7/A631C9EF-FA79-4453-BAF2-129A55551695/Edition_7/NC_Edition_Text.docx'
doc = Document(f_name)

# The columns of predict_proba follow clf.classes_. The labels used for
# training were the factor name and 'None', so the factor's column is looked
# up explicitly instead of assuming that it is column 1.
scorers = []
for fac in best_fac:
    vectorizer, clf = models[fac]
    scorers.append((vectorizer, clf, list(clf.classes_).index(fac)))

for paragraph in doc.paragraphs:
    par = paragraph.text
    if not par.strip():
        # The models were trained on non-empty paragraphs only.
        continue

    # check the paragraph with models: it is flagged as soon as one model
    # gives its factor a probability above the threshold
    indicator = any(
        clf.predict_proba(vectorizer.transform([par]))[0][fac_col] > TRESHOLD
        for vectorizer, clf, fac_col in scorers
    )

    # show the flagged text
    if indicator:
        print(paragraph.text)