In [61]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import re

import spacy

from tika import parser

nlp = spacy.load("ru_core_news_sm")


In [18]:
data = pd.read_csv("./train.csv")


### Чистим и разбиваем данные

In [3]:
def split_to_paragraphs(text: str) -> list:
    return re.split(r"\n *\d\.*\d*\.*\d*\.*\d*\.*", text)


def create_df_paragraphs(df: pd.DataFrame) -> pd.DataFrame:
    paragraphs = []
    target = []
    for i, text in enumerate(data.text):
        tmp = split_to_paragraphs(text)
        paragraphs += tmp
        for j in tmp:
            target.append(data.target[i])

    data_paragraps = pd.DataFrame({"text": paragraphs, "target": target})
    return data_paragraps


In [4]:
data_p = create_df_paragraphs(data)


In [5]:
def delete_rubbish(text: str) -> str:
    doc = nlp(text)
    res = [x.text for x in doc if not x.is_punct and not x.is_space]
    return " ".join(res).lower()


def delete_rubbish_and_lemma(text: str) -> str:
    doc = nlp(text)
    res = [x.lemma_ for x in doc if not x.is_punct and not x.is_space]
    return " ".join(res).lower()


In [None]:
data_clean = data_p.copy()
data_clean.text = data_p.text.apply(delete_rubbish)


In [90]:
data_clean = data_clean[data_clean.text != ""]


In [92]:
data_clean.text.iloc[65]


'договор № от г. г. именуемое в дальнейшем подрядчик в лице действующего на основании и именуемое в дальнейшем заказчик в лице действующего на основании заключили настоящий договор о нижеследующем'

data_clean_lem = data_p.copy()
data_clean_lem.text = data_p.text.apply(delete_rubbish)

In [93]:
data_clean.to_csv("./data_paragraph.csv", index=False)
# data_clean_lem.to_csv("./data_paragraph_lem.csv", index = False)


### Получение эмбеддингов фраз

#### sentence-transformers/paraphrase-multilingual-mpnet-base-v2

In [19]:
data = pd.read_csv("./data_paragraph.csv")


In [8]:
data.head()


Unnamed: 0,text,target
0,договор № договор г. москва 2020 года гр рф го...,Договоры для акселератора/Договоры оказания услуг
1,предмет договора,Договоры для акселератора/Договоры оказания услуг
2,заказчик поручает а исполнитель обязуется на у...,Договоры для акселератора/Договоры оказания услуг
3,заказчик обязуется оплатить услуги исполнителя...,Договоры для акселератора/Договоры оказания услуг
4,исполнитель вправе совершать действия направле...,Договоры для акселератора/Договоры оказания услуг


In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)


In [21]:
embeddings = []
for text in tqdm(data.text):
    embeddings.append(model.encode(text))


100%|██████████| 7137/7137 [04:02<00:00, 29.43it/s]


In [13]:
assert len(embeddings) == len(data.text)


#### Тренируем лог регрессию

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
from sklearn.linear_model import LogisticRegression
import optuna


In [24]:
tmp = pd.DataFrame(embeddings)
tmp["target"] = data["target"]


In [12]:
from tabnanny import verbose


def objective(trial):
    global tmp
    X = tmp.drop("target", axis=1)
    y = tmp["target"]
    param_grid = {
        "max_iter": trial.suggest_int("max_iter", 100, 500),
        "C": trial.suggest_float("C", 2.0, 5.0),
        "tol": trial.suggest_uniform("tol", 1e-6, 1e-3),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=11)
    cv_predicts = np.empty(5)
    for idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        lr = LogisticRegression(
            penalty="l1",
            random_state=11,
            solver="saga",
            multi_class="multinomial",
            n_jobs=-1,
            **param_grid,
            verbose=0
        )
        lr.fit(x_train, y_train)
        preds = lr.predict(x_val)
        cv_predicts[idx] = f1_score(y_val, preds, average="macro")
    return np.mean(cv_predicts)


In [236]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, n_jobs=20)
print(study.best_trial)


[32m[I 2022-12-16 12:32:06,642][0m A new study created in memory with name: no-name-1575a720-5c3d-43b2-844f-e000f27b1081[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge

FrozenTrial(number=17, values=[0.6853288214879836], datetime_start=datetime.datetime(2022, 12, 16, 12, 32, 6, 849276), datetime_complete=datetime.datetime(2022, 12, 16, 12, 38, 7, 832193), params={'max_iter': 473, 'C': 4.185962106001983, 'tol': 0.00010234177020776471}, distributions={'max_iter': IntUniformDistribution(high=500, low=100, step=1), 'C': UniformDistribution(high=5.0, low=2.0), 'tol': UniformDistribution(high=0.001, low=1e-06)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=17, state=TrialState.COMPLETE, value=None)


In [25]:
params = dict(
    penalty="l1",
    tol=0.000312,
    C=4.888384842733068,
    random_state=11,
    solver="saga",
    max_iter=492,
    multi_class="multinomial",
    verbose=1,
    n_jobs=-1,
)

lr = LogisticRegression(**params)

kf = KFold(n_splits=5, shuffle=True, random_state=11)

total_acc, total_f1, total_precision, total_recall = 0, 0, 0, 0


for train_idx, val_idx in tqdm(kf.split(tmp)):
    y_train, y_val = tmp["target"].loc[train_idx], tmp["target"].loc[val_idx]
    x_train, x_val = tmp.loc[train_idx], tmp.loc[val_idx]
    x_train.drop("target", axis=1, inplace=True)
    x_val.drop("target", axis=1, inplace=True)

    lr.fit(x_train, y_train)
    predicted = lr.predict(x_val)
    acc = accuracy_score(y_val, predicted)
    total_acc += acc
    f1 = f1_score(y_val, predicted, average="macro")
    total_f1 += f1
    precision = precision_score(y_val, predicted, average="macro")
    total_precision += precision
    recall = recall_score(y_val, predicted, average="macro")
    total_recall += recall
    print(f"[epoch] acc: {acc}, f1: {f1}, precision: {precision}, recall: {recall}")
print(
    f"[mean] acc: {round(total_acc/5, 5)}, f1: {round(total_f1/5, 5)},\
        precision: {round(total_precision/5, 5)}, recall: {round(total_recall/5, 5)}"
)


0it [00:00, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.29286200
Epoch 3, change: 0.17026153
Epoch 4, change: 0.11992735
Epoch 5, change: 0.08792420
Epoch 6, change: 0.07566236
Epoch 7, change: 0.05539123
Epoch 8, change: 0.05202252
Epoch 9, change: 0.04533719
Epoch 10, change: 0.04019618
Epoch 11, change: 0.03982613
Epoch 12, change: 0.03554304
Epoch 13, change: 0.03403800
Epoch 14, change: 0.03149009
Epoch 15, change: 0.02979030
Epoch 16, change: 0.02846844
Epoch 17, change: 0.02687029
Epoch 18, change: 0.02518490
Epoch 19, change: 0.02397186
Epoch 20, change: 0.02346643
Epoch 21, change: 0.02221876
Epoch 22, change: 0.02149194
Epoch 23, change: 0.02104384
Epoch 24, change: 0.02025161
Epoch 25, change: 0.01956769
Epoch 26, change: 0.01888189
Epoch 27, change: 0.01840920
Epoch 28, change: 0.01792373
Epoch 29, change: 0.01758609
Epoch 30, change: 0.01706440
Epoch 31, change: 0.01660526
Epoch 32, change: 0.01653442
Epoch 33, change: 0.01600605
Epoch 34, change: 0.01545430
Epoch 35, change: 0.015

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   54.1s finished
1it [00:54, 54.19s/it]

Epoch 491, change: 0.00039460
max_iter reached after 54 secondsEpoch 492, change: 0.00039376

[epoch] acc: 0.688375350140056, f1: 0.6838195304617397, precision: 0.6945264095279178, recall: 0.6781761724153682
[mean] acc: 0.13768, f1: 0.13676,        precision: 0.13891, recall: 0.13564





### Собираем финальную модель

In [54]:
from typing import Tuple
from io import BytesIO
import os
import argparse
import re
import shutil
import pdfkit
import fitz


def search_for_text(lines, search_str):
    """
    Search for the search string within the document lines
    """
    for line in lines:
        # Find all matches within one line
        results = re.findall(search_str, line, flags = re.IGNORECASE)
        # In case multiple matches within one line
        for result in results:
            yield result

def highlight_matching_data(page, matched_values):
    """
    Highlight matching values
    """
    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        matching_val_area = page.searchFor(val)
        # print("matching_val_area",matching_val_area)
        highlight = None

        highlight = page.addHighlightAnnot(matching_val_area)
        
        highlight.update()
    return matches_found

def highlight_matching_data(page, matched_values, color):
    """
    Highlight matching values
    """
    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        if len(val) < 15 or len(val.split()) < 3: 
            continue
        matches_found += 1
        matching_val_area = page.searchFor(val)
        # print("matching_val_area",matching_val_area)
        highlight = None

        highlight = page.addHighlightAnnot(matching_val_area)

        if color == "main":
            highlight.setColors({"stroke":(0.66, 0.92, 0.70)})
        elif color == "secondааааа":
            highlight.setColors({"stroke":(0.61, 0.87, 0.92)})
        elif color == "warn" or color == "second":
            highlight.setColors({"stroke":(0.96, 0.75, 0.77)})  
        
        highlight.update()
    return matches_found

def process_data(input_file: str, output_file: str, search_str: str, color: str):
    pdfDoc = fitz.open(input_file)
    output_buffer = BytesIO()
    total_matches = 0

    for pg in range(pdfDoc.pageCount):

        page = pdfDoc[pg]
        page_lines = page.getText("text").split('\n')
        matched_values = search_for_text(page_lines, search_str)

        matches_found = highlight_matching_data(
                    page, matched_values, color)

        total_matches += matches_found

    print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
    # Save to output
    pdfDoc.save(output_buffer)
    pdfDoc.close()
    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())

In [55]:
class DocClassifier:
    def __init__(self, nlp, embed, classifier) -> None:
        self.nlp = nlp
        self.embed = embed
        self.classifier = classifier

    def get_text_from_file(self, path: str) -> str:
        parsed = parser.from_file(path)
        return parsed["content"]

    def split_to_paragraphs(self, text: str) -> list:
        return re.split(r"\n[\t ]*\d\.*\d*\.*\d*\.*\d*\.*", text)

    def delete_rubbish(self, text: str, lower=True) -> str:
        doc = self.nlp(text)
        res = [x.text for x in doc if not x.is_punct and not x.is_space]
        if lower:
            return " ".join(res).lower()
        else:
            return " ".join(res)

    def preprocess(self, text: str, lower=True) -> list:
        paragraphs = self.split_to_paragraphs(text)

        for i, line in enumerate(paragraphs):
            paragraphs[i] = self.delete_rubbish(line, lower=lower)

        return paragraphs

    def get_embeddings(self, paragraphs: list) -> list:
        embeddings = []
        for text in paragraphs:
            embeddings.append(self.embed.encode(text))

        return embeddings

    def predict_proba(self, path: str, is_path=True) -> np.array:
        if is_path:
            text = self.get_text_from_file(path)
        else:
            text = path
        paragraphs = self.preprocess(text)
        embeddings = self.get_embeddings(paragraphs)
        res = np.zeros(5)
        for emb in embeddings:
            res += self.classifier.predict_proba([emb])[0]
        return res

    def get_dict(self, path: str, is_path=True) -> dict:
        if is_path:
            text = self.get_text_from_file(path)
        else:
            text = path

        paragraphs = self.preprocess(text)
        embeddings = self.get_embeddings(paragraphs)
        #paragraphs = np.array(self.preprocess(text, lower=False))
        paragraphs = paragraphs = self.split_to_paragraphs(self.get_text_from_file(path))
        paragraphs = [i.rstrip(" ").lstrip(" ") for i in paragraphs]
        paragraphs = [i.replace("(", "\(") for i in paragraphs if "("]
        paragraphs = [i.replace(")", "\)") for i in paragraphs if ")"]
        paragraphs = np.array(paragraphs)
        res = np.zeros(5)
        probs = []
        for emb in embeddings:
            pred = self.classifier.predict_proba([emb])[0]
            probs.append(pred)
            res += pred
        probs = np.array(probs)
        res = res / np.sum(res)

        main_cat_ind = np.argmax(res)
        second_cat_ind = np.argsort(res, axis=0)[-2]

        main_cat = self.classifier.classes_[main_cat_ind]
        second_cat = self.classifier.classes_[second_cat_ind]

        main_cat_prob = res[main_cat_ind]
        second_cat_prob = res[second_cat_ind]

        main_best_sent = paragraphs[np.argsort(probs[:,main_cat_ind], axis=0)][::-1][0:min(6, int(0.4*len(paragraphs)))]
        second_best_sent = paragraphs[np.argsort(probs[:,second_cat_ind], axis=0)][::-1][0:min(6, int(0.4*len(paragraphs)))]

        sent_warnings = []
        for j, prob in enumerate(probs):
            for i, p in enumerate(prob):
                 if i != main_cat_ind and (i != second_cat_ind or main_cat_prob > 0.6) and p > 0.7:
                    sent_warnings.append(paragraphs[j])

        if main_cat_prob > 0.6:
            output = {
                "main_class": {
                    "prob": main_cat_prob,
                    "class": main_cat,
                    "top_quotes":list(main_best_sent),
                    },
                "second_class": None
                }
        else:
            output = {
                "main_class": {
                    "prob": main_cat_prob,
                    "class": main_cat,
                    "top_quotes": list(main_best_sent),
                    },
                "second_class": {
                    "prob": second_cat_prob,
                    "class": second_cat,
                    "top_quotes": list(second_best_sent),
                    }
                }

        return output
    
    def convert_to_pdf(self, path:str) -> None:
        if path.split(".")[-1] == 'pdf':
            shutil.copyfile(path, "./output.pdf")
        else:
            with open("./tmp.txt", 'w') as f:
                f.write(cl.get_text_from_file(path))
            opt = {
                    'encoding': 'UTF-8',
                    'enable-local-file-access': True
                }
            pdfkit.from_file("./tmp.txt", "./output.pdf", options=opt)
            
            

    def pdf_viz(self, path:str) -> None:
        self.convert_to_pdf(path)
        out = self.get_dict("./output.pdf")

        for text in out["main_class"]["top_quotes"]:
            lines = text.split("\n")
            lines = [i for i in lines if i != "" and i!= " "]
            print(lines)
            for line in lines:
                process_data("./output.pdf", "./output.pdf", line, color= "main")

        if out["second_class"] is not None:
            for text in out["second_class"]["top_quotes"]:
                lines = text.split("\n")
                lines = [i for i in lines if i != "" and i!= " "]
                for line in lines:
                    process_data("./output.pdf", "./output.pdf", line, color= "second")
        


### Проверяем работу

In [56]:
cl = DocClassifier(
    spacy.load("ru_core_news_sm"),
    SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2"),
    lr,
)

In [57]:
cl.get_dict("двойной-in.doc")

{'main_class': {'prob': 0.5192140025823714,
  'class': 'Договоры для акселератора/Договоры купли-продажи',
  'top_quotes': ['Продавец продает,  а  Покупатель  покупает  товары  по  следующим\n ценам:\n ------------------------------------------------------------------------\n\n ------------------------------------------------------------------------\n\n ------------------------------------------------------------------------\n',
   'Качество  товара,  представленного   Продавцом   по   настоящему\n Договору, должно соответствовать представленным образцам.']},
 'second_class': {'prob': 0.18617056774441637,
  'class': 'Договоры для акселератора/Договоры подряда',
  'top_quotes': ['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nДоговор подряда \\(образец 1\\)\n\nДоговор \n\n___________________                             "___"______________ 20  г.\n\n    Фирма "___________________", в  лице  генерального директора \\(Ф.И.О.\\),\n именуемого  в 

In [None]:
cl.pdf_viz("двойной.doc")

### Сохраняем

In [306]:
import pickle
with open("./model3.pkl", 'wb') as f:
    pickle.dump(cl, f)

# Тестирование всего алгоритма на 5 фолдах

In [245]:
def split_to_paragraphs(text: str) -> list:
    return re.split(r"\n *\d\.*\d*\.*\d*\.*\d*\.*", text)


def create_df_paragraphs(df: pd.DataFrame) -> pd.DataFrame:
    paragraphs = []
    target = []
    for i, text in enumerate(data.text):
        tmp = split_to_paragraphs(text)
        paragraphs += tmp
        for j in tmp:
            target.append(data.target[i])

    data_paragraps = pd.DataFrame({"text": paragraphs, "target": target})
    return data_paragraps


def delete_rubbish(text: str) -> str:
    doc = nlp(text)
    res = [x.text for x in doc if not x.is_punct and not x.is_space]
    return " ".join(res).lower()


In [246]:
data = pd.read_csv("./train.csv")


In [251]:
params = dict(
    penalty="l1",
    tol=0.000312,
    C=4.888384842733068,
    random_state=11,
    solver="saga",
    max_iter=492,
    multi_class="multinomial",
    verbose=1,
    n_jobs=-1,
)

model = spacy.load("ru_core_news_sm")
embed = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)



In [296]:
kf = KFold(n_splits=5, shuffle=True, random_state=11)

total_acc, total_f1, total_precision, total_recall = 0, 0, 0, 0

for train_idx, val_idx in kf.split(data):
    y_val = data["target"].loc[val_idx]
    x_val = data.loc[val_idx]
    x_val.drop("target", axis=1, inplace=True)

    train_data = create_df_paragraphs(data.loc[train_idx])
    train_data.text = train_data.text.apply(delete_rubbish)

    embeddings = []
    for text in tqdm(train_data.text):
        embeddings.append(embed.encode(text))

    x_train = pd.DataFrame(embeddings)
    y_train = train_data.target

    lr = LogisticRegression(**params)

    lr.fit(x_train, y_train)

    cl = DocClassifier(model, embed, lr)

    predicted = []
    for text in x_val.text:
        predicted.append(
            cl.classifier.classes_[np.argmax(cl.predict_proba(text, is_path=False))]
        )

    acc = accuracy_score(y_val, predicted)
    total_acc += acc
    f1 = f1_score(y_val, predicted, average="macro")
    total_f1 += f1
    precision = precision_score(y_val, predicted, average="macro")
    total_precision += precision
    recall = recall_score(y_val, predicted, average="macro")
    total_recall += recall
    print(f"[epoch] acc: {acc}, f1: {f1}, precision: {precision}, recall: {recall}")
print(
    f"[mean] acc: {round(total_acc/5, 5)}, f1: {round(total_f1/5, 5)},\
        precision: {round(total_precision/5, 5)}, recall: {round(total_recall/5, 5)}"
)


100%|██████████| 7151/7151 [04:08<00:00, 28.79it/s]

The max_iter was reached which means the coef_ did not converge



[epoch] acc: 0.9583333333333334, f1: 0.9580952380952382, precision: 0.975, recall: 0.95


100%|██████████| 7151/7151 [04:15<00:00, 27.97it/s]

The max_iter was reached which means the coef_ did not converge



[epoch] acc: 1.0, f1: 1.0, precision: 1.0, recall: 1.0


100%|██████████| 7151/7151 [04:07<00:00, 28.85it/s]

The max_iter was reached which means the coef_ did not converge



[epoch] acc: 0.9583333333333334, f1: 0.9151515151515153, precision: 0.9666666666666668, recall: 0.9


100%|██████████| 7151/7151 [04:07<00:00, 28.94it/s]

The max_iter was reached which means the coef_ did not converge



[epoch] acc: 0.9583333333333334, f1: 0.9492063492063492, precision: 0.95, recall: 0.96


100%|██████████| 7151/7151 [04:08<00:00, 28.83it/s]

The max_iter was reached which means the coef_ did not converge



[epoch] acc: 1.0, f1: 1.0, precision: 1.0, recall: 1.0
[mean] acc: 0.975, f1: 0.96449,        precision: 0.97833, recall: 0.962
