In [9]:
import tika
tika.initVM()

In [50]:
import os
import json
import re

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier

from razdel import tokenize
from pymorphy2 import MorphAnalyzer


# import fitz
# import docx2txt

# Чтение файлов

In [11]:
from tika import parser

def get_document_text(filepath):
    doc_type = filepath.split('.')[-1]
    if doc_type in ['doc', 'rtf', 'pdf', 'docx']:
        text = parser.from_file(filepath)['content']
    else:
        print(f'File extention must be either pdf, rtf, doc or docx, got {doc_type}')
    return text.strip()

In [12]:
datadir = '../data/xmas/docs/'

In [13]:
%%time

filename2text = {}

for filename in os.listdir(datadir):
    datapath = os.path.join(datadir, filename)
    try:
        filename2text[filename] = get_document_text(datapath)
    except:
        continue

File extention must be either pdf, rtf, doc or docx, got ipynb_checkpoints
Wall time: 4min 7s


In [14]:
data = pd.DataFrame({i: {'text': v, 'filename': k} for i, (k, v) in enumerate(filename2text.items())}).T
data.head()

Unnamed: 0,text,filename
0,ДОГОВОР № ______\n\nДОГОВОР\n\nг. Москва\n«___...,02682d726b725f95b9ee85f751c043d0.doc
1,ДОГОВОР \n\n№ ______ от «___» ___________ ____...,03a70fe60be9ecc2a63798a361fc9689.docx
2,ДОГОВОР КУПЛИ-ПРОДАЖИ КВАРТИРЫ\n\n\nДОГОВОР \n...,073a0d372820c3c2bffe9ba24a7ed7af.doc
3,Договор аренды земельного участка\n\n2\n5\n\nД...,084edc7dfc3db04e3a3c55c102f47bcb.doc
4,(Типовая форма) \n\nДОГОВОР КУПЛИ-ПРОДАЖИ №___...,086ad48895d2a73854b6151decc28800.pdf


In [15]:
with open('../data/xmas/classes.json', 'r', encoding='utf-8') as f:
    markup = json.load(f)

In [16]:
data['target'] = data['filename'].apply(lambda x: markup[x].split('/')[1])

In [17]:
data['lower_text'] = data['text'].apply(lambda x: x.lower())

In [18]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['target'])

In [19]:
data.head()

Unnamed: 0,text,filename,target,lower_text,label
0,ДОГОВОР № ______\n\nДОГОВОР\n\nг. Москва\n«___...,02682d726b725f95b9ee85f751c043d0.doc,Договоры оказания услуг,договор № ______\n\nдоговор\n\nг. москва\n«___...,2
1,ДОГОВОР \n\n№ ______ от «___» ___________ ____...,03a70fe60be9ecc2a63798a361fc9689.docx,Договоры подряда,договор \n\n№ ______ от «___» ___________ ____...,3
2,ДОГОВОР КУПЛИ-ПРОДАЖИ КВАРТИРЫ\n\n\nДОГОВОР \n...,073a0d372820c3c2bffe9ba24a7ed7af.doc,Договоры купли-продажи,договор купли-продажи квартиры\n\n\nдоговор \n...,1
3,Договор аренды земельного участка\n\n2\n5\n\nД...,084edc7dfc3db04e3a3c55c102f47bcb.doc,Договоры аренды,договор аренды земельного участка\n\n2\n5\n\nд...,0
4,(Типовая форма) \n\nДОГОВОР КУПЛИ-ПРОДАЖИ №___...,086ad48895d2a73854b6151decc28800.pdf,Договоры купли-продажи,(типовая форма) \n\nдоговор купли-продажи №___...,1


# Моделирование

In [21]:
rand_state = 21

In [22]:
pipe = Pipeline(steps=[('vect', CountVectorizer()),
                        ('est', LogisticRegression(random_state=rand_state))])

In [23]:
skf = StratifiedKFold()

In [24]:
for train_index, test_index in skf.split(X=data['lower_text'].values, y=data['label'].values):
    X_train, y_train = data['lower_text'].values[train_index], data['label'].values[train_index]
    X_test, y_test = data['lower_text'].values[test_index], data['label'].values[test_index]
    pipe = Pipeline(steps=[('vect', CountVectorizer()),
                            ('est', LogisticRegression(random_state=rand_state))])
    pipe.fit(X=X_train, y=y_train)
    y_pred = pipe.predict(X_test)
    print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         5
           2       0.88      1.00      0.93         7
           3       1.00      0.75      0.86         4
           4       1.00      1.00      1.00         4

    accuracy                           0.96        24
   macro avg       0.97      0.95      0.96        24
weighted avg       0.96      0.96      0.96        24



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         4
           2       1.00      0.86      0.92         7
           3       0.80      1.00      0.89         4
           4       1.00      1.00      1.00         4

    accuracy                           0.96        24
   macro avg       0.96      0.97      0.96        24
weighted avg       0.97      0.96      0.96        24



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         4

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.80      0.89         5
           2       1.00      1.00      1.00         6
           3       0.80      1.00      0.89         4
           4       1.00      1.00      1.00         4

    accuracy                           0.96        24
   macro avg       0.96      0.96      0.96        24
weighted avg       0.97      0.96      0.96        24

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.83      1.00      0.91         5
           2       0.86      1.00      0.92         6
           3       1.00      0.50      0.67         4
           4       1.00      1.00      1.00         5

    accuracy                           0.92        24
   macro avg       0.94      0.90      0.90        24
weighted avg       0.93      0.92      0.91        24



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Интерпретация

In [40]:
import eli5

In [51]:
vec=CountVectorizer()
est=LogisticRegression(random_state=rand_state)

pipe = make_pipeline(vec, est)
pipe.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression(random_state=21))])

In [116]:
expl = eli5.ipython.explain_prediction(est, X_test[0], vec=vec, 
                                       target_names=le.classes_, 
                                       targets=[le.classes_[-1]], top=5)



In [117]:
dict_expl = eli5.format_as_dict(eli5.ipython.explain_prediction(est, X_test[1], vec=vec,
                                target_names=le.classes_, targets=[le.classes_[1]], top=5))



# Обертка

In [15]:
pipe.fit(X=data['lower_text'].values, y=data['label'].values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('vect', CountVectorizer()),
                ('est', LogisticRegression(random_state=21))])

In [16]:
import pickle

In [17]:
with open('baseline.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [1]:
import tika
tika.initVM()
from tika import parser
import pickle

In [2]:
class Predictor(object):
    def __init__(self, model_path: str):
        self.id2label = {0: 'Договоры аренды',
                         1: 'Договоры купли-продажи',
                         2: 'Договоры оказания услуг',
                         3: 'Договоры подряда',
                         4: 'Договоры поставки'}
        with open('baseline.pkl', 'rb') as f:
            self.model = pickle.load(f)
            
    def predict_from_file(self, filepath: str) -> str:
        text = Predictor._get_document_text(filepath)
        return self.id2label[self.model.predict([text])[0]]
        
    @staticmethod
    def _get_document_text(filepath: str) -> str:
        doc_type = filepath.split('.')[-1]
        if doc_type in ['doc', 'rtf', 'pdf', 'docx']:
            text = parser.from_file(filepath)['content']
        else:
            raise NotImplementedError(f'File extention must be either pdf, rtf, doc or docx, got {doc_type}')
        return text.strip().lower()

In [3]:
pred = Predictor('baseline.pkl')

In [4]:
pred.predict_from_file('../data/xmas/docs/02682d726b725f95b9ee85f751c043d0.doc')

'Договоры оказания услуг'