In [19]:
import PyPDF2
import textract
import pickle
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn import metrics

warnings.filterwarnings("ignore")

In [20]:
def readPdf(name):
    """Extract the text of a PDF file, falling back to OCR when none is found.

    Parameters
    ----------
    name : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The text of all pages concatenated in page order as extracted by
        PyPDF2. When that text is empty (no extractable text, or the
        extraction failed before producing any), the decoded output of
        ``textract.process(name, method='tesseract')`` is returned instead.

    Notes
    -----
    A failure during the PyPDF2 stage is reported with ``print`` and not
    re-raised, so the OCR fallback still gets a chance; text collected from
    pages read before the failure is kept. Errors raised by the textract
    fallback propagate to the caller.
    """
    text = ""

    try:
        # The context manager closes the file handle even when a page fails
        # to parse; the reader is only used while the handle is open.
        with open(name, 'rb') as pdf_file:
            pdf = PyPDF2.PdfFileReader(pdf_file)
            for i in range(pdf.numPages):
                text += pdf.getPage(i).extractText()
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print('problem with ' + name)

    if text:
        return text
    # No embedded text could be extracted: run OCR on the file instead.
    return textract.process(name, method='tesseract').decode()

In [21]:
def process(x):
    """Normalise a string: lower-case it, drop digits/underscores and non-ASCII.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` in lower case with the characters ``0-9`` and ``_`` removed and
        every character that cannot be encoded as ASCII discarded.
    """
    # Translation table that deletes digits and the underscore.
    removal_table = str.maketrans('', '', '0123456789_')
    lowered = x.lower()
    without_digits = lowered.translate(removal_table)
    # Round-trip through ASCII to silently drop accented/non-ASCII characters.
    ascii_bytes = without_digits.encode('ascii', errors='ignore')
    return ascii_bytes.decode()

In [22]:
def classify_document(document, clf, bd = '', expediente=''):
    """Classify a PDF document and optionally record it in a DataFrame.

    Parameters
    ----------
    document : str
        Path of the PDF file; its text is obtained with ``readPdf``.
    clf : object
        Classifier exposing ``predict``; it is called with a one-element list
        containing the extracted text.
    bd : pandas.DataFrame or str, optional
        When a DataFrame is given, the result is stored in it (in place),
        indexed by ``expediente``. Any other value (default ``''``) means
        "do not store anything".
    expediente : str, optional
        Index label under which the document is stored. Required when ``bd``
        is a DataFrame.

    Returns
    -------
    None
        The predicted label is printed, not returned.
    """
    store_result = isinstance(bd, pd.DataFrame)

    if store_result and not expediente:
        print('Expediente missing!')
        return

    text = readPdf(document)
    acao = clf.predict([text])[0]
    print(acao)

    if not store_result:
        return

    if expediente in bd.index:
        # Write through a single .loc[row, column] call: the original
        # ``bd.loc[expediente].text += text`` modified a row object that may
        # be a copy, so the appended text could be silently lost.
        bd.loc[expediente, 'text'] += text
        print('Expediente', expediente, 'updated!')
    else:
        # New expediente: add a row with the predicted label and the text.
        bd.loc[expediente] = (acao, text)

In [23]:
def retrieval_classification(expediente, bd):
    """Print the row stored in ``bd`` for the given expediente.

    Parameters
    ----------
    expediente : object
        Identifier of the row; it is converted with ``str`` before the lookup.
    bd : pandas.DataFrame
        Frame indexed by expediente.

    Raises
    ------
    KeyError
        If the stringified expediente is not present in the index.
    """
    label = str(expediente)
    record = bd.loc[label]
    print(record)

In [24]:
def correct_classification(v, column, expediente, bd):
    """Overwrite a single cell of ``bd`` in place.

    Parameters
    ----------
    v : object
        New value for the cell.
    column : str
        Column label of the cell (e.g. ``'acao'``).
    expediente : str
        Index label of the row to correct.
    bd : pandas.DataFrame
        Frame to modify; it is mutated in place and nothing is returned.

    Raises
    ------
    ValueError
        If ``v`` is a string that cannot be converted to the numeric dtype of
        an existing numeric column.
    """
    # A label typed as text (e.g. '50') must not turn a numeric column into a
    # mixed str/number column, so cast it to the column's scalar type first.
    if isinstance(v, str) and column in bd.columns:
        target = bd[column]
        if pd.api.types.is_numeric_dtype(target) and not pd.api.types.is_bool_dtype(target):
            v = target.dtype.type(v)

    # ``DataFrame.set_value`` was deprecated and later removed from pandas;
    # ``.at`` is the scalar setter that replaces it.
    bd.at[expediente, column] = v

In [25]:
def retrain_model(clf, bd):
    """Refit ``clf`` on the training part of ``bd`` and print its metrics.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit``; it is fitted on the training split.
    bd : pandas.DataFrame
        Frame with a ``text`` column (features) and an ``acao`` column
        (labels).

    Returns
    -------
    None
        The evaluation is printed by ``check_accuracy`` (called with
        ``metric=True``).
    """
    # Fixed 80/20 split with random_state=42; train_test_split returns
    # (x_train, x_test, y_train, y_test) and only the training pair is used.
    parts = train_test_split(bd.text, bd.acao, test_size = 0.2, random_state=42)
    train_texts, train_labels = parts[0], parts[2]
    clf = clf.fit(train_texts, train_labels)
    check_accuracy(clf, bd, True)

In [26]:
def check_accuracy(clf, bd, metric=False):
    """Evaluate ``clf`` on the held-out 20% of ``bd`` and print the result.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict`` and ``score``.
    bd : pandas.DataFrame
        Frame with a ``text`` column (features) and an ``acao`` column
        (labels).
    metric : bool, optional
        When True, print the classification report and the confusion matrix;
        otherwise (default) print the value returned by ``clf.score``.

    Returns
    -------
    None
    """
    # Same split parameters (test_size=0.2, random_state=42) as retrain_model;
    # only the test part is needed here.
    _, x_test, _, y_test = train_test_split(bd.text, bd.acao, test_size = 0.2, random_state=42)

    if metric:
        # Predict only when the predictions are actually reported.
        predictions = clf.predict(x_test)
        print(metrics.classification_report(y_test, predictions))
        print(metrics.confusion_matrix(y_test, predictions))
    else:
        # The original also called clf.predict here and discarded the result;
        # score() is handed x_test/y_test directly, so that call was wasted.
        print(clf.score(x_test, y_test))

In [27]:
# Load the pickled classifier (clf) and dataset (bd) used by the cells below.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
with open('../04_machine_learning/model.pkl', "rb") as f:
    clf = pickle.load(f)

with open('../04_machine_learning/dataset.pkl', "rb") as f:
    bd = pickle.load(f)

In [28]:
# Print the score of the loaded model on the fixed 20% test split of bd.
check_accuracy(clf, bd)

0.7424242424242424


In [29]:
# Preview the first rows of the dataset (indexed by expediente).
bd.head()

Unnamed: 0_level_0,acao,text
expediente,Unnamed: 1_level_1,Unnamed: 2_level_1
02.000.00030/2017,9,PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92.1...
02.000.00035/2017,50,"\n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n \..."
02.000.00136/2017,47,\n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FEDER...
02.000.00145/2017,43,03/02/2017Número: 1000055-73.2017.4.01.3200 ...
02.000.00186/2017,50,\n\nVara Unica\nSSJ Tefé\nFi.\n\nRubrica\n\n ...


In [30]:
%%time
# Extract the text of one sample petition and show its first 1000 characters.
text = readPdf('../01_source/Arquivos/02000001362017_7321216_20170210_PETICAO_INICIAL.pdf')
print(text[:1000])

, ae “8,
CAIXA ree 625
Run eB

EXCELENTISSIMO (A) SENHOR (A) DOUTOR (A) JUIZ(A) FEDERAL, DA, 3"
VARA JUIZADO ESPECIAL FEDERAL DO ESTADO DO AMAZONAS

PROCESSO: 00009635020174013200
AUTOR: AMAZONASTUR EMPRESA ESTADUAL DE TURISMO
REU: CAIXA ECONOMICA FEDERAL

2/T-ENLOOHEDT-LTOE-42-01T- Wy TR apag eT }Sh—

© A CAIXA ECONOMICA FEDERAL - CEF, instituic&o financeira
sob a forma de empresa publica, dotada de personalidade juridica de direito
privado, criada pelo Decreto-Lei n°. 759, de 12 de Agosto de 1969, regendo-se
atualmente pelo Estatuto aprovado e consolidado pelo DECRETO N°. 6.473 de 05
de Junho de 2008, com alteragao pelo Decreto n°. 6.796, de 17 de Marco de 2009,
inscrita no CGC/MF sob n°. 00.360.305/0001-04, com representagao juridica neste
Estado, sito a Av. Djalma Batista n°. 1661 Millennium Center salas 102/103 — Bairro
Chapada, CEP 69050-010, Manaus/AM, onde recebe intimagdes e as eletrénicas
em jurirmn@caixa.gov.br, vem, por intermédio de seu advogado signatario,
conforme instru

In [31]:
%%time
# Predict the label for the text extracted in the previous cell;
# predict takes a list of documents, hence the one-element list and [0].
clf.predict([text])[0]

CPU times: user 6.89 ms, sys: 0 ns, total: 6.89 ms
Wall time: 6.15 ms


50

In [32]:
# Manually correct the label of one expediente; this mutates bd in place,
# so re-running earlier cells that display bd will show the corrected value.
# NOTE(review): the label is passed as the string '50' — confirm this matches
# the type of the other values stored in the acao column.
correct_classification('50', 'acao', '02.000.00136/2017',bd)
bd.head()

Unnamed: 0_level_0,acao,text
expediente,Unnamed: 1_level_1,Unnamed: 2_level_1
02.000.00030/2017,9,PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92.1...
02.000.00035/2017,50,"\n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n \..."
02.000.00136/2017,50,\n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FEDER...
02.000.00145/2017,43,03/02/2017Número: 1000055-73.2017.4.01.3200 ...
02.000.00186/2017,50,\n\nVara Unica\nSSJ Tefé\nFi.\n\nRubrica\n\n ...


In [33]:
%%time
# Refit clf on the training split of the corrected dataset and print the
# classification report and confusion matrix for the test split.
retrain_model(clf, bd)

             precision    recall  f1-score   support

          1       1.00      1.00      1.00         2
         10       1.00      0.25      0.40         4
         14       0.00      0.00      0.00         5
         22       0.00      0.00      0.00         1
         28       0.00      0.00      0.00         2
         43       0.80      0.67      0.73         6
         47       0.44      0.53      0.48        15
         50       0.81      0.95      0.87        76
         60       0.00      0.00      0.00         1
         67       0.78      0.64      0.70        11
        227       0.67      1.00      0.80         2
        237       0.60      0.43      0.50         7

avg / total       0.70      0.75      0.71       132

[[ 2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  1  1  0  1  0  0]
 [ 0  0  0  0  0  0  1  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  1  0  0  0  0]
 [ 0  0  0  0  0  4  2  0  0  0  0  0]
 [ 0  0  0  0  0  1  