In [1]:
from __future__ import print_function
from __future__ import division
from keras.models import model_from_json
from keras.preprocessing import sequence
from lib import words as wd
from lib import embedding as em
from lib import text_extraction as te
from lib import predict_pdf as pp
from lib import extract_parts as ep
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import glob
import json
import os
import re

# Apply matplotlib's 'ggplot' style sheet to every figure drawn later in the notebook.
matplotlib.style.use('ggplot')
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Label names; 'non_riconducibile' is the one the printing cells further down skip.
# NOTE(review): `labels` is not referenced again in the visible cells, and whether
# its order has to match the model's output columns cannot be told from here.
labels = ['poteri', 'assemblea', 'clausola', 'non_riconducibile', 'scadenza']

# Build the parts extractor from three files, passed in this order:
# a model JSON, an HDF5 weights file and a word-dictionary JSON.
pe = ep.PartsExtraction.load_from_files('../models/extraction_model_30_all.json',
                                     '../models/extraction_weights_30_all.h5',
                                     '../dictionaries/first_5000_words_extraction.json')

In [3]:
# PDF analysed by the following cells.
# Alternative input: '../extraction/files_to_label/4907913200001.pdf'
document = '../extraction/esempi_descrizioni/5115612230001.pdf'

# Extract the text with OCR switched off.
# NOTE(review): pages=-1 presumably means "all pages" -- confirm in lib.text_extraction.
txt = te.extract_text(document, do_ocr=False, pages=-1)
# Split the text into the sentence list that every later cell works on.
sentences = wd.sentences_doc(txt, rep=' ', newline=True)

In [4]:
# Labels without post-processing. They get their own name because the next cell
# assigns `predictions` again (post_process=True); sharing the name made this
# result unreachable as soon as that cell ran.
predictions_raw = pe.extract_parts(sentences, post_process=False)

In [5]:
# One label per sentence, with post-processing enabled; the cell below prints
# these labels next to their sentences.
predictions = pe.extract_parts(sentences, post_process=True)
# Debug aid: uncomment to list (index, label) pairs.
#list(enumerate(predictions))

In [6]:
# Show every sentence under a header carrying its index and predicted label.
for idx, pair in enumerate(zip(predictions, sentences)):
    label, sentence = pair
    header = '\n[{}] PREDICTION: {}\n'.format(idx, label)
    print(header)
    print(sentence.strip())
    


[0] PREDICTION: non_riconducibile

Repertorio n 9116 Raccolta n 4684 COSTITUZIONE DI SOCIETA' A RESPONSABILITA' LIMITATA Repubblica Italiana L'anno duemiladiciassette il giorno sei del mese di luglio in Pistoia e nel mio studio in Corso Silvano Fedi numero 55

[1] PREDICTION: non_riconducibile

Avanti a me Dottor Nicola Ottavi, Notaio in Pistoia, iscritto al Ruolo del Collegio dei Distretti Notarili Riuniti di Firenze Pistoia e Prato, senza l'assistenza dei testimoni, è presente il signor: - BONET MASSIMO, nato a Perugia (PG) il 9 settembre 1966, residente in Perugia (PG), via Francesca Giostrelli n 10, codice fiscale: BNT MSM 66P09 G478T

[2] PREDICTION: non_riconducibile

Detto comparente, cittadino italiano, della cui identità personale, io Notaio sono certo, conviene e stipula quanto segue: A) Viene costituita una società a responsabilità limitata sotto la denominazione sociale "B io srl" società unipersonale

[3] PREDICTION: non_riconducibile

B) La società ha sede nel Comune di 

In [None]:
# Mapping consumed by the next cell: label -> iterable of sentences.
# NOTE(review): presumably groups `sentences` by their entry in `predictions` -- confirm in lib.extract_parts.
d = pe.extract_parts_dict(sentences, predictions)

In [None]:
# Print each extracted part under a dotted banner, leaving out the catch-all label.
for label, label_sentences in d.items():
    if label == 'non_riconducibile':
        continue
    banner = label.upper().center(127, '.')
    print('\n\n' + banner + '\n')
    stripped = [sentence.strip() for sentence in label_sentences]
    print('.\n\n'.join(stripped))

In [None]:
# Same extraction as above, done step by step.
# Per-sentence scores from the model; the cell further down iterates over them
# and formats each entry as a row of floats.
probas = pe.extract_parts_prob(sentences)
# Post-processed labels, passing in the scores computed on the previous line.
# NOTE(review): presumably this avoids a second model pass -- confirm.
predictions = pe.extract_parts(sentences, post_process=True, probas=probas)
# Built from the labels alone, without the sentences.
dict_indexes = pe.extract_parts_dict_indexes(predictions)
# Combine the index mapping with `sentences`; the next cell iterates the result
# as label -> iterable of sentence strings.
dict_sentences = ep.dict_indexes_to_sentences(sentences, dict_indexes)

In [None]:
# Print every part except the catch-all label, each under a dotted banner.
relevant_parts = [(name, group) for name, group in dict_sentences.items()
                  if name != 'non_riconducibile']
for name, group in relevant_parts:
    title = name.upper().center(127, '.')
    print('\n\n' + title + '\n')
    body = '.\n\n'.join([sentence.strip() for sentence in group])
    print(body)

## Threshold sensato

In [None]:
def get_thres_nl(txt, is_valid=None, step=0.001, min_thres=0.0):
    """Return the highest threshold, scanning down from 1, that `txt` fails.

    The threshold starts at 1 and is lowered by `step` for as long as the
    validity check accepts `txt`. The first rejected threshold is returned.

    Parameters
    ----------
    txt : str
        Text handed unchanged to the validity check.
    is_valid : callable, optional
        Predicate called as ``is_valid(txt, threshold)``. When omitted, the
        notebook-level ``is_valid_nl`` is used, exactly as before.
        NOTE(review): ``is_valid_nl`` is not defined in the visible notebook;
        it has to be in the kernel namespace before the default path is used.
    step : float, optional
        Amount the threshold is lowered per iteration (default 0.001).
    min_thres : float, optional
        Lower bound of the scan (default 0.0). If the check still accepts the
        text at this bound, the bound itself is returned. Without it the loop
        would never end for a text the check accepts at every threshold.

    Returns
    -------
    number
        1 when the text is rejected straight away, otherwise a float in
        [min_thres, 1).

    Raises
    ------
    ValueError
        If `step` is not strictly positive.
    """
    if step <= 0:
        raise ValueError('step must be strictly positive')

    # Resolve the global lazily so that an injected predicate works even when
    # is_valid_nl has not been defined.
    checker = is_valid_nl if is_valid is None else is_valid

    t = 1
    n_steps = 0
    while t > min_thres and checker(txt, t):
        n_steps += 1
        # Derive t from the step count rather than repeatedly subtracting, so
        # rounding error does not build up over hundreds of iterations.
        t = 1 - n_steps * step
    return max(t, min_thres)

In [None]:
# Read the sample text inside a context manager so the file handle is closed
# right away instead of being left for the garbage collector.
with open('../extraction/esempi_extracted/5116982390001.txt') as txt_file:
    txt_brutto = txt_file.read()
print(get_thres_nl(txt_brutto))

In [None]:
# Print the threshold for the first 100 text files.
# glob returns paths in arbitrary order, so they are sorted first to make the
# 100-file sample identical from run to run.
for f in sorted(glob.glob('../atti_costitutivi_txt/*'))[:100]:
    # Close each file as soon as it has been read.
    with open(f) as text_file:
        text = text_file.read()
    print(f, get_thres_nl(text))

In [None]:
# One threshold per file in the corpus directory. A plain loop is used instead
# of a comprehension so each file can be closed as soon as it has been read.
thress = []
for path in glob.glob('../atti_costitutivi_txt/*'):
    with open(path) as text_file:
        thress.append(get_thres_nl(text_file.read()))

In [None]:
# Histogram of the per-file thresholds, drawn on an explicit Axes and labelled
# so the figure can be read on its own; plt.show() keeps the repr of the
# hist() return value out of the cell output.
fig, ax = plt.subplots()
ax.hist(thress, bins=50)
ax.set_title('Distribution of get_thres_nl thresholds')
ax.set_xlabel('threshold')
ax.set_ylabel('number of files')
plt.show()

In [None]:
# Print one tab-separated row of scores per sentence.
# The previous format string had exactly four '{:4f}' placeholders: str.format
# silently drops extra positional values, and '4f' is a minimum width (which
# has no effect on a float printed with six decimals) rather than a precision.
# Every value in the row is now printed, with four decimals.
for row in pe.extract_parts_prob(sentences):
    print('\t'.join('{:.4f}'.format(value) for value in row))

## All together

In [2]:
# Objects needed by the end-to-end predictor, each loaded by its own module helper.
models = pp.load_models()
name_extractor = ep.NotaioNameExtractor.load_from_file()
# Same three files as the PartsExtraction load near the top of the notebook.
# It is repeated here, so this section does not need that earlier cell to have run.
pe = ep.PartsExtraction.load_from_files('../models/extraction_model_30_all.json',
                                     '../models/extraction_weights_30_all.h5',
                                     '../dictionaries/first_5000_words_extraction.json')

# Bundle the three pieces into the single object used by the cells below.
pred_extract = ep.PredictorExtractor(models, pe, name_extractor)

In [3]:
# Run the full pipeline on the sample PDF. The bare call on the last line is what
# the cell displays: a JSON string whose recorded output has the keys
# "confidenza", "sensato", "classe", "nome notaio", "parti" and "frasi".
document = '../extraction/esempi_descrizioni/5115612230001.pdf'
pred_extract.predict_extract_pdf_json(document)

'{"confidenza": 0.9985241293907166, "sensato": true, "classe": "costitutivo", "nome notaio": "Nicola Ottavi", "parti": {"clausola": [4, 52, 53, 54, 55, 56, 57, 59, 60, 129, 136], "assemblea": [63, 68, 69, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 110, 122, 128, 137], "scadenza": [25], "poteri": [20, 21, 22, 23, 24, 26, 43, 49, 58, 66, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 101, 102, 111, 112, 113, 114, 115, 116, 117, 118, 119, 123], "non_riconducibile": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 61, 62, 64, 65, 67, 70, 77, 95, 96, 97, 98, 99, 100, 103, 104, 105, 106, 107, 108, 109, 120, 121, 124, 125, 126, 127, 130, 131, 132, 133, 134, 135, 138, 139, 140, 141, 142, 143, 144]}, "frasi": [{"frase": "Repertorio n 9116 Raccolta n 4684 COSTITUZIONE DI SOCIETA\' A RESPONSABILITA\' LIMITATA Repubblica Italiana L\'anno duemiladiciassette il giorno sei del mese di lugl

In [None]:
# Delete the test upload. os.remove raises when the path does not exist, so the
# call is guarded; the cell can then be re-run (or run on a fresh checkout)
# without failing.
uploaded_pdf = '../test_upload/uuuuu.pdf'
if os.path.exists(uploaded_pdf):
    os.remove(uploaded_pdf)

In [6]:
# Same pipeline on a different PDF. The recorded output has only "confidenza"
# and "classe" ("non costitutivo"), with no "parti" or "frasi" entries.
pred_extract.predict_extract_pdf_json('../test_upload/T3LAB-INJENIA_Analisi Preliminare.pdf')

'{"confidenza": 0.004562266170978546, "classe": "non costitutivo"}'