# Demo

###### Imports

In [1]:
# This Python file uses the following encoding: utf-8
from lib.predict_pdf import predict_documents_pdf, predict_documents_str_we, predictions_dataframe, load_prediction_models, predict_documents_pdf_we, load_models_we
from lib import predict_pdf as pp
from lib.utils import move_flattened_files
from lib.untar import ExtractNested
from functools import partial
from lib.utils import download_from_storage_if_not_present
from lib import extract_parts as ep
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import gridspec
import json
import os
matplotlib.style.use('ggplot')
#%load_ext line_profiler

Using TensorFlow backend.


###### Input arguments

In [2]:
pdf_folder = '../files_to_predict'
csv_out_file = '../predictions2.csv'

do_ocr = False
extract_from_tar = False
tar_root = '../prova.tar.gz'
use_a05_model = True
use_we_model = True #overrides use_a05_model

###### Tar extraction (execute only the first time if starting from a tar)

In [3]:
# Accetto solo file con dimensione < 1MB e che finiscono per 001.pdf (si può cambiare a piacere il filtro qui)
def file_filter(f):
    return str(f).endswith(u'001.pdf') and os.path.getsize(str(f))<(1024**2)

In [4]:
# Solo se si deve partire da una tar e non si è già estratto
if extract_from_tar and not os.path.exists(tar_root[:-7]):
    ExtractNested(tar_root)
    move_flattened_files(tar_root[:-7], pdf_folder, file_filter)

###### Load models

In [5]:
# Download resources if not found
with open("gs_resource_map.json") as f:
    gs_map = json.load(f)

for k,v in gs_map.items():
    download_from_storage_if_not_present("infocamere-poc", v, k)

In [6]:
models_original = {'gensim_file':'../models/gensim_model_5000.d2v', 
                  'keras_model_file':'../models/keras_model.json',
                  'keras_weights_file':'../models/keras_weights_5000.h5',
                  'permitted_words_file':'../dictionaries/first_5000_words.json'}

model_with_a05 = {'gensim_file':'../models/gensim_5000_model_with_verb.d2v', 
                  'keras_model_file':'../models/keras_model_retry3.json',
                  'keras_weights_file':'../models/keras_weights_verb_retry3.h5',
                  'permitted_words_file':'../dictionaries/first_5000_words_with_verb_cost.json'}

model_we = {'keras_model_file':'../models/keras_model_word_embedding.json',
            'keras_weights_file':'../models/keras_weights_word_embedding.h5',
            'reduced_dictionary_file':'../dictionaries/reduced_dictionary_cost.json'}

if use_we_model:
    prediction_fn = pp.predict_document_str_we
    loaded_models = load_models_we(**model_we)
    predict_pdfs = partial(predict_documents_pdf_we, **loaded_models)
else:
    prediction_fn = pp.predict_document_str
    models_demo = model_with_a05 if use_a05_model else models_original
    predict_pdfs = load_prediction_models(**models_demo)

### Prediction

In [7]:
pdf_names = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith("pdf")]
#pdf_names = ['../files_to_predict/4908476390001.pdf']
predictions, filtered_filenames = predict_pdfs(pdf_names, do_ocr=do_ocr)
#%lprun -f predict_documents_pdf_we predict_documents_pdf_we(pdf_names, do_ocr=do_ocr, **loaded_models)
#%timeit predict_pdfs(pdf_names, do_ocr=do_ocr)

#### Save predictions to csv

In [8]:
def basenames(files):
    return [os.path.basename(f) for f in files]

In [9]:
df = predictions_dataframe(basenames(pdf_names), basenames(filtered_filenames), predictions, csv_out_file)
#df.to_csv(csv_out_file,  sep=',', index=False, encoding='utf-8')

In [16]:
df.sort_values("Nome file")

Unnamed: 0,Nome file,Errore,Messaggio errore,Output rete,Predizione
11,4916506210001.pdf,No,,0.926951,COSTITUTIVO
0,4950726610001.pdf,No,,0.989937,COSTITUTIVO
14,4960295080001.pdf,No,,0.987135,COSTITUTIVO
10,4963632530001.pdf,No,,0.987968,COSTITUTIVO
1,4966811780001.pdf,No,,0.922097,COSTITUTIVO
19,4984958010001.pdf,Si,scansione,,
15,4993074850001.pdf,No,,0.965589,COSTITUTIVO
2,5005850400001.pdf,No,,0.991594,COSTITUTIVO
7,5049727800001.pdf,No,,0.988253,COSTITUTIVO
13,5061668290001.pdf,No,,0.981479,COSTITUTIVO


### Sentence Extraction

In [11]:
extraction_models = {
    'keras_model_filename':'../models/extraction_model_30_all.json',
    'keras_weights_filename':'../models/extraction_weights_30_all.h5',
    'reduced_dict_filename':'../dictionaries/first_5000_words_extraction.json'
}

pe = ep.PartsExtraction.load_from_files(**extraction_models)
name_extractor = ep.NotaioNameExtractor.load_from_file()
extractor = ep.PredictorExtractor(prediction_fn, loaded_models, pe, name_extractor)

In [17]:
data = extractor.extract_parts_pdf(pdf_names[0])

In [18]:
pd.DataFrame(data, columns=["frase", "classe", "poteri", "assemblea", "clausola", "non_riconducibile"])

Unnamed: 0,frase,classe,poteri,assemblea,clausola,non_riconducibile
0,"NOTAIO TOMMASO ORSINI Roma, Via Clitunno n 22/...",non_riconducibile,0.0004,5.7e-05,5.9e-05,0.999483
1,"Davanti a me Dottor TOMMASO ORSINI, Notaio re...",non_riconducibile,0.000427,0.0001,9.2e-05,0.999382
2,"2) PALMA ISOLA, nata a Roma (RM) il 7 aprile ...",non_riconducibile,0.000669,0.000112,6.3e-05,0.999156
3,"Detti comparenti, cittadini italiani, della c...",non_riconducibile,0.000222,0.000286,0.000264,0.999228
4,"Le finalità, la durata, l'organizzazione ed i...",non_riconducibile,0.000158,6.8e-05,0.000311,0.999464
5,= ARTICOLO 2 = CAPITALE SOCIALE - QUOTE Il ca...,non_riconducibile,0.000171,0.000116,8.3e-05,0.999629
6,"= PALMA ISOLA una quota di nominali Euro 100,...",non_riconducibile,0.000295,0.000255,8e-05,0.99937
7,Agli effetti dell'articolo 2464 del Codice Ci...,non_riconducibile,8.9e-05,3.6e-05,3.2e-05,0.999843
8,"- la signora PALMA ISOLA, la somma di Euro 25...",non_riconducibile,0.000135,3.4e-05,1.4e-05,0.999818
9,Quanto al residuo 75% (settantacinque per cen...,non_riconducibile,0.000864,0.000167,0.000795,0.998174
