# Demo

###### Imports

In [1]:
# This Python file uses the following encoding: utf-8
from lib.predict_pdf import predict_documents_pdf, predict_documents_str_we, predictions_dataframe, load_prediction_models, predict_documents_pdf_we, load_models, load_models_we
from lib import predict_pdf as pp
from lib.utils import move_flattened_files
from lib.untar import ExtractNested
from functools import partial
from lib.utils import download_from_storage_if_not_present
from lib import extract_parts as ep
import pandas as pd
import json
import os


Using TensorFlow backend.


###### Input arguments

In [2]:
pdf_folder = '../files_to_predict'
csv_out_file = '../predictions2.csv'

do_ocr = False
extract_from_tar = False
tar_root = '../prova.tar.gz'
use_a05_model = True
use_we_model = True #overrides use_a05_model

###### Tar extraction (execute only the first time if starting from a tar)

In [3]:
# Accetto solo file con dimensione < 1MB e che finiscono per 001.pdf (si può cambiare a piacere il filtro qui)
def file_filter(f):
    return str(f).endswith(u'001.pdf') and os.path.getsize(str(f))<(1024**2)

In [4]:
# Solo se si deve partire da una tar e non si è già estratto
if extract_from_tar and not os.path.exists(tar_root[:-7]):
    ExtractNested(tar_root)
    move_flattened_files(tar_root[:-7], pdf_folder, file_filter)

###### Load models

In [5]:
# Download resources if not found
with open("gs_resource_map.json") as f:
    gs_map = json.load(f)

for k,v in gs_map.items():
    download_from_storage_if_not_present("infocamere-poc", v, k)

In [6]:
models_original = {'gensim_file':'../models/gensim_model_5000.d2v', 
                  'keras_model_file':'../models/keras_model.json',
                  'keras_weights_file':'../models/keras_weights_5000.h5',
                  'permitted_words_file':'../dictionaries/first_5000_words.json'}

model_with_a05 = {'gensim_file':'../models/gensim_5000_model_with_verb.d2v', 
                  'keras_model_file':'../models/keras_model_retry3.json',
                  'keras_weights_file':'../models/keras_weights_verb_retry3.h5',
                  'permitted_words_file':'../dictionaries/first_5000_words_with_verb_cost.json'}

model_we = {'keras_model_file':'../models/keras_model_word_embedding.json',
            'keras_weights_file':'../models/keras_weights_word_embedding.h5',
            'reduced_dictionary_file':'../dictionaries/reduced_dictionary_cost.json'}

if use_we_model:
    prediction_fn = pp.predict_document_str_we
    loaded_models = load_models_we(**model_we)
    predict_pdfs = partial(predict_documents_pdf_we, **loaded_models)
else:
    prediction_fn = pp.predict_document_str
    models_demo = model_with_a05 if use_a05_model else models_original
    loaded_models = load_models(**models_demo)
    predict_pdfs = partial(predict_documents_pdf, **loaded_models)
    #predict_pdfs = load_prediction_models(**models_demo)

### Prediction

In [7]:
pdf_names = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith("pdf")]
#pdf_names = ['../files_to_predict/4908476390001.pdf']
predictions, filtered_filenames = predict_pdfs(pdf_names, do_ocr=do_ocr)
#%lprun -f predict_documents_pdf_we predict_documents_pdf_we(pdf_names, do_ocr=do_ocr, **loaded_models)
#%timeit predict_pdfs(pdf_names, do_ocr=do_ocr)

#### Save predictions to csv

In [8]:
def basenames(files):
    return [os.path.basename(f) for f in files]

In [9]:
df = predictions_dataframe(basenames(pdf_names), basenames(filtered_filenames), predictions, csv_out_file)
#df.to_csv(csv_out_file,  sep=',', index=False, encoding='utf-8')

In [10]:
df.sort_values("Nome file")

Unnamed: 0,Nome file,Errore,Messaggio errore,Output rete,Predizione
11,4916506210001.pdf,No,,0.926951,COSTITUTIVO
0,4950726610001.pdf,No,,0.989937,COSTITUTIVO
14,4960295080001.pdf,No,,0.987135,COSTITUTIVO
10,4963632530001.pdf,No,,0.987968,COSTITUTIVO
1,4966811780001.pdf,No,,0.922097,COSTITUTIVO
19,4984958010001.pdf,Si,scansione,,
15,4993074850001.pdf,No,,0.965589,COSTITUTIVO
2,5005850400001.pdf,No,,0.991594,COSTITUTIVO
7,5049727800001.pdf,No,,0.988253,COSTITUTIVO
13,5061668290001.pdf,No,,0.981479,COSTITUTIVO


### Sentence Extraction

In [11]:
extraction_models = {
    'keras_model_filename':'../models/extraction_model_30_all.json',
    'keras_weights_filename':'../models/extraction_weights_30_all.h5',
    'reduced_dict_filename':'../dictionaries/first_5000_words_extraction.json'
}

pe = ep.PartsExtraction.load_from_files(**extraction_models)
#name_extractor = ep.NotaioNameExtractor.load_from_file()
#extractor = ep.PredictorExtractor(prediction_fn, loaded_models, pe, name_extractor)

In [12]:
data = pe.extract_parts_pdf(pdf_names[19])
#data = extractor.extract_parts_pdf(pdf_names[19])

In [13]:
extracted = pd.DataFrame(data, columns=["frase", "classe", "poteri", "assemblea", "clausola", "non_riconducibile"])
extracted

Unnamed: 0,frase,classe,poteri,assemblea,clausola,non_riconducibile
0,"ALLEGATO ""A"" ALL'ATTO N 32 461/11 872 STATUTO ...",non_riconducibile,0.000038,0.000009,0.000006,0.999946
1,Art 2 Oggetto sociale ART 4 - La società ha p...,non_riconducibile,0.000228,0.000145,0.001703,0.997923
2,- l esercizio di attività di consulenza e di ...,non_riconducibile,0.000090,0.000043,0.000034,0.999833
3,"- lo studio, la realizzazione e l erogazione ...",non_riconducibile,0.000059,0.000017,0.000012,0.999911
4,- lo svolgimento di tutte le attività di assi...,non_riconducibile,0.000207,0.000048,0.000040,0.999705
5,lo svolgimento di ordinaria consulenza e gest...,non_riconducibile,0.001294,0.000302,0.000149,0.998255
6,- lo studio e l organizzazione di progetti ed...,non_riconducibile,0.000701,0.000085,0.000056,0.999158
7,- l assistenza alle imprese nei rapporti con ...,non_riconducibile,0.000288,0.000165,0.000225,0.999322
8,- la prestazione di servizi diretti ad Enti p...,non_riconducibile,0.000190,0.000054,0.000034,0.999722
9,la diffusione anche attraverso la vendita per...,non_riconducibile,0.000283,0.000068,0.000047,0.999601


In [14]:
for i,f in enumerate(extracted["frase"]):
    print i, f

0 ALLEGATO "A" ALL'ATTO N 32 461/11 872 STATUTO DENOMINAZIONE - OGGETTO - SEDE DURATA Art 1 Denominazione E' costituita una società a responsabilità limitata denominata "INTERNATIONAL TRADE AND CONSULTING S R L ", con sigla abbreviata "ITAC SRL"
1  Art 2 Oggetto sociale ART 4 - La società ha per oggetto: - l esercizio di intermediazioni sotto qualsiasi forma in ambito assicurativo e commerciale con esclusione della intermediazione mobiliare nei confronti del pubblico nonché relativamente ai beni prodotti o servizi direttamente commercializzati dalla società
2  - l esercizio di attività di consulenza e di supporto alle aziende ed ai privati nelle aree assicurativa, commerciale, logistica e dell organizzazione, come pure ogni altra attività affine, accessoria e complementare
3  - lo studio, la realizzazione e l erogazione di servizi connessi alla produzione, sviluppo e gestione di strutture aggregate e non, destinate all esercizio di attività commerciale e/o professionali
4  - lo svolgim