# Demo

###### Imports

In [1]:
# This Python file uses the following encoding: utf-8
from lib.predict_pdf import predict_documents_pdf, predict_documents_str_we, predictions_dataframe, load_prediction_models, predict_documents_pdf_we, load_models_we
from lib.utils import move_flattened_files
from lib.untar import ExtractNested
from functools import partial
from lib.utils import download_from_storage_if_not_present
import json
import os

#%load_ext line_profiler

Using TensorFlow backend.


###### Input arguments

In [2]:
# --- Input / output locations ---
# Folder that is scanned for the PDF files to classify.
pdf_folder = '../files_to_predict'
# Path handed to predictions_dataframe for the resulting table.
csv_out_file = '../predictions2.csv'

# --- Optional tar extraction ---
# The extraction cell does its work only when this flag is True.
extract_from_tar = False
# Archive to unpack; the extraction cell strips its last 7 characters
# (the ".tar.gz" suffix) to obtain the extracted folder path.
tar_root = '../prova.tar.gz'

# --- Prediction options ---
# Forwarded as the do_ocr keyword argument of the prediction function.
do_ocr = False
# Picks model_with_a05 instead of models_original when the WE model is disabled.
use_a05_model = True
# Takes precedence over use_a05_model: when True the word-embedding model is loaded.
use_we_model = True

###### Tar extraction (run only the first time, when starting from a tar archive)

In [3]:
# Only files smaller than 1 MiB whose name ends with 001.pdf are accepted
# (change the filter here as needed).
def file_filter(f):
    """Tell whether ``f`` passes the filter applied to extracted files.

    Args:
        f: path of the candidate file; it is converted with ``str`` before use.

    Returns:
        bool: True when the path ends with ``001.pdf`` and the file size is
        below ``1024**2`` bytes, False otherwise.
    """
    path = str(f)
    # Check the name first so the size of non-matching paths is never queried.
    if not path.endswith(u'001.pdf'):
        return False
    return os.path.getsize(path) < (1024**2)

In [4]:
# Only needed when starting from a tar archive that has not been extracted yet.
# tar_root[:-7] drops the trailing ".tar.gz"; the resulting path is used both
# as the "already extracted" marker and as the source folder for the move
# (presumably the folder ExtractNested unpacks into -- TODO confirm).
if extract_from_tar:
    if not os.path.exists(tar_root[:-7]):
        ExtractNested(tar_root)
        # Move the files accepted by file_filter into pdf_folder.
        move_flattened_files(tar_root[:-7], pdf_folder, file_filter)

###### Load models

In [5]:
# Download the resources listed in gs_resource_map.json if they are not found.
with open("gs_resource_map.json") as map_file:
    gs_map = json.load(map_file)

# For every entry, the value and then the key of the map are forwarded to the
# download helper after the "infocamere-poc" argument.
for map_key, map_value in gs_map.items():
    download_from_storage_if_not_present("infocamere-poc", map_value, map_key)

In [6]:
# File sets for the available models. The keys of each mapping are unpacked
# as keyword arguments of the corresponding loader function.

# Original model files.
models_original = dict(
    gensim_file='../models/gensim_model_5000.d2v',
    keras_model_file='../models/keras_model.json',
    keras_weights_file='../models/keras_weights_5000.h5',
    permitted_words_file='../dictionaries/first_5000_words.json',
)

# Model files selected when use_a05_model is True.
model_with_a05 = dict(
    gensim_file='../models/gensim_5000_model_with_verb.d2v',
    keras_model_file='../models/keras_model_retry3.json',
    keras_weights_file='../models/keras_weights_verb_retry3.h5',
    permitted_words_file='../dictionaries/first_5000_words_with_verb_cost.json',
)

# Word-embedding model files, selected when use_we_model is True.
model_we = dict(
    keras_model_file='../models/keras_model_word_embedding.json',
    keras_weights_file='../models/keras_weights_word_embedding.h5',
    reduced_dictionary_file='../dictionaries/reduced_dictionary_cost.json',
)

# Build predict_pdfs, the callable invoked by the prediction cell.
# use_we_model wins over use_a05_model.
if not use_we_model:
    # use_a05_model chooses between the two non-WE file sets.
    if use_a05_model:
        models_demo = model_with_a05
    else:
        models_demo = models_original
    predict_pdfs = load_prediction_models(**models_demo)
else:
    loaded_models = load_models_we(**model_we)
    # Bind the loaded models as keyword arguments so predict_pdfs is called
    # the same way regardless of the selected branch.
    predict_pdfs = partial(predict_documents_pdf_we, **loaded_models)

### Prediction

In [7]:
# Collect the PDF files found in pdf_folder.
# - The extension test is case-insensitive and includes the dot, so a name
#   that merely ends with the letters "pdf" (e.g. "notapdf") is skipped.
# - The listing is sorted because os.listdir returns entries in arbitrary
#   order; sorting makes the row order of the results reproducible.
pdf_names = [
    os.path.join(pdf_folder, f)
    for f in sorted(os.listdir(pdf_folder))
    if f.lower().endswith(".pdf")
]
# Debug alternative: run on a single file.
#pdf_names = ['../files_to_predict/4908476390001.pdf']

# Both returned values are later passed to predictions_dataframe.
predictions, filtered_filenames = predict_pdfs(pdf_names, do_ocr=do_ocr)
# Profiling / timing helpers (disabled).
#%lprun -f predict_documents_pdf_we predict_documents_pdf_we(pdf_names, do_ocr=do_ocr, **loaded_models)
#%timeit predict_pdfs(pdf_names, do_ocr=do_ocr)

#### Save predictions to CSV

In [8]:
def basenames(files):
    """Return the final path component of every entry in ``files``.

    Args:
        files: iterable of path strings.

    Returns:
        list: ``os.path.basename`` of each entry, in the same order.
    """
    return list(map(os.path.basename, files))

In [9]:
# Build the results table from bare file names (directories stripped).
# csv_out_file is passed along as well; presumably predictions_dataframe
# writes the CSV itself, which is why the explicit export below stays
# disabled -- TODO confirm.
df = predictions_dataframe(
    basenames(pdf_names),
    basenames(filtered_filenames),
    predictions,
    csv_out_file,
)
#df.to_csv(csv_out_file,  sep=',', index=False, encoding='utf-8')

In [10]:
# Display the results ordered by the "Nome file" column; the sorted frame is
# only shown as the cell output and is not assigned back to df.
df.sort_values(by="Nome file")

Unnamed: 0,Nome file,Errore,Messaggio errore,Output rete,Predizione
11,4916506210001.pdf,No,,0.926951,COSTITUTIVO
0,4950726610001.pdf,No,,0.989937,COSTITUTIVO
14,4960295080001.pdf,No,,0.987135,COSTITUTIVO
10,4963632530001.pdf,No,,0.987968,COSTITUTIVO
1,4966811780001.pdf,No,,0.922097,COSTITUTIVO
19,4984958010001.pdf,Si,scansione,,
15,4993074850001.pdf,No,,0.965589,COSTITUTIVO
2,5005850400001.pdf,No,,0.991594,COSTITUTIVO
7,5049727800001.pdf,No,,0.988253,COSTITUTIVO
13,5061668290001.pdf,No,,0.981479,COSTITUTIVO
