# Demo

###### Imports

In [13]:
# This Python file uses the following encoding: utf-8
from lib.words import reduce_dictionary, sentence_vector, tokenize_doc
from keras.models import model_from_json
from keras.preprocessing import sequence
from lib.untar import ExtractNested
from gensim.models import Doc2Vec
import lib.text_extraction as te
from functools import partial
from collections import OrderedDict
import pandas as pd
import json
import os

###### Functions

In [14]:
# Accept only files smaller than the size limit (1 MB by default).
def file_filter(f, max_bytes=1024 ** 2):
    """Tell whether the file `f` is small enough to be processed.

    Parameters
    ----------
    f : path-like object exposing ``absolute()`` (e.g. a ``pathlib.Path``)
        File to check; its size is read from disk.
    max_bytes : int, optional
        Exclusive upper bound on the file size in bytes. Defaults to
        ``1024 ** 2``, the limit that was previously hard-coded.

    Returns
    -------
    bool
        True when the size on disk is strictly below `max_bytes`.
    """
    return os.path.getsize(str(f.absolute())) < max_bytes

def embed_document(model, doc, permitted_words):
    """Map every sentence of `doc` to its vector.

    Each sentence is passed to ``sentence_vector`` together with `model` and
    `permitted_words`; the results are returned as a list in document order.
    """
    vectors = []
    for sentence in doc:
        vectors.append(sentence_vector(model, sentence, permitted_words))
    return vectors

def predict_documents_str(filenames, txts, gensim_model, keras_model, permitted_words):
    """Run the classifier on already-extracted document texts.

    Parameters
    ----------
    filenames : iterable
        Name of each document, parallel to `txts`.
    txts : iterable
        Text of each document; ``None`` or empty entries are skipped.
    gensim_model, keras_model, permitted_words
        Embedding model, classifier and allowed vocabulary, forwarded to the
        tokenising / embedding helpers and to the network.

    Returns
    -------
    (list, list)
        ``(predictions, filtered_filenames)``: the first component of the
        network output for every document that had text, and the matching
        file names in the same order. Both lists are empty when no document
        has usable text.
    """
    # Filter names and texts in a single pass so the two lists always stay aligned.
    kept = [(name, txt) for name, txt in zip(filenames, txts)
            if txt is not None and len(txt) > 0]
    if not kept:
        # Nothing to classify: skip padding/prediction on an empty batch.
        return [], []

    filtered_filenames = [name for name, _ in kept]
    not_empty_txts = [txt for _, txt in kept]

    # Lazy pipeline: tokenise, restrict to the permitted vocabulary, then embed.
    splitted_txts = (tokenize_doc(txt) for txt in not_empty_txts)
    filtered_txts = (list(reduce_dictionary(document, permitted_words)) for document in splitted_txts)
    embedded_txts = [embed_document(gensim_model, doc, permitted_words) for doc in filtered_txts]

    # Every document becomes exactly 200 rows: short ones are zero-padded at the
    # front, long ones are cut at the end.
    padded_data = sequence.pad_sequences(embedded_txts, maxlen=200, padding="pre", truncating="post", value=0.0, dtype='float32')

    # NOTE(review): predict_proba may not exist on newer Keras models - confirm against the installed version.
    probs = keras_model.predict_proba(padded_data, verbose=0)
    return [prob[0] for prob in probs], filtered_filenames

def predict_documents_pdf(filenames, gensim_model, keras_model, permitted_words, do_ocr=False):
    """Extract the text of each file with ``te.extract_text`` and classify it.

    `do_ocr` is forwarded unchanged to ``te.extract_text``. The return value
    is whatever ``predict_documents_str`` returns for the extracted texts.
    """
    extracted = []
    for path in filenames:
        extracted.append(te.extract_text(path, do_ocr))
    return predict_documents_str(filenames, extracted, gensim_model, keras_model, permitted_words)

def predict_documents_txt(filenames, gensim_model, keras_model, permitted_words):
    """Read each file as plain text and classify it.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the text files to read.
    gensim_model, keras_model, permitted_words
        Forwarded unchanged to ``predict_documents_str``.

    Returns
    -------
    Whatever ``predict_documents_str`` returns for the file contents.
    """
    txts = []
    for filename in filenames:
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving it open until garbage collection.
        with open(filename) as handle:
            txts.append(handle.read())
    return predict_documents_str(filenames, txts, gensim_model, keras_model, permitted_words)

###### Loading models

In [3]:
# Everything the predictor needs besides the file list. The keys match the
# keyword-argument names of predict_documents_pdf so the dict can be unpacked
# into partial() at the end of this cell.
models = {}
models['gensim_model'] = Doc2Vec.load('models/gensim_model_5000.d2v')

# The Keras network is restored from two files: a JSON description read here,
# then the weights loaded from the .h5 file.
with open('models/keras_model.json') as f:
    models['keras_model'] = model_from_json(f.read())
models['keras_model'].load_weights("models/keras_weights_5000.h5")

# Allowed vocabulary, stored as a set.
# NOTE(review): presumably the 5000 most frequent words - confirm.
with open('first_5000_words.json') as f:
    models['permitted_words'] = set(json.load(f))
    
# predict_pdfs(filenames) now only needs the list of files.
predict_pdfs = partial(predict_documents_pdf, **models)

###### Input arguments

In [15]:
# Folder listed for the files to classify, and path of the CSV the results are written to.
pdf_folder = '../files_to_predict'
csv_out_file = '../predictions2.csv'

# Class names, indexed by the rounded network output.
labels_map = ['NON COSTITUTIVO', 'COSTITUTIVO']

###### Tar extraction (execute only the first time if starting from a tar)

In [4]:
# Only needed when starting from a tar archive
tar_root = '../prova.tar.gz'
ExtractNested(tar_root)

In [12]:
# NOTE(review): move_flattened_files is not imported in any visible cell, so this
# fails on a fresh kernel - presumably it lives in lib.untar; confirm and import it.
# tar_root[:-7] drops the 7-character '.tar.gz' suffix from the archive path.
move_flattened_files(tar_root[:-7], pdf_folder, file_filter)

### Prediction

In [16]:
# Path of every entry found in pdf_folder (no filtering by extension is applied here).
pdf_names = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder)]

In [17]:
# predictions[i] is the network output for filtered_filenames[i]; files whose
# extracted text is None or empty are left out of both lists.
predictions, filtered_filenames = predict_pdfs(pdf_names)

../files_to_predict/4908476390001.pdf è una scansione
../files_to_predict/4908921960001.pdf è una scansione
../files_to_predict/4909311650001.pdf è una scansione
../files_to_predict/4908983690001.pdf è una scansione
../files_to_predict/4138282530001.pdf è una scansione
../files_to_predict/4137960290001.pdf è una scansione
../files_to_predict/4908312250001.pdf è una scansione
../files_to_predict/4907849390001.pdf è una scansione
../files_to_predict/4138456320001.pdf è una scansione
../files_to_predict/4137265570001.pdf è una scansione
../files_to_predict/4908190400001.pdf è una scansione
../files_to_predict/4908086720001.pdf è una scansione
../files_to_predict/4908717640001.pdf è una scansione
../files_to_predict/4137094820001.pdf è una scansione
../files_to_predict/4908064870001.pdf è una scansione
../files_to_predict/4138180200001.pdf è una scansione
../files_to_predict/4136985860001.pdf è una scansione
../files_to_predict/4138333080001.pdf è una scansione
../files_to_predict/41383147

#### Show predictions

In [18]:
# One line per classified file: name (cut/padded to 50 characters), label, raw score.
for name, pred in zip(filtered_filenames, predictions):
    # A single pre-formatted string prints the same text as the old Python 2
    # print statement and is also valid syntax on Python 3.
    print('%s \t-\t%s  (%f)' % (os.path.basename(name)[:50].ljust(50),
                                 labels_map[int(round(pred))],
                                 pred))

T3LAB-INJENIA_Analisi Preliminare.pdf              	-	NON COSTITUTIVO  (0.108916)
5 Planning Robot.pdf                               	-	NON COSTITUTIVO  (0.017676)
5122462300001.pdf                                  	-	COSTITUTIVO  (0.995220)


#### Save predictions to csv

In [19]:
def basenames(files):
    """Return the final path component of every entry in `files`, in order."""
    return list(map(os.path.basename, files))

def predictions_dataframe(pdf_names, filtered_filenames, predictions, csv_out_file):
    """Build the report table: classified files first, then the unclassified ones.

    Parameters
    ----------
    pdf_names : list
        Every file that was submitted.
    filtered_filenames : list
        The files that received a prediction, parallel to `predictions`.
    predictions : list
        Network output per classified file; each value is rounded and looked
        up in the notebook-level ``labels_map`` to obtain the label.
    csv_out_file
        Accepted for interface compatibility; not used in the body.

    Returns
    -------
    pandas.DataFrame
        Columns 'Nome file', 'Errore', 'Messaggio errore', 'Output rete' and
        'Predizione'. Files in `pdf_names` without a prediction are flagged
        with 'Si' and the message 'scansione', with blank output and label.
    """
    predicted = set(filtered_filenames)

    labels = []
    for score in predictions:
        labels.append(labels_map[int(round(score))])

    # Anything submitted but not classified becomes an error row.
    failed = [name for name in pdf_names if name not in predicted]

    n_ok = len(filtered_filenames)
    n_failed = len(failed)
    blank_ok = [u''] * n_ok
    blank_failed = [u''] * n_failed

    # Insertion order fixes the column order of the resulting frame.
    columns = OrderedDict()
    columns['Nome file'] = filtered_filenames + failed
    columns['Errore'] = ['No'] * n_ok + ['Si'] * n_failed
    columns['Messaggio errore'] = blank_ok + [u'scansione'] * n_failed
    columns['Output rete'] = predictions + blank_failed
    columns['Predizione'] = labels + blank_failed

    return pd.DataFrame(columns)

In [20]:
# Base names are passed for both lists, so the 'Nome file' column carries no folder prefix.
df = predictions_dataframe(basenames(pdf_names), basenames(filtered_filenames), predictions, csv_out_file)
# Write the table as comma-separated UTF-8 without the index column.
df.to_csv(csv_out_file,  sep=',', index=False, encoding='utf-8')

In [21]:
# Display the table that was just written to csv_out_file.
df

Unnamed: 0,Nome file,Errore,Messaggio errore,Output rete,Predizione
0,T3LAB-INJENIA_Analisi Preliminare.pdf,No,,0.108916,NON COSTITUTIVO
1,5 Planning Robot.pdf,No,,0.0176762,NON COSTITUTIVO
2,5122462300001.pdf,No,,0.99522,COSTITUTIVO
3,4908476390001.pdf,Si,scansione,,
4,4908921960001.pdf,Si,scansione,,
5,4909311650001.pdf,Si,scansione,,
6,4908983690001.pdf,Si,scansione,,
7,4138282530001.pdf,Si,scansione,,
8,4137960290001.pdf,Si,scansione,,
9,4908312250001.pdf,Si,scansione,,
