In [6]:
from words import not_so_naive_split, reduce_dictionary, sentence_vector
from keras.models import model_from_json
from keras.preprocessing import sequence
from pretty_testing import predict_test
from gensim.models import Doc2Vec

import numpy as np
import textract
import json
import os

Using TensorFlow backend.


In [7]:
# Class names indexed by the rounded prediction score: the display cells
# look a label up with labels_map[int(round(score))].
labels_map = [
    'NON COSTITUTIVO',
    'COSTITUTIVO',
]

# Sample PDF documents to classify (relative paths).
pdf_names = [
    '../atti_costitutivi/5122462300001.pdf',
    '../T3LAB-INJENIA_Analisi Preliminare.pdf',
    '../5 Planning Robot.pdf',
    '../atti_costitutivi/5122464750001.pdf',
]

In [8]:
# Doc2Vec model handed to sentence_vector() when embedding sentences.
gensim_model = Doc2Vec.load('gensim_model.d2v')

# Keras classifier: the architecture is stored as JSON, the weights
# separately in an HDF5 file.
with open('keras_model.json') as f:
    architecture_json = f.read()
keras_model = model_from_json(architecture_json)
keras_model.load_weights("keras_weights.h5")

# Vocabulary passed to reduce_dictionary() and sentence_vector(); stored as
# JSON and turned into a set here.
with open('reduced_dictionary.json') as f:
    permitted_words = set(json.loads(f.read()))

In [9]:
def embed_document(model, doc, permitted_words):
    """Embed a document sentence by sentence.

    Calls ``sentence_vector(model, sentence, permitted_words)`` once for
    every item of ``doc`` and collects the results in order.

    Args:
        model: Embedding model forwarded unchanged to ``sentence_vector``.
        doc: Iterable of sentences, in whatever representation
            ``sentence_vector`` expects.
        permitted_words: Vocabulary forwarded unchanged to
            ``sentence_vector``.

    Returns:
        A list with one ``sentence_vector`` result per sentence of ``doc``
        (empty when ``doc`` is empty).
    """
    vectors = []
    for sentence in doc:
        vectors.append(sentence_vector(model, sentence, permitted_words))
    return vectors

def predict_documents_str(txts, gensim_model, keras_model, permitted_words, maxlen=200):
    """Score raw document texts with the Keras classifier.

    Pipeline, applied to the whole batch one stage at a time:
      1. split each text with ``not_so_naive_split``;
      2. filter each split document with ``reduce_dictionary`` against
         ``permitted_words``;
      3. embed every remaining sentence with ``embed_document``;
      4. pad/truncate every document to ``maxlen`` sentence vectors;
      5. run ``keras_model.predict_proba`` on the padded batch.

    Args:
        txts: Iterable of document texts.
        gensim_model: Embedding model forwarded to ``embed_document``.
        keras_model: Model exposing ``predict_proba``.
        permitted_words: Vocabulary used for both filtering and embedding.
        maxlen: Number of sentence vectors each document is padded or
            truncated to. Defaults to 200, the value that used to be
            hard-coded. NOTE(review): presumably this must match the
            sequence length the Keras model was trained with -- confirm
            before overriding.

    Returns:
        A list with the first element of each prediction row, one entry per
        input text; ``[]`` when ``txts`` is empty.
    """
    txts = list(txts)
    if not txts:
        # Nothing to classify: do not push an empty batch through padding
        # and the network.
        return []

    splitted_txts = [not_so_naive_split(txt) for txt in txts]
    filtered_txts = [list(reduce_dictionary(document, permitted_words))
                     for document in splitted_txts]
    embedded_txts = [embed_document(gensim_model, doc, permitted_words)
                     for doc in filtered_txts]

    # Short documents are zero-padded at the front ("pre"); long ones lose
    # their trailing sentences ("post").
    padded_data = sequence.pad_sequences(embedded_txts,
                                         maxlen=maxlen,
                                         padding="pre",
                                         truncating="post",
                                         value=0.0,
                                         dtype='float32')
    probs = keras_model.predict_proba(padded_data, verbose=0)
    return [prob[0] for prob in probs]

def predict_documents_pdf(filenames, gensim_model, keras_model, permitted_words):
    """Extract the text of each file with textract and score it.

    Args:
        filenames: Iterable of paths readable by ``textract.process``.
        gensim_model: Forwarded to ``predict_documents_str``.
        keras_model: Forwarded to ``predict_documents_str``.
        permitted_words: Forwarded to ``predict_documents_str``.

    Returns:
        The list returned by ``predict_documents_str`` (one score per file),
        or ``None`` if any file could not be processed. Callers that zip the
        result with ``filenames`` must check for ``None`` first.
    """
    txts = []
    for filename in filenames:
        try:
            txts.append(textract.process(filename))
        except Exception as exc:
            # Only ordinary errors are caught, so KeyboardInterrupt and
            # SystemExit still propagate. Report which file failed and why
            # rather than hiding the cause.
            print("C'è un documento illeggibile...")
            print("  %s: %s" % (filename, exc))
            return None

    return predict_documents_str(txts, gensim_model, keras_model, permitted_words)

def predict_documents_txt(filenames, gensim_model, keras_model, permitted_words):
    """Read plain-text files and score their contents.

    Args:
        filenames: Iterable of paths to text files.
        gensim_model: Forwarded to ``predict_documents_str``.
        keras_model: Forwarded to ``predict_documents_str``.
        permitted_words: Forwarded to ``predict_documents_str``.

    Returns:
        The list returned by ``predict_documents_str``, one score per file.

    Raises:
        IOError: If a file cannot be opened or read.
    """
    txts = []
    for filename in filenames:
        # The with-block closes each handle as soon as the file is read,
        # rather than leaving it to the garbage collector.
        with open(filename) as f:
            txts.append(f.read())
    return predict_documents_str(txts, gensim_model, keras_model, permitted_words)

In [38]:
# Score the sample PDFs. predict_documents_pdf returns None instead of a list
# when a document cannot be read.
predictions = predict_documents_pdf(
    filenames=pdf_names,
    gensim_model=gensim_model,
    keras_model=keras_model,
    permitted_words=permitted_words,
)

In [39]:
for name, pred in zip(pdf_names, predictions):
    print os.path.basename(name)[:50].ljust(50), '\t-\t', labels_map[int(round(pred))], ' (%f)' % pred

5122462300001.pdf                                  	-	COSTITUTIVO  (0.996299)
T3LAB-INJENIA_Analisi Preliminare.pdf              	-	NON COSTITUTIVO  (0.038102)
5 Planning Robot.pdf                               	-	NON COSTITUTIVO  (0.020943)
5122464750001.pdf                                  	-	COSTITUTIVO  (0.995850)


In [10]:
# Text file to classify (relative path).
txt_names = [
    '../ocr/scanned_non_costitutivi/3591900710001.txt',
]

# Score the text file; the result has one entry per file in txt_names.
pred2 = predict_documents_txt(
    filenames=txt_names,
    gensim_model=gensim_model,
    keras_model=keras_model,
    permitted_words=permitted_words,
)

In [11]:
for name, pred in zip(pdf_names, pred2):
    print os.path.basename(name)[:50].ljust(50), '\t-\t', labels_map[int(round(pred))], ' (%f)' % pred

5122462300001.pdf                                  	-	NON COSTITUTIVO  (0.028954)
