In [7]:
from __future__ import print_function
from __future__ import division
from keras.models import model_from_json
from keras.preprocessing import sequence
from lib import words as wd
from lib import embedding as em
from lib import text_extraction as te
from lib import predict_pdf as pp
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import glob
import json
import re

# Apply the 'ggplot' style sheet to every matplotlib figure drawn in this notebook.
matplotlib.style.use('ggplot')
%matplotlib inline

In [8]:
def is_scadenza(s):
    """Tell whether a sentence mentions the "primo esercizio" phrase.

    The phrase is looked for anywhere in ``s`` (including after line breaks),
    with one or more whitespace characters allowed between the two words.
    The match is case-sensitive.

    Args:
        s: the sentence text to inspect.

    Returns:
        True if the phrase occurs in ``s``, False otherwise.
    """
    # re.search scans the whole string; an anchored re.match with a leading
    # '.*' would miss the phrase whenever a newline precedes it.
    return re.search(r'primo\s+esercizio', s) is not None

def post_process_prediction(sents, y_pred, neutral, milen=10):
    """Smooth isolated ``neutral`` predictions that sit between equal labels.

    Walks the inner positions (first and last are never touched) from left to
    right. A position is relabelled with its right neighbour's label when it
    currently holds ``neutral``, both neighbours carry the same label, and the
    corresponding sentence has at least ``milen`` characters. The comparison
    uses the already-smoothed values, so an earlier change can influence a
    later position.

    Args:
        sents: sentences aligned index-by-index with ``y_pred``.
        y_pred: sequence of predicted labels; it is copied, not modified.
        neutral: the label value treated as "neutral".
        milen: minimum sentence length required for relabelling.

    Returns:
        A new list with the smoothed labels.
    """
    smoothed = list(y_pred)
    last = len(y_pred) - 1
    pos = 1
    while pos < last:
        left = smoothed[pos - 1]
        right = smoothed[pos + 1]
        if smoothed[pos] == neutral and left == right and len(sents[pos]) >= milen:
            smoothed[pos] = right
        pos += 1
    return smoothed

class PartsExtraction(object):
    """Assigns one label to each sentence using a Keras model plus a regex rule.

    The model's per-sentence output is reduced with ``argmax`` to an index into
    ``labels``; the first sentence accepted by ``is_scadenza`` is then forced to
    the ``'scadenza'`` label.
    """

    def __init__(self, keras_model, reduced_dict, labels, maxlen = 100):
        """Store the model and its vocabulary.

        Args:
            keras_model: object exposing ``predict(padded_int_sequences)``.
            reduced_dict: mapping word -> integer id; its keys are the only
                words kept when encoding sentences.
            labels: label names; positions are matched against the argmax of
                the model output.
            maxlen: length every encoded sentence is padded/truncated to.
        """
        self._model = keras_model
        self._reduced_dict = reduced_dict
        # Private copy so later changes to the caller's sequence (or to a
        # shared default) cannot alter this instance.
        self._labels = list(labels)
        self._maxlen = maxlen

    @staticmethod
    def load_from_files(keras_model_filename, keras_weights_filename,
                        reduced_dict_filename,
                        labels=('poteri', 'assemblea', 'clausola', 'non_riconducibile', 'scadenza'),
                        maxlen=100):
        """Build an extractor from files on disk.

        Args:
            keras_model_filename: JSON file with the model architecture.
            keras_weights_filename: weights file loaded into the model.
            reduced_dict_filename: JSON file with the word -> id mapping.
            labels: label names, in model output order.
            maxlen: forwarded to the constructor.

        Returns:
            A ready-to-use ``PartsExtraction`` instance.
        """
        with open(keras_model_filename) as f:
            km = model_from_json(f.read())
        km.load_weights(keras_weights_filename)
        with open(reduced_dict_filename) as f:
            rd = json.load(f)
        return PartsExtraction(km, rd, labels, maxlen)

    def _int_sentences(self, sentences):
        """Encode sentences as lists of integer word ids from the reduced dictionary."""
        splitted_sentences = wd.tokenize_sentences(sentences, min_words=1)
        permitted_words = self._reduced_dict.keys()
        reduced_sentences = list(em.reduce_dictionary(splitted_sentences, permitted_words, min_words=1))
        return [[self._reduced_dict[w] for w in sent] for sent in reduced_sentences]

    def extract_parts_prob(self, sentences):
        """Return the raw model output for ``sentences``.

        Encoded sentences are left-padded with 0 and truncated at the end to
        ``maxlen`` before being handed to the model.
        """
        int_sentences = self._int_sentences(sentences)
        padded_data = sequence.pad_sequences(int_sentences, self._maxlen, padding="pre",
                                             truncating="post", value=0, dtype='uint32')
        return self._model.predict(padded_data)

    def extract_parts(self, sentences, post_process=False, probas=None):
        """Return one label name per model output row.

        Args:
            sentences: the sentence strings.
            post_process: when True, smooth the result with
                ``post_process_prediction`` using ``'non_riconducibile'`` as
                the neutral label.
            probas: precomputed output of ``extract_parts_prob``; computed
                here when None or empty.

        Returns:
            List of label names.
        """
        if probas is None or len(probas) == 0:
            probas = self.extract_parts_prob(sentences)
        predictions = probas.argmax(axis=-1)
        # NOTE(review): assumes row i of the model output belongs to
        # sentences[i], i.e. that encoding never drops a sentence -- confirm
        # against lib.words / lib.embedding.
        for i in range(len(predictions)):
            if is_scadenza(sentences[i]):
                # Only the first matching sentence is overridden.
                predictions[i] = self._labels.index('scadenza')
                break
        if post_process:
            preds = post_process_prediction(sentences, predictions,
                                            self._labels.index('non_riconducibile'))
        else:
            preds = predictions
        return [self._labels[i] for i in preds]

    def extract_parts_dict(self, sentences, predictions=None):
        """Group sentences by predicted label.

        Args:
            sentences: the sentence strings.
            predictions: label name per sentence; computed with
                ``extract_parts`` when None.

        Returns:
            Dict with one key per configured label (an empty list when no
            sentence received it). Empty sentences are left out and
            predictions that are not configured labels are ignored.

        Raises:
            ValueError: if ``sentences`` and ``predictions`` differ in length.
        """
        if predictions is None:
            predictions = self.extract_parts(sentences)
        if len(sentences) != len(predictions):
            raise ValueError('sentences and predictions must have the same length')
        parts = {k: [] for k in self._labels}
        for sent, pred in zip(sentences, predictions):
            if sent and pred in parts:
                parts[pred].append(sent)
        return parts

    def extract_parts_dict_indexes(self, predictions):
        """Group sentence positions by predicted label.

        Args:
            predictions: label name per sentence.

        Returns:
            Dict with one key per configured label mapping to the ascending
            list of integer positions that received it (possibly empty).
        """
        indexes = {k: [] for k in self._labels}
        for i, pred in enumerate(predictions):
            if pred in indexes:
                indexes[pred].append(i)
        return indexes
    
def is_valid_nl(txt, threshold=0.075):
    """Check that newlines make up at most ``threshold`` of the text.

    Args:
        txt: the text to inspect; must be non-empty (an empty string makes
            the division below raise ZeroDivisionError).
        threshold: highest accepted ratio of newline characters to total
            characters.

    Returns:
        True when the newline ratio does not exceed ``threshold``.
    """
    newline_ratio = txt.count('\n') / len(txt)
    return newline_ratio <= threshold

def labels_probas_dict(labels, p):
    """Pair each label with the value at the same position in ``p``.

    Pairing stops at the shorter of the two sequences.

    Returns:
        Dict mapping label -> value.
    """
    return dict(zip(labels, p))

def sentences_probas_dict(sentences, probas):
    """Build one record per sentence with its values keyed by label name.

    Relies on the module-level ``labels`` list: every entry except the last
    one is paired, in order, with the values of the matching ``probas`` row.

    Returns:
        List of dicts ``{'frase': sentence, 'prob': {label: value}}``.
    """
    records = []
    for sent, row in zip(sentences, probas):
        by_label = labels_probas_dict(labels[:-1], row)
        records.append({'frase': sent, 'prob': by_label})
    return records

def sentences_probas(sentences, probas):
    """Build one record per sentence with its row of values as a plain list.

    Returns:
        List of dicts ``{'frase': sentence, 'prob': list(row)}``, one for each
        (sentence, row) pair.
    """
    records = []
    for sent, row in zip(sentences, probas):
        records.append({'frase': sent, 'prob': list(row)})
    return records

In [3]:
# Label names used across the notebook; sentences_probas_dict reads this
# global (dropping its last entry). Same values as load_from_files' default.
labels = ['poteri', 'assemblea', 'clausola', 'non_riconducibile', 'scadenza']

# Build the extractor from the model architecture (JSON), its weights (HDF5)
# and the word -> id dictionary (JSON). Paths are relative to the notebook.
pe = PartsExtraction.load_from_files('models/extraction_model_30_all.json',
                                     'models/extraction_weights_30_all.h5',
                                     'first_5000_words_extraction.json')

In [4]:
# PDF analysed below; the trailing comment keeps an alternative sample path.
document = '../extraction/esempi_descrizioni/5115612230001.pdf'#'../extraction/files_to_label/4907913200001.pdf'

# Extract the text with OCR disabled, then derive `sentences` from it.
# NOTE(review): pages=-1 presumably means "all pages", and rep/newline
# presumably control how line breaks are handled -- confirm in lib.
txt = te.extract_text(document, do_ocr=False, pages=-1)
sentences = wd.sentences_doc(txt, rep=' ', newline=True)

In [None]:
# Label the sentences without the smoothing step and show (position, label) pairs.
predictions = pe.extract_parts(sentences, post_process=False)
list(enumerate(predictions))

In [None]:
# Same as the previous cell but with smoothing enabled; this overwrites
# `predictions`, so the cells below work on the smoothed labels.
predictions = pe.extract_parts(sentences, post_process=True)
list(enumerate(predictions))

In [None]:
# Print every sentence, stripped of surrounding whitespace, under a header
# showing its position and predicted label.
for i,(p,s) in enumerate(zip(predictions, sentences)):
    print('\n[{}] PREDICTION: {}\n'.format(i, p))
    print(s.strip())
    

In [None]:
# Earlier inline version of the grouping, kept for reference (it also skipped
# 'non_riconducibile'); the method call below replaces it.
#df = pd.DataFrame({'sentence':sentences,'prediction':predictions})
#pivoted = df.pivot(columns='prediction', values='sentence')
#d = {k:list(filter(None, pivoted[k])) for k in labels if k != 'non_riconducibile'}

# Group the sentences by the labels computed above.
d = pe.extract_parts_dict(sentences, predictions)

In [None]:
# Print the grouped sentences, skipping the 'non_riconducibile' group. Each
# group gets an upper-cased title centred in a 127-character line of dots.
for k,v in d.items():
    if k != 'non_riconducibile':
        print('\n\n'+k.upper().center(127,'.')+'\n')
        print('.\n\n'.join([s.strip() for s in v]))

In [None]:
# Run the model once and reuse its output for the smoothed labels, then
# group sentence positions (rather than sentence text) by label.
probas = pe.extract_parts_prob(sentences)
predictions = pe.extract_parts(sentences, post_process=True, probas=probas)
dict_indexes = pe.extract_parts_dict_indexes(predictions)

In [None]:
# Turn the grouped positions back into the sentences they point to, then
# display the positions themselves as the cell output.
dict_sentences = {k:[sentences[i] for i in dict_indexes[k]] for k in dict_indexes.keys()}
dict_indexes


In [None]:
# Print the sentences rebuilt from positions, skipping the
# 'non_riconducibile' group; same layout as the earlier printing cell.
for k,v in dict_sentences.items():
    if k != 'non_riconducibile':
        print('\n\n'+k.upper().center(127,'.')+'\n')
        print('.\n\n'.join([s.strip() for s in v]))

## Threshold sensato

In [None]:
def get_thres_nl(txt):
    """Find the largest threshold, in steps of 0.001, that rejects ``txt``.

    A threshold ``t`` accepts the text when ``newline_ratio <= t`` (the same
    criterion as ``is_valid_nl``). Candidates are tried from 1.0 downwards and
    the first one lower than the ratio is returned.

    Args:
        txt: non-empty text; an empty string raises ZeroDivisionError.

    Returns:
        A float on the 0.001 grid, between -0.001 (text without newlines)
        and 0.999 (text made only of newlines).
    """
    # The ratio does not depend on the candidate, so compute it once.
    newline_ratio = txt.count('\n') / float(len(txt))
    # Count in integer thousandths: subtracting 0.001 from a float over and
    # over accumulates rounding error and drifts off the grid.
    thousandths = 1000
    while newline_ratio <= thousandths / 1000.0:
        thousandths -= 1
    return thousandths / 1000.0

In [None]:
# Threshold for one sample extracted text file. The file is read inside a
# `with` block so the handle is closed right away.
with open('../extraction/esempi_extracted/5116982390001.txt') as txt_file:
    txt_brutto = txt_file.read()
print(get_thres_nl(txt_brutto))

In [None]:
# Print the threshold of the first 100 files matched by the glob. Each file
# is closed as soon as it has been read.
for f in glob.glob('../atti_costitutivi_txt/*')[:100]:
    with open(f) as txt_file:
        text = txt_file.read()
    print(f, get_thres_nl(text))

In [None]:
# Thresholds for every file matched by the glob. Files are opened one at a
# time and closed before the next, instead of leaving handles open.
thress = []
for f in glob.glob('../atti_costitutivi_txt/*'):
    with open(f) as txt_file:
        thress.append(get_thres_nl(txt_file.read()))

In [None]:
# Histogram of the per-file thresholds, using 50 bins.
plt.hist(thress, bins=50)

In [None]:
# Print the first four values of every model output row, tab-separated.
# NOTE(review): '{:4f}' sets a minimum width of 4 with the default six
# decimals; '{:.4f}' (four decimals) may have been intended -- confirm.
for e in pe.extract_parts_prob(sentences):
    print('{:4f}\t{:4f}\t{:4f}\t{:4f}'.format(*e))

In [None]:
# Show each sentence together with its row from `probas` (computed in an earlier cell).
sentences_probas(sentences, probas)

In [None]:
# Display the raw model output for the sentences (runs the model again).
pe.extract_parts_prob(sentences)

In [9]:
# Load whatever lib.predict_pdf needs for prediction; the result is unpacked
# as keyword arguments in the next cell.
models = pp.load_models()

In [10]:
# Run lib.predict_pdf on the text extracted earlier, passing the loaded
# models as keyword arguments.
# NOTE(review): the meaning of the returned number is defined in lib.predict_pdf -- confirm.
pp.predict_document_str(txt, **models)

0.99852413