#### For experimentation only; use train_lstm.py instead!

In [11]:
# This Python file uses the following encoding: utf-8
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.preprocessing import sequence
from keras.optimizers import Adam, RMSprop
from sklearn.model_selection import train_test_split

from lib.pretty_testing import predict_test

import numpy as np
import pickle

In [4]:
# Where the model is persisted: architecture as JSON, trained weights as HDF5.
model_weights_file = "models/keras_weights_verb_retry.h5"
model_file = "models/keras_model_retry.json"
# Pickled dataset; it is unpickled further down as a (data, labels) pair.
dataset_file = "embedded_docs_with_verb_retry.p"

In [5]:
# Fix NumPy's global random seed for reproducibility.
np.random.seed(123)

# Load the prepared data: the pickle holds a (data, labels) pair.
# Pickle streams are binary, so the file is opened in 'rb' mode; text mode
# can corrupt the stream on Windows and fails outright on Python 3.
# NOTE(review): pickle.load can execute arbitrary code -- only load dataset
# files that come from a trusted source.
with open(dataset_file, 'rb') as f:
    data, labels = pickle.load(f)

In [6]:
# Bring every sequence to exactly 200 steps for the RNN: shorter ones are
# zero-filled at the front ("pre"), longer ones are cut at the end ("post").
padded_data = sequence.pad_sequences(
    data,
    maxlen=200,
    padding="pre",
    truncating="post",
    value=0.0,
    dtype='float32'
)

In [7]:
# Drop the reference to the original (unpadded) dataset so its memory can be
# reclaimed; the code below works on padded_data.
del data

In [8]:
# Split the dataset: 90% of the samples go to training, and the split is
# stratified on the labels so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    padded_data,
    labels,
    train_size=0.9,
    stratify=labels
)
# The padded array is no longer needed once it has been split.
del padded_data



In [14]:
# Alternative architecture (two stacked LSTM layers, Adam optimizer), kept
# for reference only:
#
#   model = Sequential()
#   model.add(LSTM(100, input_shape = (200, 100), return_sequences=True))
#   model.add(LSTM(50))
#   model.add(Dense(1, activation='sigmoid'))
#   model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#   print(model.summary())

# Active architecture: a single 100-unit LSTM followed by one sigmoid unit
# for binary classification. The input is 200 steps long (the padding length
# used above) with 100 values per step -- presumably the embedding size;
# TODO confirm against the dataset.
model = Sequential()
model.add(LSTM(100, input_shape=(200, 100)))
#model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 80,501
Trainable params: 80,501
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Store the model description produced by to_json() in model_file; the
# weights are written separately.
with open(model_file, 'w') as json_out:
    architecture_json = model.to_json()
    json_out.write(architecture_json)

In [None]:
# Train for 3 epochs in batches of 64 samples.
model.fit(X_train, y_train, batch_size=64, epochs=3)
# Final evaluation of the model on the held-out split. The model is compiled
# with metrics=['accuracy'], so scores[1] is reported as the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Epoch 1/3
Epoch 2/3

In [18]:
# Serialize the trained weights to HDF5 at model_weights_file (the model
# description is written separately, as JSON, to model_file).
model.save_weights(model_weights_file)

### Test loading

In [None]:
# Rebuild the network from a JSON description stored on disk and show it.
# NOTE(review): this reads 'models/keras_model.json', which is not the
# model_file path written earlier in this notebook -- confirm this is the
# intended file.
with open('models/keras_model.json') as json_in:
    architecture_json = json_in.read()
model = model_from_json(architecture_json)
model.summary()

In [None]:
# Load previously saved weights into the rebuilt model.
# NOTE(review): this path differs from model_weights_file saved above --
# presumably an older training run; confirm it matches the loaded architecture.
model.load_weights("models/keras_new_weights_with_verb_es.h5")

In [19]:
# Report test-set metrics through the project helper. The list supplies the
# display names of the two classes (presumably for labels 0 and 1, in that
# order -- confirm in lib.pretty_testing).
predict_test(model, X_test, y_test, ["non_cost", "cost"])

()
Test classification report
Accuracy: 0.827423
             precision    recall  f1-score   support

          0       0.95      0.69      0.80      1904
          1       0.76      0.96      0.85      1903

avg / total       0.85      0.83      0.82      3807

Test confusion Matrix
             non_cost     cost
    non_cost   1319.0    585.0
        cost     72.0   1831.0


array([[1],
       [1],
       [1],
       ..., 
       [1],
       [1],
       [1]], dtype=int32)

# Test

In [20]:
# This Python file uses the following encoding: utf-8
from lib.words import tokenize_doc
from lib.embedding import reduce_dictionary, sentence_vector
from gensim.models import Doc2Vec
import lib.text_extraction as te
from functools import partial

import codecs
import json
import os
import glob
import sys

In [21]:
# Display names of the two classes, indexed by the rounded prediction (0 or 1).
labels_map = ['NON COSTITUTIVO', 'COSTITUTIVO']

# Every entry found in this folder is passed to the classifier.
pdf_folder = '../files_to_predict'
pdf_names = glob.glob(pdf_folder + '/*')

# For OCR: these settings are handed to te.extract_text by
# predict_documents_pdf.
pages = 5
min_words = 150
png_dir = '../tmp'

In [22]:
# Only files smaller than 5 MB are accepted.
def file_filter(f):
    """Return True when the file referenced by ``f`` is smaller than 5 MB.

    ``f`` must provide an ``absolute()`` method (e.g. a ``pathlib.Path``);
    the size is read from the filesystem through ``os.path.getsize``.
    """
    # A stricter variant, which also required the name to end in '001.pdf',
    # was used previously and is intentionally disabled.
    size_limit = 5 * 1024 ** 2
    file_size = os.path.getsize(str(f.absolute()))
    return file_size < size_limit

def embed_document(model, doc, permitted_words):
    """Return one ``sentence_vector`` result per sentence of ``doc``.

    ``model`` and ``permitted_words`` are forwarded unchanged to
    ``sentence_vector`` for every sentence; the output keeps the order of
    ``doc``.
    """
    vectors = []
    for sentence in doc:
        vectors.append(sentence_vector(model, sentence, permitted_words))
    return vectors

def predict_documents_str(filenames, txts, gensim_model, keras_model, permitted_words):
    """Classify raw document texts with the embedding + Keras pipeline.

    Parameters:
        filenames: names of the documents, parallel to ``txts``.
        txts: extracted text of each document; ``None`` or empty entries
            are skipped together with their file name.
        gensim_model: embedding model handed to ``embed_document``.
        keras_model: trained Keras model whose ``predict`` yields one
            output value per document.
        permitted_words: vocabulary passed to ``reduce_dictionary`` and
            ``embed_document``.

    Returns:
        A ``(predictions, filtered_filenames)`` pair: the first output value
        of the model for every kept document, and the names of those
        documents in the same order. Both lists are empty when no document
        has any text.
    """
    # Filter names and texts in a single pass so the two lists always stay
    # aligned, even if the inputs have different lengths.
    kept = [(f, t) for f, t in zip(filenames, txts) if t is not None and len(t) > 0]
    if not kept:
        # Nothing to classify: an empty batch cannot be fed to the model.
        return [], []
    filtered_filenames = [f for f, _ in kept]
    not_empty_txts = [t for _, t in kept]

    splitted_txts = [tokenize_doc(txt) for txt in not_empty_txts]
    filtered_txts = [list(reduce_dictionary(document, permitted_words)) for document in splitted_txts]
    embedded_txts = [embed_document(gensim_model, doc, permitted_words) for doc in filtered_txts]
    # Same padding settings as the ones used to prepare the training data.
    padded_data = sequence.pad_sequences(embedded_txts, maxlen=200, padding="pre", truncating="post", value=0.0, dtype='float32')
    # predict() returns the same values predict_proba() did for this model;
    # predict_proba() no longer exists in current Keras releases.
    probs = keras_model.predict(padded_data, verbose=0)
    return [prob[0] for prob in probs], filtered_filenames

def predict_documents_pdf(filenames, gensim_model, keras_model, permitted_words, do_ocr=False):
    """Extract the text of each file and classify it.

    Text extraction is delegated to ``te.extract_text``, which receives
    ``do_ocr`` and the module-level ``png_dir``, ``min_words`` and ``pages``
    settings. The result is whatever ``predict_documents_str`` returns for
    the extracted texts.
    """
    txts = []
    for filename in filenames:
        extracted = te.extract_text(filename, do_ocr, png_dir, min_words, pages)
        txts.append(extracted)
    return predict_documents_str(filenames, txts, gensim_model, keras_model, permitted_words)

def predict_documents_txt(filenames, gensim_model, keras_model, permitted_words):
    """Read plain-text files and classify their content.

    Parameters:
        filenames: paths of the text files to read.
        gensim_model, keras_model, permitted_words: forwarded unchanged to
            ``predict_documents_str``.

    Returns:
        Whatever ``predict_documents_str`` returns for the file contents.
    """
    txts = []
    for filename in filenames:
        # Close each file as soon as it has been read instead of leaving the
        # handle open until garbage collection.
        with open(filename) as txt_file:
            txts.append(txt_file.read())
    return predict_documents_str(filenames, txts, gensim_model, keras_model, permitted_words)

In [23]:
# Everything predict_documents_pdf needs besides the file list, keyed by the
# names of its keyword arguments.
models = {
    'gensim_model': Doc2Vec.load('models/gensim_5000_model_with_verb.d2v'),
    'keras_model': model,
}

with open('first_5000_words_with_verb_cost.json') as words_file:
    models['permitted_words'] = set(json.load(words_file))

# Bind the models once, so that a call only has to pass the list of files
# (and optionally do_ocr).
predict_pdfs = partial(predict_documents_pdf, **models)

In [24]:
# Classify every file found in the prediction folder; filtered_filenames
# holds the names of the files that actually produced text, aligned with
# predictions.
predictions, filtered_filenames = predict_pdfs(pdf_names)

In [25]:
# Print one line per classified file: base name (cut/padded to 50 columns),
# the label chosen by rounding the prediction to 0 or 1, and the raw value.
for name, pred in zip(filtered_filenames, predictions):
    # Python 2 print statement; the comma-separated items are written on a
    # single line.
    print os.path.basename(name)[:50].ljust(50), '\t-\t', labels_map[int(round(pred))], ' (%f)' % pred

3297438520001.pdf                                  	-	NON COSTITUTIVO  (0.144382)
4908019380001.pdf                                  	-	COSTITUTIVO  (0.887578)
3297448050002.pdf                                  	-	NON COSTITUTIVO  (0.067564)
4907876150001.pdf                                  	-	COSTITUTIVO  (0.891512)
4907903520001.pdf                                  	-	COSTITUTIVO  (0.888871)
4907874650001.pdf                                  	-	NON COSTITUTIVO  (0.078098)
3296082450001.pdf                                  	-	NON COSTITUTIVO  (0.128675)
4908013560001.pdf                                  	-	NON COSTITUTIVO  (0.090709)
4907882890001.pdf                                  	-	NON COSTITUTIVO  (0.096234)
4907924550001.pdf                                  	-	NON COSTITUTIVO  (0.120383)
4910203190001.pdf                                  	-	NON COSTITUTIVO  (0.123448)
4907941200001.pdf                                  	-	NON COSTITUTIVO  (0.066273)
4907932630001.pdf           

In [None]:
res = []
with open("../atti_costitutivi_sentences/4974867770001.txt") as f:
    a = f.read()
    print a

In [None]:
for w in a:
    num = ord(w)
    if num != 32:
        num = num - 3
    res.append(chr(num))
print ''.join(res)        