In [1]:
from __future__ import print_function
from __future__ import division
from keras.utils import plot_model
from keras.models import Sequential, model_from_json
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

from lib.pretty_testing import predict_test, class_weights_max
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Apply the 'ggplot' style sheet to every figure drawn after this point.
matplotlib.style.use('ggplot')
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

Using TensorFlow backend.


In [2]:
# --- Label set -------------------------------------------------------------
# Its length fixes the width of the softmax output layer and of the one-hot
# encoded targets.
classes = [
    'poteri',
    'scadenza',
    'assemblea',
    'clausola',
    'non_riconducibile',
]

# --- Input representation --------------------------------------------------
top_words = 5000               # input dimension of the Embedding layer
embedding_vector_length = 32   # output dimension of the Embedding layer
maxlen = 100                   # every sequence is padded/truncated to this length

# --- Training --------------------------------------------------------------
epochs = 20                    # number of passes over the training set

In [3]:
# Load the prepared dataset: the pickle unpacks into a (data, labels) pair.
# Pickle streams are bytes, so the file has to be opened in binary mode;
# text mode fails on Python 3 and can corrupt the stream through newline
# translation on Windows under Python 2.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../extraction/dataset.p', 'rb') as f:
    data, labels = pickle.load(f)

In [4]:
# Bring every sequence to exactly `maxlen` entries: shorter ones get zeros
# prepended (padding='pre'), longer ones lose their tail (truncating='post').
padded_data = sequence.pad_sequences(
    data,
    maxlen=maxlen,
    dtype='uint32',
    padding='pre',
    truncating='post',
    value=0,
)
# Keep the labels as a NumPy array alongside the padded inputs.
labels = np.array(labels)

In [5]:
# Sequence classifier: embedding -> LSTM -> softmax over the label set.
model = Sequential([
    # mask_zero=True makes index 0 (the padding value) a masked timestep.
    Embedding(input_dim=top_words, output_dim=embedding_vector_length,
              input_length=maxlen, mask_zero=True),
    # Single recurrent layer with dropout applied to its inputs.
    LSTM(units=32, dropout=0.2),
    # One output unit per class, normalised into a probability distribution.
    Dense(units=len(classes), activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 168,485
Trainable params: 168,485
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Write a diagram of the layer graph to 'model.png' in the working directory.
plot_model(model=model, to_file='model.png')

In [None]:
# Keep 70% of the samples for training and hold the rest out for evaluation.
# stratify=labels preserves the class proportions in both parts.
# random_state is pinned so the same samples land in the test set on every
# run; without it the split (and every metric computed on it) changes each
# time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    padded_data, labels, train_size=0.7, stratify=labels, random_state=42)

# One-hot encode the targets for the categorical cross-entropy loss.
# Assumes labels are integer class indices in [0, len(classes)) -- TODO confirm.
y_train_cat = to_categorical(y_train, num_classes=len(classes))
y_test_cat = to_categorical(y_test, num_classes=len(classes))

In [None]:
# Shape of the first weight array of each layer, in model order.
[layer.get_weights()[0].shape for layer in model.layers]

In [None]:
# Derive the class weights from the full label array using the project helper.
# The result is later passed to model.fit(class_weight=...).
# NOTE(review): presumably a {class_index: weight} mapping scaled against the
# most frequent class -- confirm in lib.pretty_testing.
class_weights = class_weights_max(labels)

In [None]:
# Display the computed class weights for inspection.
class_weights

In [None]:
# Train without progress output (verbose=0), weighting the loss per class.
# The object returned by fit() is kept in `h`; its `.history` attribute holds
# the values recorded during training.
h = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=32,
    epochs=epochs,
    class_weight=class_weights,
    verbose=0,
)

In [None]:
# Tabulate the recorded training history and draw each column as a line.
df = pd.DataFrame(data=h.history)
df.plot(kind='line', figsize=(16, 8))

In [None]:
# Evaluate the trained model on the held-out split with the project helper.
# It receives the integer labels (y_test), not the one-hot encoded ones.
# NOTE(review): presumably prints/plots per-class results using the names in
# `classes` -- confirm in lib.pretty_testing.
predict_test(model, X_test, y_test, classes)