In [35]:
# Import libraries

# Data handling
import pandas as pd
import numpy as np

# Data pre-processing
from ast import literal_eval
from numpy import array
from numpy import asarray
from numpy import zeros
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras_preprocessing.sequence import pad_sequences

# Model
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss, average_precision_score

# To save model
import joblib

In [36]:
# Load preprocessed dataset
file_path = "../data/preprocessed_data.csv"
df = pd.read_csv(file_path)

In [37]:
# Preview data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [38]:
# Convert tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

In [39]:
# Encode tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [40]:
# Define X and y
X = list(df.sentence)
y = multilabel.transform(df.tag)

In [41]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [42]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 691 and mean is 52; however, 691 is an outlier
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

# Padding - sequences with word count less than 200 are added
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [43]:
X_train.shape, y_train.shape

((757, 200), (757, 3))

In [44]:
# Create word embeddings using law2vec
embeddings_dictionary = dict()

law2vec_file = open('./Law2Vec.100d.txt', encoding="utf8")

# Parse each line and store word-vector pairs in a dictionary
for line in law2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
law2vec_file.close()

# Each row corresponds to a word with its 100-d word vector
embedding_matrix = zeros((vocab_size, 100))

# tokenizer.word_index is a list of (word, id) tuples
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [45]:
# Build RNN model with 128 LSTM units
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False, mask_zero=True)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(3, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['binary_accuracy'])

model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 200, 100)          271900    
                                                                 
 lstm_4 (LSTM)               (None, 128)               117248    
                                                                 
 dense_4 (Dense)             (None, 3)                 387       
                                                                 
Total params: 389,535
Trainable params: 117,635
Non-trainable params: 271,900
_________________________________________________________________


In [46]:
# Fit the model
callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4)
]

history = model.fit(X_train, y_train,
                    batch_size=32,
                    epochs=20,
                    validation_split=0.2,
                    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [47]:
# Evaluation
metrics = model.evaluate(X_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.5916264057159424
binary_accuracy: 0.6649122834205627


In [48]:
# Loss and precision score
y_pred = model.predict(X_test)
print("LRAP: {:.2}".format(label_ranking_average_precision_score(y_test,y_pred)))
print("Ranking Loss: {:.2}".format(label_ranking_loss(y_test,y_pred)))
print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))

LRAP: 0.76
Ranking Loss: 0.31
Precision Score: 0.63


In [33]:
# Prediction
# x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
# prediction = model.predict(x)
# # probas = np.array(prediction)
# # labels = (probas > 0.5).astype(np.int)

# # tags = multilabel.inverse_transform(labels)

# print(prediction)
# print(labels)
# print(tags)

In [34]:
# Save the model
joblib.dump(model, '../models/MultiLabelModel_LSTM.pkl')



INFO:tensorflow:Assets written to: ram://14e11479-1535-4174-b1d6-03d8781bc0a7/assets


INFO:tensorflow:Assets written to: ram://14e11479-1535-4174-b1d6-03d8781bc0a7/assets


['../models/MultiLabelModel_LSTM.pkl']