In [14]:
# Import libraries

# Data handling
import pandas as pd
import numpy as np

# Data pre-processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras_preprocessing.sequence import pad_sequences

# Model
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf

# To save model
import joblib

In [15]:
# Load preprocessed dataset
file_path = "../data/norm_dataset.csv"
df = pd.read_csv(file_path)

In [16]:
# Preview data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,text,norm
0,"Client agrees to pay to Company the sum of $5,000 (the “Contract Price”) to design and develop a website for Client (the “Client Website”) in accordance with the accompanying Scope of Work, attached to this Agreement as Exhibit A.",1
1,Company will use its best efforts to deliver the Client Website in the time frame specified in the Scope of Work.,1
2,"All written content submitted by Client for use in the Client Website must be typewritten, proofread and delivered to Company in the body of an email message or as a Microsoft Word electronic document or plaint text electronic document.",1
3,It is Client’s sole responsibility to check the accuracy of the written content and correct any errors prior to submission for final publication.,1
4,Client further agrees that Company may use and display the graphics and other web design elements of Client’s website as examples of Company website design and development work.,1


In [17]:
# Remove duplicates
df.drop_duplicates(keep='first', inplace=True)

In [18]:
# Assign norms to target variable. Values already numeric
y = df['norm']
y

0      1
1      1
2      1
3      1
4      1
      ..
361    1
362    1
363    1
364    1
365    1
Name: norm, Length: 360, dtype: int64

In [19]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 555 and mean is 43
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.text)

# Functions to transform text to feature_vectors 
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [20]:
# Call function to create features 'X'
X = get_features(df.text)

print(X.shape, y.shape)

(360, 200) (360,)


In [21]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [22]:
# law2vec 100 dimensional word embeddings
from numpy import array, asarray, zeros
vocab_size = len(tokenizer.word_index) + 1

embeddings_dictionary = dict()

law2vec_file = open('./Law2Vec.100d.txt', encoding="utf8")

# Parse each line and store word-vector pairs in a dictionary
for line in law2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
law2vec_file.close()

# Each row corresponds to a word with its 100 dimensional word vector
embedding_matrix = zeros((vocab_size, 100))

# tokenizer.word_index is a list of (word, id) tuples
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [23]:
# Modelling - Convolutional Neural Network

filter_length = 300
num_classes = 1 # binary problem

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
model = Sequential()
model.add(embedding_layer)
# model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, kernel_size=8, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          230300    
                                                                 
 dropout_1 (Dropout)         (None, 200, 100)          0         
                                                                 
 conv1d_1 (Conv1D)           (None, 193, 300)          240300    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 300)              0         
 balMaxPooling1D)                                                
                                                                 
 flatten_1 (Flatten)         (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 301       
                                                      

In [24]:
# Fit the model
callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
]

history = model.fit(X_train, y_train,
                    epochs=20,
                    batch_size=32,
                    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [26]:
# Evaluation metrics
metrics = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.261433482170105
accuracy: 0.875


In [27]:
# Prediction
x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
xt = get_features(x)
prediction = model.predict(xt)
probas = (prediction > 0.5).astype(int)

if probas == [1]:
    tag = 'Norm'
else:
    tag = 'Non-norm'

print(prediction)
print(probas)
print(tag)

[[0.96828336]]
[[1]]
Norm


In [28]:
# Save tokenizer
joblib.dump(tokenizer, '../models/BinaryLabelTokenizer.pkl')

['../models/BinaryLabelTokenizer.pkl']

In [29]:
# Save the model
joblib.dump(model, '../models/BinaryLabelModel_CNN.pkl')



INFO:tensorflow:Assets written to: ram://a6abc740-0689-48d7-98fe-d7d5b2c7c885/assets


INFO:tensorflow:Assets written to: ram://a6abc740-0689-48d7-98fe-d7d5b2c7c885/assets


['../models/BinaryLabelModel_CNN.pkl']