In [1]:
# Import libraries

# Data handling
import pandas as pd
import numpy as np

# Data pre-processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras_preprocessing.sequence import pad_sequences

# Model
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf

# To save model
import joblib

In [2]:
# Load preprocessed dataset
file_path = "../data/norm_dataset.csv"
df = pd.read_csv(file_path)

In [3]:
# Preview data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,text,norm
0,"Client agrees to pay to Company the sum of $5,000 (the “Contract Price”) to design and develop a website for Client (the “Client Website”) in accordance with the accompanying Scope of Work, attached to this Agreement as Exhibit A.",1
1,Company will use its best efforts to deliver the Client Website in the time frame specified in the Scope of Work.,1
2,"All written content submitted by Client for use in the Client Website must be typewritten, proofread and delivered to Company in the body of an email message or as a Microsoft Word electronic document or plaint text electronic document.",1
3,It is Client’s sole responsibility to check the accuracy of the written content and correct any errors prior to submission for final publication.,1
4,Client further agrees that Company may use and display the graphics and other web design elements of Client’s website as examples of Company website design and development work.,1


In [4]:
# Remove duplicates
df.drop_duplicates(keep='first', inplace=True)

In [5]:
# Assign norms to target variable. Values already numeric
y = df['norm']
y

0      1
1      1
2      1
3      1
4      1
      ..
361    1
362    1
363    1
364    1
365    1
Name: norm, Length: 360, dtype: int64

In [6]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 555 and mean is 43
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.text)

# Functions to transform text to feature_vectors 
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [7]:
# Call function to create features 'X'
X = get_features(df.text)

print(X.shape, y.shape)

(360, 200) (360,)


In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [9]:
# law2vec 100 dimensional word embeddings
from numpy import array, asarray, zeros
vocab_size = len(tokenizer.word_index) + 1

embeddings_dictionary = dict()

law2vec_file = open('./Law2Vec.100d.txt', encoding="utf8")

# Parse each line and store word-vector pairs in a dictionary
for line in law2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
law2vec_file.close()

# Each row corresponds to a word with its 100 dimensional word vector
embedding_matrix = zeros((vocab_size, 100))

# tokenizer.word_index is a list of (word, id) tuples
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [10]:
from sklearn.model_selection import KFold

In [13]:
# Convert X and y to NumPy arrays if they are not already
X = np.array(X)
y = np.array(y)

# Define the number of folds for cross-validation
k = 5

# Create a KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=0)

# Initialize lists to store evaluation metrics for each fold
accuracy_scores = []
loss_scores = []
train_losses = []
val_losses = []

# Define callbacks
callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
]

# Iterate over the folds
for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    print(f"Fold {fold}")

    # Split the data into training and validation sets for the current fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Build and compile the model (same code as before)
    filter_length = 300
    num_classes = 1 
    embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.1))
    model.add(Conv1D(filter_length, kernel_size=8, activation='relu'))
    model.add(GlobalMaxPool1D())
    model.add(Flatten())
    model.add(Dense(num_classes))
    model.add(Activation('sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Fit the model on the training data for the current fold
    history = model.fit(X_train_fold, y_train_fold,
                        epochs=20,
                        batch_size=32,
                        callbacks=callbacks,
                        verbose=0)
    
    # Evaluate the model on the validation data for the current fold
    #_, accuracy = model.evaluate(X_val_fold, y_val_fold)
    #loss, _ = model.evaluate(X_val_fold, y_val_fold)
    train_loss, train_accuracy = model.evaluate(X_train_fold, y_train_fold, verbose=0)
    train_losses.append(train_loss)

    # Evaluate model on the validation data for the current fold
    val_loss, val_accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    val_losses.append(val_loss)

    # Append the evaluation scores to the lists
    accuracy_scores.append(val_accuracy)
    loss_scores.append(val_loss)
    print()

# Print the training and validation loss for eadh fold
for fold in range(k):
    print(f"Fold {fold+1} - Train Loss: {train_losses[fold]:.4f} - Val Loss: {val_losses[fold]:.4f}")

# Calculate and print the mean evaluation scores across all folds
print("Mean Accuracy:", np.mean(accuracy_scores))
print("Mean Loss:", np.mean(loss_scores))

# After the cross-validation, proceed with testing on the separate test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)
print("Test Loss:", test_loss)


Fold 1



Fold 2

Fold 3

Fold 4

Fold 5

Fold 1 - Train Loss: 0.0110 - Val Loss: 0.2737
Fold 2 - Train Loss: 0.0132 - Val Loss: 0.1912
Fold 3 - Train Loss: 0.0063 - Val Loss: 0.3238
Fold 4 - Train Loss: 0.0086 - Val Loss: 0.3611
Fold 5 - Train Loss: 0.0077 - Val Loss: 0.2189
Mean Accuracy: 0.8777777791023255
Mean Loss: 0.27372720241546633
Test Accuracy: 0.9861111044883728
Test Loss: 0.06295739114284515


In [14]:
# Evaluation metrics
metrics = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.06295739114284515
accuracy: 0.9861111044883728


In [15]:
# Calculate precision, recall, and F1 scores
from sklearn.metrics import precision_score, recall_score, f1_score

# Convert y_pred to binary labels using a threshold of 0.5
binary_pred = (y_pred > 0.5).astype(int)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, binary_pred)
recall = recall_score(y_test, binary_pred)
f1 = f1_score(y_test, binary_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 1.0
Recall: 0.9714285714285714
F1 Score: 0.9855072463768115


In [16]:
# Prediction
x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
xt = get_features(x)
prediction = model.predict(xt)
probas = (prediction > 0.5).astype(int)

if probas == [1]:
    tag = 'Norm'
else:
    tag = 'Non-norm'

print(prediction)
print(probas)
print(tag)

[[0.97574955]]
[[1]]
Norm


In [17]:
# Save tokenizer
joblib.dump(tokenizer, '../models/BinaryLabelTokenizer.pkl')

['../models/BinaryLabelTokenizer.pkl']

In [18]:
# Save the model
joblib.dump(model, '../models/BinaryLabelModel_CNN.pkl')



INFO:tensorflow:Assets written to: ram://dd85d2e3-4917-46a2-98bb-14622dcf4f70/assets


INFO:tensorflow:Assets written to: ram://dd85d2e3-4917-46a2-98bb-14622dcf4f70/assets


['../models/BinaryLabelModel_CNN.pkl']