In [1]:
# Import libraries

# Data handling
import pandas as pd
import numpy as np

# Data pre-processing
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras_preprocessing.sequence import pad_sequences

# Model
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss, average_precision_score

# To save model
import joblib

In [2]:
# Load preprocessed dataset
file_path = "../data/preprocessed_data.csv"
df = pd.read_csv(file_path)

In [3]:
# Preview data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [4]:
# Convert tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

In [5]:
# Encode tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [6]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 691 and mean is 52; however, 691 is an outlier
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.sentence)

# Functions to transform text to feature_vectors 
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [7]:
# Call function to create features 'X'
X = get_features(df.sentence)

# Transform y
y = multilabel.transform(df.tag)

print(X.shape, y.shape)

(947, 200) (947, 3)


In [8]:
X

array([[  0,   0,   0, ...,   8, 577, 372],
       [  0,   0,   0, ..., 105, 106, 109],
       [  0,   0,   0, ...,  19,   1,  31],
       ...,
       [  0,   0,   0, ...,  14,  11,  37],
       [  0,   0,   0, ...,   1,  12,   9],
       [  0,   0,   0, ..., 276,   5, 238]], dtype=int32)

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [24]:
from sklearn.model_selection import KFold

In [11]:
filter_length = 300
num_classes = 3

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

2023-07-13 22:12:22.689048: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 20)           40000     
                                                                 
 dropout (Dropout)           (None, 200, 20)           0         
                                                                 
 conv1d (Conv1D)             (None, 198, 300)          18300     
                                                                 
 global_max_pooling1d (Globa  (None, 300)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 3)                 903       
                                                                 
 activation (Activation)     (None, 3)                 0         
                                                        

In [15]:
# Define the callbacks
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4)
]

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True)
train_loss = []
val_loss = []

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

    history = model.fit(
        X_fold_train,
        y_fold_train,
        epochs=20,
        batch_size=32,
        callbacks=callbacks,
        validation_data=(X_fold_val, y_fold_val)
    )

    train_loss.append(history.history['loss'][-1])
    val_loss.append(history.history['val_loss'][-1])

# Print the training and validation loss for each fold
for fold in range(5):
    print("Fold %d - Train Loss: %.4f - Val Loss: %.4f" % (fold+1, train_loss[fold], val_loss[fold]))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Fold 1 - Train Loss: 0.0349 - Val Loss: 0.2009
Fold 2 - Train Loss: 0.0299 - Val Loss: 0.0230
Fold 3 - Train Loss: 0.0174 - Val Loss: 0.0139
Fold 4 - Train Loss: 0.0051 - Val Loss: 0.0107
Fold 5 - Train Loss: 0.0033 - Val Loss: 0.0021


In [16]:
# Evaluation metrics
metrics = model.evaluate(X_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.1860850751399994
categorical_accuracy: 0.8736842274665833


In [17]:
# Calculating loss and precision
y_pred = model.predict(X_test)
print("LRAP: {:.2}".format(label_ranking_average_precision_score(y_test,y_pred)))
print("Ranking Loss: {:.2}".format(label_ranking_loss(y_test,y_pred)))
print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))

LRAP: 0.99
Ranking Loss: 0.018
Precision Score: 0.98


In [18]:
# Prediction
# x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
x = ["The confidentiality obligations contained in this section XI shall not apply to the extent that the receiving Party (the 'Recipient') is required (a) to disclose information by law, order or regulation of a governmental agency or a court of competent jurisdiction , or (b) to disclose information to any governmental agency for purposes of obtaining approval to test or market a Product , provided in either case that the Recipient shall provide written notice thereof to the other Party and sufficient opportunity to object to any such disclosure or to request confidential treatment thereof."]
xt = get_features(x)
prediction = model.predict(xt)
# probas = np.array(prediction)
# labels = (probas > 0.5).astype(np.int)

probas = (prediction > 0.5).astype(int)
tags = multilabel.inverse_transform(probas)
# tags = multilabel.inverse_transform(labels)

print(prediction)
# print(labels)
print(tags)



[[9.7268796e-01 2.9078115e-05 9.9896812e-01]]
[('obligation', 'prohibition')]


In [19]:
# print((tup[0] for tup in tags))
# print('\n'.join([tup[0] for tup in tags]))
# print('\n'.join([tup[0] for tup in tags]))
print('\n'.join(tags[0]).upper())


OBLIGATION
PROHIBITION


In [20]:
print(*tags[0], sep=' ')

obligation prohibition


In [21]:
# Save tokenizer
joblib.dump(tokenizer, '../models/MultiLabelTokenizer.pkl')

['../models/MultiLabelTokenizer.pkl']

In [22]:
# Save binarizer
joblib.dump(multilabel, '../models/MultiLabelBinarizer_CNN.pkl')

['../models/MultiLabelBinarizer_CNN.pkl']

In [23]:
# Save the model
joblib.dump(model, '../models/MultiLabelModel_CNN.pkl')



INFO:tensorflow:Assets written to: ram://19351dd7-3171-4cf7-a044-2bb89389cfbf/assets


INFO:tensorflow:Assets written to: ram://19351dd7-3171-4cf7-a044-2bb89389cfbf/assets


['../models/MultiLabelModel_CNN.pkl']