In [23]:
import os

import jsonlines
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SimpleRNN, LSTM, GRU, Dense, Dropout
from sklearn.metrics import f1_score, accuracy_score

### Baseline - 3 : Deep Learning Models

For the deep learning baseline, we implemented and evaluated three types of recurrent models on the dataset: RNNs, LSTMs and GRUs. Taking inspiration from the ACL 2020 Diplomacy paper, which used an LSTM, we extended the approach to other deep learning models as well. The main idea is that the model needs some form of memory in order to perform well. We use the same evaluation metrics as the paper (Accuracy, Macro F1 and Lie F1) so that the results can be compared properly.

In [24]:
class Preprocessing_class:
    """Load a JSON Lines file of dialogs and turn it into texts, labels and padded sequences.

    Each record is read as a dict with the keys ``'messages'``,
    ``'sender_labels'`` and ``'receiver_labels'``; the three lists are indexed
    in parallel, one entry per message.

    Parameters
    ----------
    file_path : str
        Path of the ``.jsonl`` file to read.
    label_type : str, default "sender"
        ``"sender"`` (case-insensitive) selects the sender label; any other
        value selects the receiver label.
    num_words : int, default 10000
        Passed to the Keras ``Tokenizer`` as ``num_words``.
    max_len : int, default 100
        Length every sequence is padded/truncated to.
    """

    def __init__(self, file_path, label_type="sender", num_words=10000, max_len=100):
        self.file_path = file_path
        self.label_type = label_type.lower()
        self.num_words = num_words
        self.max_len = max_len
        self.data = None                 # raw records, filled by load_data()
        self.aggregated_messages = None  # flat per-message dicts, filled by aggregate_data()
        self.tokenizer = Tokenizer(num_words=self.num_words, oov_token="<OOV>")

    def load_data(self):
        """Read every record of ``file_path`` into ``self.data`` and return it."""
        with jsonlines.open(self.file_path, 'r') as reader:
            self.data = list(reader)
        return self.data

    def process_dialog(self, dialog):
        """Flatten one dialog into a list of ``{'message', 'sender', 'receiver'}`` dicts.

        Raises ``IndexError`` when a label list is shorter than the message list.
        """
        messages = dialog.get('messages', [])
        senders = dialog.get('sender_labels', [])
        receivers = dialog.get('receiver_labels', [])
        return [
            {'message': msg, 'sender': senders[i], 'receiver': receivers[i]}
            for i, msg in enumerate(messages)
        ]

    def aggregate_dialogs(self):
        """Return the messages of all dialogs as one flat list, loading the file if needed."""
        if self.data is None:
            self.load_data()
        return [msg for dialog in self.data for msg in self.process_dialog(dialog)]

    def aggregate_data(self):
        """Compute, cache in ``self.aggregated_messages`` and return the flat message list."""
        self.aggregated_messages = self.aggregate_dialogs()
        return self.aggregated_messages

    def to_bool(self, label):
        """Convert a bool or a ``'true'``/``'false'`` string (any case) to a bool."""
        if isinstance(label, bool):
            return label
        return label.lower() == 'true'

    def is_valid_label(self, label):
        """Return True only for values that ``to_bool`` converts meaningfully.

        Accepts real booleans and the strings ``'true'``/``'false'`` in any
        case. Everything else is rejected, including the ints 0/1 (which
        compare equal to ``False``/``True`` but have no ``.lower()``) and
        unhashable values.
        """
        if isinstance(label, bool):
            return True
        return isinstance(label, str) and label.lower() in ('true', 'false')

    def filter_message(self, msg):
        """Return ``msg`` with both labels converted to bool, or None if either label is invalid.

        NOTE(review): the receiver label must be valid even when
        ``label_type == "sender"``, so messages without a true/false receiver
        label are dropped for both tasks -- confirm this is intended.
        """
        sender = msg.get('sender')
        receiver = msg.get('receiver')
        # Validate the sender as well: an unchecked value would either crash in
        # to_bool() or be silently encoded as class 1.
        if not (self.is_valid_label(sender) and self.is_valid_label(receiver)):
            return None
        return {
            'message': msg['message'],
            'sender': self.to_bool(sender),
            'receiver': self.to_bool(receiver),
        }

    def filter_valid_messages(self):
        """Return the filtered messages, aggregating the data first if needed."""
        if self.aggregated_messages is None:
            self.aggregate_data()
        return [filtered for msg in self.aggregated_messages
                if (filtered := self.filter_message(msg)) is not None]

    def get_text_and_labels(self):
        """Return ``(texts, labels)`` for all valid messages.

        A label is 0 when the selected (sender or receiver) label is True and
        1 when it is False.
        """
        valid_msgs = self.filter_valid_messages()
        label_key = 'sender' if self.label_type == "sender" else 'receiver'
        texts = [msg['message'] for msg in valid_msgs]
        labels = [0 if msg[label_key] else 1 for msg in valid_msgs]
        return texts, labels

    def tokenize_and_pad(self, texts, fit=True):
        """Convert ``texts`` to post-padded/post-truncated sequences of length ``max_len``.

        Parameters
        ----------
        texts : list of str
            Texts to encode.
        fit : bool, default True
            When True the tokenizer is fitted on ``texts`` first (original
            behaviour). Pass False to encode further texts with the tokenizer
            as it is, without updating it.
        """
        if fit:
            self.tokenizer.fit_on_texts(texts)
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_len, padding='post', truncating='post')

In [25]:
class DL_model_class:
    """Build, train and evaluate bidirectional recurrent binary classifiers.

    Parameters
    ----------
    X_train, X_val, X_test
        Model inputs for the three splits, passed to Keras unchanged
        (presumably padded integer sequences of length ``max_len`` -- confirm
        against the caller).
    y_train, y_val, y_test
        Binary labels for the three splits; converted to NumPy arrays.
    vocab_size : int
        Input dimension of the embedding layer.
    max_len : int
        Sequence length declared for the model input.
    """

    # Recurrent cell names accepted by build_model() / run_model().
    SUPPORTED_CELL_TYPES = ("RNN", "LSTM", "GRU")

    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test, vocab_size, max_len):
        self.X_train = X_train
        self.y_train = np.array(y_train)
        self.X_val = X_val
        self.y_val = np.array(y_val)
        self.X_test = X_test
        self.y_test = np.array(y_test)
        self.vocab_size = vocab_size
        self.max_len = max_len

    def build_model(self, cell_type="RNN"):
        """Return a compiled model: Embedding -> Bidirectional(cell) -> Dense -> Dropout -> sigmoid.

        Parameters
        ----------
        cell_type : str, default "RNN"
            One of ``"RNN"`` (SimpleRNN), ``"LSTM"`` or ``"GRU"``.

        Raises
        ------
        ValueError
            If ``cell_type`` is not supported. Previously an unknown value
            silently produced a model without any recurrent layer.
        """
        # Validate before touching Keras so a typo fails fast with a clear message.
        if cell_type not in self.SUPPORTED_CELL_TYPES:
            raise ValueError(
                f"Unsupported cell_type {cell_type!r}; expected one of {self.SUPPORTED_CELL_TYPES}"
            )
        recurrent_layers = {"RNN": SimpleRNN, "LSTM": LSTM, "GRU": GRU}

        model = Sequential()
        # An explicit Input layer replaces Embedding's `input_length` argument,
        # which is deprecated in Keras 3.
        model.add(tf.keras.Input(shape=(self.max_len,), dtype="int32"))
        model.add(Embedding(self.vocab_size, 64))
        model.add(Bidirectional(recurrent_layers[cell_type](64, return_sequences=False)))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def run_model(self, cell_type="RNN", epochs=5, batch_size=32):
        """Train a fresh model of the given cell type and evaluate it on the test split.

        Prints the test accuracy, macro F1 and the F1 of class 1 (reported as
        "Lie F1"), each rounded to 3 decimals.

        Returns
        -------
        dict
            Unrounded ``'loss'``, ``'accuracy'``, ``'macro_f1'`` and
            ``'lie_f1'`` as Python floats.
        """
        model = self.build_model(cell_type)
        print(f"\nTraining {cell_type} model...")
        model.fit(self.X_train, self.y_train, epochs=epochs, batch_size=batch_size,
                  validation_data=(self.X_val, self.y_val), verbose=2)
        print(f"Evaluating {cell_type} model on test set...")
        loss, accuracy = model.evaluate(self.X_test, self.y_test, verbose=0)
        # The model outputs one probability per row; threshold at 0.5 and flatten
        # to a 1-D label vector for scikit-learn.
        preds = (model.predict(self.X_test) > 0.5).astype("int32").ravel()
        macro_f1 = f1_score(self.y_test, preds, average='macro', zero_division=0)
        lie_f1 = f1_score(self.y_test, preds, pos_label=1, average='binary', zero_division=0)
        print(f"{cell_type} - Test Accuracy: {round(accuracy,3)}, Macro F1: {round(macro_f1,3)}, Lie F1: {round(lie_f1,3)}")
        print("-" * 50)
        return {
            "loss": float(loss),
            "accuracy": float(accuracy),
            "macro_f1": float(macro_f1),
            "lie_f1": float(lie_f1),
        }

    def run_birnn(self, epochs=5, batch_size=32):
        """Train and evaluate the bidirectional SimpleRNN model; returns the metrics dict."""
        return self.run_model(cell_type="RNN", epochs=epochs, batch_size=batch_size)

    def run_bilstm(self, epochs=5, batch_size=32):
        """Train and evaluate the bidirectional LSTM model; returns the metrics dict."""
        return self.run_model(cell_type="LSTM", epochs=epochs, batch_size=batch_size)

    def run_bigru(self, epochs=5, batch_size=32):
        """Train and evaluate the bidirectional GRU model; returns the metrics dict."""
        return self.run_model(cell_type="GRU", epochs=epochs, batch_size=batch_size)

In [26]:
# Directory containing train.jsonl / validation.jsonl / test.jsonl.
# Set the DIPLOMACY_DATA_DIR environment variable to run the notebook on another
# machine; the default is the original location.
DATA_DIR = os.environ.get('DIPLOMACY_DATA_DIR', '/Users/varun/Desktop/College/sem6/NLP/Group Project/Data')

# Preprocessing settings shared by all three splits.
LABEL_TYPE = "sender"
NUM_WORDS = 10000
MAX_LEN = 100

train_file = os.path.join(DATA_DIR, 'train.jsonl')
val_file = os.path.join(DATA_DIR, 'validation.jsonl')
test_file = os.path.join(DATA_DIR, 'test.jsonl')


def _load_split(file_path):
    """Load one split and return ``(preprocessor, texts, labels)``."""
    split = Preprocessing_class(file_path, label_type=LABEL_TYPE, num_words=NUM_WORDS, max_len=MAX_LEN)
    split.load_data()
    split.aggregate_data()
    texts, labels = split.get_text_and_labels()
    return split, texts, labels


train_set, train_texts, train_labels = _load_split(train_file)
val_set, val_texts, val_labels = _load_split(val_file)
test_class, test_texts, test_labels = _load_split(test_file)

# The tokenizer is fitted on the training texts only.
X_train = train_set.tokenize_and_pad(train_texts)


def _encode_with_train_tokenizer(texts):
    """Encode ``texts`` with the tokenizer fitted on the training split (no refitting)."""
    sequences = train_set.tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=train_set.max_len, padding='post', truncating='post')


X_val = _encode_with_train_tokenizer(val_texts)
X_test = _encode_with_train_tokenizer(test_texts)

vocab_size = train_set.num_words

baseline_3 = DL_model_class(X_train, train_labels, X_val, val_labels, X_test, test_labels, vocab_size, train_set.max_len)

In [27]:
# Train and evaluate the bidirectional SimpleRNN baseline (7 epochs, batch size 32).
baseline_3.run_birnn(epochs=7, batch_size=32)


Training RNN model...
Epoch 1/7




376/376 - 6s - 17ms/step - accuracy: 0.9541 - loss: 0.2042 - val_accuracy: 0.9610 - val_loss: 0.1676
Epoch 2/7
376/376 - 6s - 17ms/step - accuracy: 0.9547 - loss: 0.1920 - val_accuracy: 0.9610 - val_loss: 0.1645
Epoch 3/7
376/376 - 6s - 17ms/step - accuracy: 0.9548 - loss: 0.1723 - val_accuracy: 0.9610 - val_loss: 0.1793
Epoch 4/7
376/376 - 6s - 16ms/step - accuracy: 0.9567 - loss: 0.1332 - val_accuracy: 0.9602 - val_loss: 0.2100
Epoch 5/7
376/376 - 6s - 16ms/step - accuracy: 0.9746 - loss: 0.0800 - val_accuracy: 0.9532 - val_loss: 0.2173
Epoch 6/7
376/376 - 6s - 16ms/step - accuracy: 0.9849 - loss: 0.0478 - val_accuracy: 0.9579 - val_loss: 0.3211
Epoch 7/7
376/376 - 6s - 16ms/step - accuracy: 0.9830 - loss: 0.0504 - val_accuracy: 0.9252 - val_loss: 0.3896
Evaluating RNN model on test set...
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
RNN - Test Accuracy: 0.872, Macro F1: 0.503, Lie F1: 0.076
--------------------------------------------------


In [28]:
# Train and evaluate the bidirectional LSTM baseline (7 epochs, batch size 32).
baseline_3.run_bilstm(epochs=7, batch_size=32)


Training LSTM model...
Epoch 1/7




376/376 - 14s - 37ms/step - accuracy: 0.9538 - loss: 0.2025 - val_accuracy: 0.9610 - val_loss: 0.1617
Epoch 2/7
376/376 - 14s - 38ms/step - accuracy: 0.9548 - loss: 0.1745 - val_accuracy: 0.9602 - val_loss: 0.1649
Epoch 3/7
376/376 - 15s - 40ms/step - accuracy: 0.9595 - loss: 0.1349 - val_accuracy: 0.9470 - val_loss: 0.2007
Epoch 4/7
376/376 - 15s - 39ms/step - accuracy: 0.9686 - loss: 0.1034 - val_accuracy: 0.9454 - val_loss: 0.2295
Epoch 5/7
376/376 - 15s - 40ms/step - accuracy: 0.9732 - loss: 0.0866 - val_accuracy: 0.9564 - val_loss: 0.2832
Epoch 6/7
376/376 - 15s - 39ms/step - accuracy: 0.9801 - loss: 0.0648 - val_accuracy: 0.9299 - val_loss: 0.3464
Epoch 7/7
376/376 - 15s - 39ms/step - accuracy: 0.9830 - loss: 0.0545 - val_accuracy: 0.9392 - val_loss: 0.2766
Evaluating LSTM model on test set...
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
LSTM - Test Accuracy: 0.891, Macro F1: 0.524, Lie F1: 0.106
--------------------------------------------------


In [29]:
# Train and evaluate the bidirectional GRU baseline (7 epochs, batch size 32).
baseline_3.run_bigru(epochs=7, batch_size=32)


Training GRU model...
Epoch 1/7




376/376 - 15s - 40ms/step - accuracy: 0.9543 - loss: 0.2064 - val_accuracy: 0.9610 - val_loss: 0.1631
Epoch 2/7
376/376 - 15s - 40ms/step - accuracy: 0.9548 - loss: 0.1703 - val_accuracy: 0.9610 - val_loss: 0.1730
Epoch 3/7
376/376 - 16s - 41ms/step - accuracy: 0.9568 - loss: 0.1323 - val_accuracy: 0.9595 - val_loss: 0.1827
Epoch 4/7
376/376 - 16s - 43ms/step - accuracy: 0.9660 - loss: 0.1040 - val_accuracy: 0.9564 - val_loss: 0.2062
Epoch 5/7
376/376 - 15s - 41ms/step - accuracy: 0.9738 - loss: 0.0786 - val_accuracy: 0.9602 - val_loss: 0.2565
Epoch 6/7
376/376 - 15s - 41ms/step - accuracy: 0.9822 - loss: 0.0563 - val_accuracy: 0.9462 - val_loss: 0.3199
Epoch 7/7
376/376 - 15s - 41ms/step - accuracy: 0.9867 - loss: 0.0414 - val_accuracy: 0.9306 - val_loss: 0.3566
Evaluating GRU model on test set...
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
GRU - Test Accuracy: 0.889, Macro F1: 0.523, Lie F1: 0.104
--------------------------------------------------
