In [1]:
import json

def load_data(file_path):
    X_data = []
    Y_data = []
    with open(file_path, 'r') as f:
        for line in f:
            example = json.loads(line.strip())
            X_data.append(example['text'])
            Y_data.append(example['label'])
    return X_data, Y_data

X_train, Y_train = load_data('../mgnns/train_all_anno.json')
X_test, Y_test = load_data('../mgnns/test_all_anno.json')
X_val, Y_val = load_data('../mgnns/val_all_anno.json')

X_train = [x.lower() for x in X_train]
X_test = [x.lower() for x in X_test]
X_val = [x.lower() for x in X_val]

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = '../mgnns/glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))


from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y_train_encoded = lb.fit_transform(Y_train)
Y_val_encoded = lb.transform(Y_val)
Y_test_encoded = lb.transform(Y_test)


In [3]:
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy', f1_score])
model.summary()

# save best model when training
checkpoint = ModelCheckpoint('bilstm.h5', monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[checkpoint, reduce_lr, early_stop])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          2571000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 256)         439296    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8

In [5]:
# evaluate the model
test_loss, test_acc, test_f1 = model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)


Test loss:  1.6201343536376953
Test accuracy:  0.599078357219696
Test f1 score:  0.5966647267341614


In [6]:
#evaluate the best model
from tensorflow.keras.models import load_model
best_model = load_model('bilstm.h5', custom_objects={'f1_score': f1_score})
test_loss, test_acc, test_f1 = best_model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)

Test loss:  1.1842362880706787
Test accuracy:  0.6290322542190552
Test f1 score:  0.6232410073280334


# multi

In [1]:
# load data from output_multi.csv
import pandas as pd
df = pd.read_csv('./output_multi.csv')

X = df['RawText'].tolist()
Y = df['Label'].tolist()

# split data, 8:1:1
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=.5, random_state=42)

X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]
X_val = [str(text) for text in X_val]

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = '../mgnns/glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))


from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y_train_encoded = lb.fit_transform(Y_train)
Y_val_encoded = lb.transform(Y_val)
Y_test_encoded = lb.transform(Y_test)


In [3]:

from tensorflow.keras import backend as K

def f1_score(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy', f1_score])
model.summary()

# save best model when training
checkpoint = ModelCheckpoint('bilstm_multi.h5', monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[checkpoint, reduce_lr, early_stop])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          4329000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 256)         439296    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8

In [5]:
# evaluate the model
test_loss, test_acc, test_f1 = model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)


Test loss:  1.0849939584732056
Test accuracy:  0.6870986819267273
Test f1 score:  0.6798893809318542


In [6]:
#evaluate the best model
from tensorflow.keras.models import load_model
best_model = load_model('bilstm_multi.h5', custom_objects={'f1_score': f1_score})
test_loss, test_acc, test_f1 = best_model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)

Test loss:  0.6697537899017334
Test accuracy:  0.7092819809913635
Test f1 score:  0.7079153656959534


# Ensemble learning

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

num_models = 5 # number of models in the ensemble

models = [] # list to hold the models
histories = [] # list to hold the training histories of the models

for i in range(num_models):
    model = Sequential()
    model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy', f1_score])

    checkpoint = ModelCheckpoint('bilstm_multi_{}.h5'.format(i), monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
    early_stop = EarlyStopping(monitor='val_loss', patience=10)

    history = model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[checkpoint, reduce_lr, early_stop])

    best_model = load_model('bilstm_multi_{}.h5'.format(i), custom_objects={'f1_score': f1_score})
    models.append(model)
    histories.append(history)


Epoch 1/30
Epoch 1: val_f1_score improved from -inf to 0.69344, saving model to bilstm_multi_0.h5
Epoch 2/30
Epoch 2: val_f1_score did not improve from 0.69344
Epoch 3/30
Epoch 3: val_f1_score improved from 0.69344 to 0.71341, saving model to bilstm_multi_0.h5
Epoch 4/30
Epoch 4: val_f1_score improved from 0.71341 to 0.71430, saving model to bilstm_multi_0.h5
Epoch 5/30
Epoch 5: val_f1_score did not improve from 0.71430
Epoch 6/30
Epoch 6: val_f1_score did not improve from 0.71430
Epoch 7/30
Epoch 7: val_f1_score did not improve from 0.71430
Epoch 8/30
Epoch 8: val_f1_score did not improve from 0.71430
Epoch 9/30
Epoch 9: val_f1_score did not improve from 0.71430
Epoch 10/30
Epoch 10: val_f1_score did not improve from 0.71430
Epoch 11/30
Epoch 11: val_f1_score did not improve from 0.71430
Epoch 12/30
Epoch 12: val_f1_score did not improve from 0.71430
Epoch 13/30
Epoch 13: val_f1_score did not improve from 0.71430
Epoch 1/30
Epoch 1: val_f1_score improved from -inf to 0.70826, saving m

In [11]:
for mod in models:
    test_loss, test_acc, test_f1 = mod.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
    print('Test loss: ', test_loss)
    print('Test accuracy: ', test_acc)
    print('Test f1 score: ', test_f1)

Test loss:  1.0549625158309937
Test accuracy:  0.6795096397399902
Test f1 score:  0.6776270866394043
Test loss:  0.891429603099823
Test accuracy:  0.6952714323997498
Test f1 score:  0.6949810981750488
Test loss:  1.006123423576355
Test accuracy:  0.6824284791946411
Test f1 score:  0.680793285369873
Test loss:  1.1993995904922485
Test accuracy:  0.6987740993499756
Test f1 score:  0.6993421912193298
Test loss:  3.8808786869049072
Test accuracy:  0.3765324056148529
Test f1 score:  0.37577807903289795


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np

# Get the prediction of each model on the validation data
predictions = []
for model in models:
    pred = model.predict(X_val_padded)
    predictions.append(pred)

# Stack predictions horizontally (each model's predictions are a column)
predictions = np.hstack(predictions)

# Train a logistic regression model on the stacked predictions
meta_model = LogisticRegression()
meta_model.fit(predictions, Y_val_encoded.argmax(axis=1))

# Get the predictions of each model on the test data
predictions_test = []
for model in models:
    pred = model.predict(X_test_padded)
    predictions_test.append(pred)

# Stack predictions horizontally (each model's predictions are a column)
predictions_test = np.hstack(predictions_test)

# Predict the classes on the test data using the meta-model
y_pred = meta_model.predict(predictions_test)

# Calculate and print the F1 score
macro_f1_score = f1_score(Y_test_encoded.argmax(axis=1), y_pred, average='macro')
print('F1 score: ', macro_f1_score)


F1 score:  0.36214364635367985


In [15]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import Precision, Recall
import numpy as np

num_models = 5  # number of models in the ensemble

# Function to create and train a BiLSTM model
def create_and_train_bilstm_model():
    model = Sequential()
    model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy'])

    # checkpoint = ModelCheckpoint('bilstm_multi.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
    early_stop = EarlyStopping(monitor='val_loss', patience=10)

    model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[reduce_lr, early_stop])
    
    return model

# Train multiple BiLSTM models and save their predictions
models = []
train_predictions = []
for i in range(num_models):
    model = create_and_train_bilstm_model()
    models.append(model)
    pred = model.predict(X_train_padded)
    train_predictions.append(pred)

# Stack predictions horizontally (each model's predictions are a column)
train_predictions = np.hstack(train_predictions)

# Define the architecture of the meta-model
inputs = Input(shape=(train_predictions.shape[1],))
x = Dense(64, activation='relu')(inputs)
outputs = Dense(3, activation='softmax')(x)
meta_model = Model(inputs, outputs)

# Compile the meta-model
meta_model.compile(optimizer=RMSprop(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the meta-model on the training predictions and the true labels
meta_model.fit(train_predictions, Y_train_encoded, epochs=30, batch_size=32)

# Get the predictions of each model on the test data
test_predictions = []
for model in models:
    pred = model.predict(X_test_padded)
    test_predictions.append(pred)

# Stack predictions horizontally (each model's predictions are a column)
test_predictions = np.hstack(test_predictions)

# Predict the classes on the test data using the meta-model
y_pred = np.argmax(meta_model.predict(test_predictions), axis=-1)

# Calculate and print the F1 score
macro_f1_score = f1_score(Y_test_encoded.argmax(axis=1), y_pred, average='macro')
print('F1 score: ', macro_f1_score)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/3