In [1]:
import json

def load_data(file_path):
    X_data = []
    Y_data = []
    with open(file_path, 'r') as f:
        for line in f:
            example = json.loads(line.strip())
            X_data.append(example['text'])
            Y_data.append(example['label'])
    return X_data, Y_data

X_train, Y_train = load_data('../mgnns/train_all_anno.json')
X_test, Y_test = load_data('../mgnns/test_all_anno.json')
X_val, Y_val = load_data('../mgnns/val_all_anno.json')

X_train = [x.lower() for x in X_train]
X_test = [x.lower() for x in X_test]
X_val = [x.lower() for x in X_val]

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = '../mgnns/glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))


from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y_train_encoded = lb.fit_transform(Y_train)
Y_val_encoded = lb.transform(Y_val)
Y_test_encoded = lb.transform(Y_test)


In [3]:
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=True))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy', f1_score])
model.summary()

# save best model when training
checkpoint = ModelCheckpoint('lstm_avg_single.h5', monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[checkpoint, reduce_lr, early_stop])


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 300)          2571000   
                                                                 
 lstm_8 (LSTM)               (None, 100, 128)          219648    
                                                                 
 dropout_12 (Dropout)        (None, 100, 128)          0         
                                                                 
 lstm_9 (LSTM)               (None, 100, 64)           49408     
                                                                 
 global_average_pooling1d_4   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_13 (Dropout)        (None, 64)                0         
                                                      

In [17]:
# evaluate the model
test_loss, test_acc, test_f1 = model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)


Test loss:  0.9418776035308838
Test accuracy:  0.6290322542190552
Test f1 score:  0.6036418676376343


In [18]:
#evaluate the best model
from tensorflow.keras.models import load_model
best_model = load_model('lstm_avg_single.h5', custom_objects={'f1_score': f1_score})
test_loss, test_acc, test_f1 = best_model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)

Test loss:  0.9004533290863037
Test accuracy:  0.5806451439857483
Test f1 score:  0.5438573956489563


# multi

In [7]:
# load data from output_multi.csv
import pandas as pd
df = pd.read_csv('./output_multi.csv')

X = df['RawText'].tolist()
Y = df['Label'].tolist()

# split data, 8:1:1
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=.5, random_state=42)

X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]
X_val = [str(text) for text in X_val]

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = '../mgnns/glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))


from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y_train_encoded = lb.fit_transform(Y_train)
Y_val_encoded = lb.transform(Y_val)
Y_test_encoded = lb.transform(Y_test)


In [9]:

from tensorflow.keras import backend as K

def f1_score(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=True))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss=CategoricalCrossentropy(), optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy', f1_score])
model.summary()

# save best model when training
checkpoint = ModelCheckpoint('lstm_avg_multi.h5', monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(X_train_padded, Y_train_encoded, epochs=30, batch_size=32, validation_data=(X_val_padded, Y_val_encoded), callbacks=[checkpoint, reduce_lr, early_stop])


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 300)          4329000   
                                                                 
 lstm_2 (LSTM)               (None, 100, 128)          219648    
                                                                 
 dropout_3 (Dropout)         (None, 100, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 100, 64)           49408     
                                                                 
 global_average_pooling1d_1   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                      

In [11]:
# evaluate the model
test_loss, test_acc, test_f1 = model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)


Test loss:  0.7731484770774841
Test accuracy:  0.6596614122390747
Test f1 score:  0.6521373391151428


In [12]:
#evaluate the best model
from tensorflow.keras.models import load_model
best_model = load_model('lstm_avg_multi.h5', custom_objects={'f1_score': f1_score})
test_loss, test_acc, test_f1 = best_model.evaluate(X_test_padded, Y_test_encoded, batch_size=64)
print('Test loss: ', test_loss)
print('Test accuracy: ', test_acc)
print('Test f1 score: ', test_f1)

Test loss:  0.6647664904594421
Test accuracy:  0.707530677318573
Test f1 score:  0.7078070640563965
