In [3]:
import tensorflow as tf
# import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

[nltk_data] Downloading package punkt to /Users/juneechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# get word embeddings from the gensim package
import gensim.downloader

# download the glove embeddings
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [14]:
TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'
DEV_PATH = '../dataset/SciHTC/dev_title_abstract_keywords.csv'

In [15]:
MAX_LEN = 350
EMBEDDING_DIM = 50
SAMPLE_SIZE = 1000


In [17]:

# read train and test data
train_df = util.read_data(TRAIN_PATH)
test_df = util.read_data(TEST_PATH)

input_cols = ['Title', 'Abstract']

# process the data and sample some for testing; it changes to df in place. 
# Reload df if running again
train_df = util.preprocess_data(train_df, input_cols, 'Keywords', sample_size=SAMPLE_SIZE)
test_df = util.preprocess_data(test_df, input_cols, 'Keywords', sample_size=SAMPLE_SIZE)
# test_df = util.preprocess_data(test_df, input_cols, 'Keywords')

# set up the tokenizer
tokenizer = util.setup_tokenizer(train_df, test_df, ['input_tokens', 'clean_kp'])

# create embeddings matrix
embeddings_matrix = util.get_embeddings_matrix(tokenizer, glove_vectors, EMBEDDING_DIM)

# create the input array
train_X, train_Y = util.create_input_array(train_df, 'input_tokens', 'clean_kp', tokenizer,
                                           embeddings_matrix, EMBEDDING_DIM, MAX_LEN)

test_X, test_Y = util.create_input_array(test_df, 'input_tokens', 'clean_kp', tokenizer,
                                            embeddings_matrix, EMBEDDING_DIM, MAX_LEN)


embeddings_matrix shape: (21944, 50)


In [41]:
print("embeddings_matrix shape:", embeddings_matrix.shape)
print("train_X shape:", train_X.shape)
print("train_Y shape:", train_Y.shape)
print("test_X shape:", test_X.shape)
print("test_Y shape:", test_Y.shape)

# print(test_df.head(1))
# print(test_Y[0])

print("Number of words in the vocabulary:", len(tokenizer.word_index))
# print(tokenizer.index_word[0])

# find the max length of the input sequences
max_length = max([len(seq) for seq in train_df['input_tokens']])
print("original samples max length:", max_length)

embeddings_matrix shape: (21944, 50)
train_X shape: (1000, 350, 50)
train_Y shape: (1000, 350)
test_X shape: (1000, 350, 50)
test_Y shape: (1000, 350)
Number of words in the vocabulary: 21944
original samples max length: 448


In [23]:
# build bi-LSTM model
model = Sequential()

model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(MAX_LEN, EMBEDDING_DIM)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(MAX_LEN, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirecti  (None, 350, 128)          58880     
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 350)               45150     
                                                                 
Total params: 202846 (792.37 KB)
Trainable params: 202846 (792.37 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# fit model
model.fit(train_X, train_Y, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x297e42010>

In [39]:
# testing prediction
preds = model.predict(test_X)

print(len(preds))

# print prediction
pred_kws = util.pred_to_keywords(preds, test_df['input_tokens'].values)
print(pred_kws)

1000
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []