In [1]:
import tensorflow as tf
# import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

[nltk_data] Downloading package punkt to /Users/juneechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Pretrained word vectors are fetched through gensim's downloader module.
import gensim.downloader

# Load the 'glove-wiki-gigaword-50' model by name; the resulting object is what
# util.get_embeddings_matrix receives in a later cell.
# NOTE(review): presumably 50-dimensional vectors, matching EMBEDDING_DIM = 50 — confirm.
glove_vectors = gensim.downloader.load(name='glove-wiki-gigaword-50')

In [3]:
# Locations of the SciHTC train / test / dev CSV splits, relative to this notebook.
TRAIN_PATH, TEST_PATH, DEV_PATH = (
    '../dataset/SciHTC/train_title_abstract_keywords.csv',
    '../dataset/SciHTC/test_title_abstract_keywords.csv',
    '../dataset/SciHTC/dev_title_abstract_keywords.csv',
)

In [21]:
# Data-preparation cell: reads the CSV splits, preprocesses the training frame,
# builds the tokenizer and embedding matrix, and produces the model arrays
# train_X / train_Y. All heavy lifting is delegated to the project-local `util` module.

# Sequence length handed to util.create_input_array; in the recorded run train_X has
# shape (1000, 350, 50), i.e. (SAMPLE_SIZE, MAX_LEN, EMBEDDING_DIM).
MAX_LEN = 350
# Width of each word vector; presumably must match the GloVe model loaded earlier — confirm.
EMBEDDING_DIM = 50
# Forwarded to util.preprocess_data as `sample_size`.
SAMPLE_SIZE = 1000

# read train and test data
train_df = util.read_data(TRAIN_PATH)
# NOTE(review): test_df is loaded here but never used again in this cell.
test_df = util.read_data(TEST_PATH)

# Text columns passed to util.preprocess_data as the model input fields.
input_cols = ['Title', 'Abstract']

# preprocess the training data, passing sample_size=SAMPLE_SIZE
# (presumably keeps that many rows — the recorded train_X has 1000 rows; confirm in util).
# NOTE(review): if the sampling is random, no seed is set in the cells shown — verify.
train_df = util.preprocess_data(train_df, input_cols, 'Keywords', sample_size=SAMPLE_SIZE)

# set up the tokenizer over the 'input_tokens' and 'clean_kp' columns
# (presumably both are created by util.preprocess_data — confirm)
tokenizer = util.setup_tokenizer(train_df, ['input_tokens', 'clean_kp'])

# get the embeddings matrix from the tokenizer and the loaded GloVe vectors
embeddings_matrix = util.get_embeddings_matrix(tokenizer, glove_vectors, EMBEDDING_DIM)
print("embeddings_matrix shape:", embeddings_matrix.shape)

# create the input array (train_X) and the target array (train_Y)
train_X, train_Y = util.create_input_array(train_df, 'input_tokens', 'clean_kp', tokenizer,
                                           embeddings_matrix, EMBEDDING_DIM, MAX_LEN)

embeddings_matrix shape: (13965, 50)


In [22]:
# Longest token sequence among the preprocessed training rows.
max_length = max(len(tokens) for tokens in train_df['input_tokens'])
print("max_length:", max_length)

# Spot checks: the tokenizer's index for one word, then the shape and one
# example row of the model inputs (train_X) and of the targets (train_Y).
print(tokenizer.word_index['present'])
print(train_X.shape)
print(train_X[0])
print(train_Y.shape)
print(train_Y[9])

max_length: 305
14
(1000, 350, 50)
[[ 0.29751     0.42748001 -1.10710001 ...  1.1904      0.19129001
   0.22145   ]
 [-0.085318   -0.55786002  0.85042    ...  0.098791    0.17428
   0.22194999]
 [ 0.77967    -0.17454     1.65769994 ... -0.003134   -0.28180999
  -0.48699999]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
(1000, 350)
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [29]:
# Build the bi-LSTM model.
# Input:  one (MAX_LEN, EMBEDDING_DIM) matrix of word vectors per document.
# Output: MAX_LEN independent sigmoid scores, the same width as the 0/1 rows of
#         train_Y (presumably one score per input position — confirm against util).
model = Sequential()

# NOTE(review): padded timesteps are all-zero vectors and no Masking layer is used,
# so both LSTMs also read the padding — confirm this is intended.
model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(MAX_LEN, EMBEDDING_DIM)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(MAX_LEN, activation='sigmoid'))

# Metrics are spelled out instead of using the string 'accuracy': Keras resolves that
# string from the output shape, and for a MAX_LEN-wide output it selects categorical
# (argmax) accuracy, which is not meaningful for multi-label 0/1 targets.
# BinaryAccuracy keeps the 'accuracy' name so the History keys stay the same.
# Precision and recall are added because the targets are very sparse (the recorded
# train_Y row contains a single 1), so accuracy alone says little.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)
model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_10 (Bidirect  (None, 350, 128)          58880     
 ional)                                                          
                                                                 
 bidirectional_11 (Bidirect  (None, 128)               98816     
 ional)                                                          
                                                                 
 dense_5 (Dense)             (None, 350)               45150     
                                                                 
Total params: 202846 (792.37 KB)
Trainable params: 202846 (792.37 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# Train on the prepared arrays: 3 epochs in mini-batches of 32.
# NOTE(review): no validation data is passed, so only training metrics are reported.
model.fit(x=train_X, y=train_Y, epochs=3, batch_size=32)

Epoch 1/3


2023-11-28 19:13:48.045198: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2c88a9ad0>

In [38]:
# NOTE(review): this whole cell is commented out, so the "pred:/actual:" output recorded
# beneath it presumably comes from an earlier run of this code.
# `test_x` and `test_kws` are not defined in any cell shown in this notebook.
# `preds` is computed from test_x[30:40], but the loop indexes test_x[i] and test_kws[i]
# with i starting at 0, so predictions and rows look misaligned by 30 — confirm before
# re-enabling.
# # testing prediction
# preds = model.predict(test_x[30:40])

# # print(preds[0])

# # print prediction
# for i in range(len(preds)):
#     print("pred:", util.pred_to_keywords(preds[i], test_x[i], tokenizer))
#     print("actual:", test_kws[i])
#     print('\n')


pred: ['massive']
actual: asic, cad, eda, layout, logic, mooc, vlsi


pred: ['and']
actual: attitudes, e-participation, gamification, public participation, usage behavior


pred: []
actual: anonymous, conversation, cues, voting


pred: []
actual: electromagnetism, evolutionary algorithms, multi-objective optimization, resource-constrained project scheduling


pred: ['the']
actual: consciousness, constraint, creativity, digital fine art, freedom


pred: []
actual: energy use, feedback, interaction design, persuasive computing, sustainability, visualization


pred: []
actual: guided search, model checking, verification


pred: []
actual: xml, digital preservation, integration, web service


pred: ['in', 'an']
actual: architecture, software ecosystem, software product lines, variability modeling


pred: ['although', 'sensor', 'networks']
actual: 3d-localization, delaunay triangulation, map construction, rssi, terrain modeling, wireless sensor networks


