In [1]:
import tensorflow as tf
# import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

[nltk_data] Downloading package punkt to /Users/juneechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# get word embeddings from the gensim package
import gensim.downloader

# download the glove embeddings
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [3]:
TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'
DEV_PATH = '../dataset/SciHTC/dev_title_abstract_keywords.csv'

In [4]:
MAX_LEN = 350
EMBEDDING_DIM = 50
SAMPLE_SIZE = 1000

# read train and test data
train_df = util.read_data(TRAIN_PATH)
test_df = util.read_data(TEST_PATH)


In [5]:
# import pandas as pd
# # create a test dataframe of 3 rows with 'Title', 'Abstract', 'Keywords' columns
# titles = ['first sample', 'second sample', 'third sample']
# abstracts = ['He likes dogs. Dogs like him.', 'Hi, how are you? I\'m fine', 'Cactus is a plant. It is green. It has thorns.']
# keywords = ['likes dog, cat', 'hi, bye', 'cactus, plant']
# test_df = pd.DataFrame({'Title': titles, 'Abstract': abstracts, 'Keywords': keywords})


In [6]:

input_cols = ['Title', 'Abstract']

# process the data and sample some for testing
train_df = util.preprocess_data(train_df, input_cols, 'Keywords', sample_size=SAMPLE_SIZE)
test_df = util.preprocess_data(test_df, input_cols, 'Keywords', sample_size=SAMPLE_SIZE)
# test_df = util.preprocess_data(test_df, input_cols, 'Keywords')

# set up the tokenizer
tokenizer = util.setup_tokenizer(train_df, test_df, ['input_tokens', 'clean_kp'])

# create embeddings matrix
embeddings_matrix = util.get_embeddings_matrix(tokenizer, glove_vectors, EMBEDDING_DIM)
print("embeddings_matrix shape:", embeddings_matrix.shape)

# create the input array
train_X, train_Y = util.create_input_array(train_df, 'input_tokens', 'clean_kp', tokenizer,
                                           embeddings_matrix, EMBEDDING_DIM, MAX_LEN)

test_X, test_Y = util.create_input_array(test_df, 'input_tokens', 'clean_kp', tokenizer,
                                            embeddings_matrix, EMBEDDING_DIM, MAX_LEN)

embeddings_matrix shape: (21759, 50)


In [7]:
print(test_df.head(1))
print(test_Y[0])

           id                                              Title  \
7782  2802639  Exploring Digital Environments for Research in...   

                                   Subtitle  \
7782  Metaphorical Models of Sustainability   

                                               Abstract  \
7782  This paper is a reflection on the meaning assu...   

                                               Keywords  \
7782  ['metaphor', 'digital ecosystem', 'digital env...   

                                           input_tokens  \
7782  [exploring, digital, environment, research, ph...   

                                               clean_kp  
7782  [metaphor, digital ecosystem, digital environm...  
[0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [8]:
print("Number of words in the vocabulary:", len(tokenizer.word_index))

print(tokenizer.index_word[0])

Number of words in the vocabulary: 21759
<PAD>


In [9]:
# find the max length of the input sequences
max_length = max([len(seq) for seq in train_df['input_tokens']])
print("samples max length:", max_length)

# print(tokenizer.word_index['present'])
print(train_X.shape)
# print(train_X[0])
print(train_Y.shape)
# print(train_Y[0])

samples max length: 352
(1000, 350, 50)
(1000, 350)


In [10]:
# build bi-LSTM model
model = Sequential()

model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(MAX_LEN, EMBEDDING_DIM)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(MAX_LEN, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


2023-11-28 21:03:26.750433: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2023-11-28 21:03:26.750455: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-11-28 21:03:26.750465: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-11-28 21:03:26.750502: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-28 21:03:26.750522: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 350, 128)          58880     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 350)               45150     
                                                                 
Total params: 202846 (792.37 KB)
Trainable params: 202846 (792.37 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# fit model
model.fit(train_X, train_Y, batch_size=32, epochs=3)

Epoch 1/3


2023-11-28 21:03:29.756317: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2c829f8d0>

In [20]:
# convert prediction to keywords
def pred_to_keywords(pred, input_seq, tokenizer):
    # convert prediction to binary with a threshold of 0.5
    threshold = 0.5
    binary_pred = (pred > threshold).astype(int)
    
    print()
    print(binary_pred)
    
    # convert binary prediction to keywords
    keywords = []
    for i in range(len(binary_pred)):
        # break reaches the end of the sequence before the padding part
        if input_seq[i] == 0:
            break
        if binary_pred[i] == 1:
            # print("keyword at:", i)
            # print("int rep.:", input_seq[i])
            keywords.append(input_seq[i])
    
    return keywords

In [32]:
# print the 10th from test_df['input_tokens']
print(test_df['input_tokens'][10])

KeyError: 10

In [30]:
# testing prediction
preds = model.predict(test_X[30:40])

print(len(preds))

print(preds[2])

# print prediction
for i in range(len(preds)):
    # preds = pred_to_keywords(preds[i], test_X[i], tokenizer)
    # print("actual:", test_Y[i])
    # print('\n')


10
[0.26231793 0.48207167 0.44653508 0.3935192  0.3690132  0.34691653
 0.33166832 0.23550364 0.21490908 0.20689146 0.18273778 0.22209914
 0.20141366 0.16921656 0.15237446 0.16120364 0.22195382 0.1874457
 0.15076537 0.13494745 0.13237597 0.15855315 0.14306295 0.16324091
 0.1474096  0.14197162 0.13881092 0.15788628 0.1246698  0.17907822
 0.16858432 0.14331064 0.14530788 0.13737303 0.11727284 0.11831699
 0.11315963 0.15240444 0.13218646 0.15536463 0.17109409 0.14549209
 0.13875984 0.10446545 0.12877697 0.13565186 0.1237336  0.11148472
 0.10239498 0.13656078 0.1241136  0.11387753 0.11202489 0.16075163
 0.10138713 0.13736664 0.12498087 0.12998569 0.12132523 0.09023359
 0.10487595 0.12905304 0.12570801 0.11449626 0.08769716 0.1036898
 0.11866491 0.10122868 0.10173535 0.1235968  0.11912727 0.10580643
 0.08250937 0.09242459 0.12357452 0.0856499  0.09371005 0.09540904
 0.09427721 0.1170148  0.08527968 0.08033369 0.06775986 0.06803577
 0.0719155  0.07403903 0.06141327 0.05858996 0.06588446 0.075