In [1]:
import tensorflow as tf
# import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

In [2]:
TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'
DEV_PATH = '../dataset/SciHTC/dev_title_abstract_keywords.csv'

MAX_LEN = 350

In [3]:
# read train and test data
train_df = util.read_data(TRAIN_PATH)
test_df = util.read_data(TEST_PATH)


In [16]:
# apply clean_keywords to each row of df['Keywords']
train_df['Keywords'] = train_df['Keywords'].apply(util.clean_keywords)
test_df['Keywords'] = test_df['Keywords'].apply(util.clean_keywords)

# set up train data and labels, test on part of the train samples
train_abs = util.tokenize_sentence(train_df.head(50000)['Abstract'])
train_kws = train_df.head(50000)['Keywords']

# set up train data and labels
# train_abs = util.tokenize_sentence(train_df['Abstract'])
# train_kws = train_df['Keywords']

# set up test data and labels
test_abs = util.tokenize_sentence(test_df['Abstract'])
test_kws = test_df['Keywords']

In [17]:
print(train_abs[:5])
print(train_kws[:5])

[['The', 'present', 'paper', 'discusses', 'how', 'clone', 'sets', 'can', 'be', 'generated', 'from', 'an', 'very', 'large', 'amount', 'of', 'source', 'code', '.', 'The', 'knowledge', 'of', 'clone', 'sets', 'can', 'help', 'to', 'manage', 'software', 'asset', '.', 'For', 'example', ',', 'we', 'can', 'figure', 'out', 'the', 'state', 'of', 'the', 'asset', 'easier', ',', 'or', 'we', 'can', 'build', 'more', 'useful', 'libraries', 'based', 'on', 'the', 'knowledge', '.'], ['Based', 'on', 'the', 'important', 'progresses', 'made', 'in', 'information', 'retrieval', '(', 'IR', ')', 'in', 'terms', 'of', 'theoretical', 'models', 'and', 'evaluations', ',', 'more', 'and', 'more', 'attention', 'has', 'recently', 'been', 'paid', 'to', 'the', 'research', 'in', 'domain', 'specific', 'IR', ',', 'as', 'evidenced', 'by', 'the', 'organization', 'of', 'Genomics', 'and', 'Legal', 'tracks', 'in', 'TREC', '(', 'Text', 'REtrieval', 'Conference', ')', '.', 'We', 'think', 'that', 'now', 'is', 'the', 'right', 'time', 

In [18]:
print(len(train_abs))
print(len(train_kws))
print(len(test_abs))
print(len(test_kws))

50000
50000
18616
18616


In [19]:
# set up tokenizer, fit on both train and test data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_abs)
tokenizer.fit_on_texts(test_abs)

# set up train sequences (with padding)
train_sequences = tokenizer.texts_to_sequences(train_abs)
train_x = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post')
# set up train labels
train_y = util.keywords_marking(train_kws, train_x, 350, tokenizer)

# set up test sequences (with padding)
test_sequences = tokenizer.texts_to_sequences(test_abs)
test_x = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post')
# set up test labels
test_y = util.keywords_marking(test_kws, test_x, 350, tokenizer)


In [20]:
# run some checks
print(train_sequences[:5])

# print(tokenizer.word_index)

# find the max length of the sequences
max_length = max([len(seq) for seq in train_sequences])
print(max_length)

# find index of a word 
print(tokenizer.word_index['present'])
# find word at index 
print(tokenizer.index_word[51])

print(train_sequences[0])
print(train_y[0])

[[1, 57, 25, 1419, 59, 5163, 494, 22, 23, 484, 27, 20, 209, 110, 502, 4, 311, 140, 2, 1, 153, 4, 5163, 494, 22, 284, 6, 1290, 73, 5302, 2, 10, 357, 3, 9, 22, 4217, 343, 1, 314, 4, 1, 5302, 1908, 3, 34, 9, 22, 545, 54, 431, 1379, 39, 14, 1, 153, 2], [39, 14, 1, 134, 8113, 518, 8, 37, 321, 17, 2082, 15, 8, 260, 4, 829, 106, 5, 1305, 3, 54, 5, 54, 516, 41, 537, 68, 3584, 6, 1, 81, 8, 308, 268, 2082, 3, 18, 9112, 21, 1, 1176, 4, 7241, 5, 2562, 3022, 8, 3417, 17, 327, 321, 2258, 15, 2, 9, 3157, 12, 976, 11, 1, 1616, 61, 6, 2355, 343, 110, 475, 1305, 14, 7519, 533, 8, 135, 6, 2111, 1, 81, 8, 4917, 2082, 8, 281, 5, 4917, 4625, 2082, 8, 223, 2, 2969, 3, 9, 77, 1, 1176, 4, 7, 4917, 2082, 1314, 8, 3417, 8, 135, 6, 258, 1, 222, 8, 4917, 5, 4625, 2082, 2, 8, 13, 906, 25, 3, 9, 57, 1, 81, 631, 9, 90, 258, 8, 1, 72, 1314, 3, 24, 571, 1635, 4, 1, 72, 1314, 3, 5, 1, 1360, 4, 108, 218, 9, 77, 10, 1, 1314, 2, 9, 285, 14, 1, 40, 4, 7, 53, 4917, 1465, 108, 162, 1782, 4, 66, 11820, 3, 594, 3, 4917, 1465, 1

In [21]:
# build bi-LSTM model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, output_dim=100, input_length=MAX_LEN))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(MAX_LEN, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 350, 100)          14339300  
                                                                 
 bidirectional_2 (Bidirecti  (None, 350, 128)          84480     
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 350)               45150     
                                                                 
Total params: 14567746 (55.57 MB)
Trainable params: 14567746 (55.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# fit model
model.fit(train_x, train_y, batch_size=32, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2d11730d0>

In [38]:
# testing prediction
preds = model.predict(test_x[30:40])

# print(preds[0])

# print prediction
for i in range(len(preds)):
    print("pred:", util.pred_to_keywords(preds[i], test_x[i], tokenizer))
    print("actual:", test_kws[i])
    print('\n')


pred: ['massive']
actual: asic, cad, eda, layout, logic, mooc, vlsi


pred: ['and']
actual: attitudes, e-participation, gamification, public participation, usage behavior


pred: []
actual: anonymous, conversation, cues, voting


pred: []
actual: electromagnetism, evolutionary algorithms, multi-objective optimization, resource-constrained project scheduling


pred: ['the']
actual: consciousness, constraint, creativity, digital fine art, freedom


pred: []
actual: energy use, feedback, interaction design, persuasive computing, sustainability, visualization


pred: []
actual: guided search, model checking, verification


pred: []
actual: xml, digital preservation, integration, web service


pred: ['in', 'an']
actual: architecture, software ecosystem, software product lines, variability modeling


pred: ['although', 'sensor', 'networks']
actual: 3d-localization, delaunay triangulation, map construction, rssi, terrain modeling, wireless sensor networks


