In [1]:
import tensorflow as tf
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

In [2]:
# Locations of the SciHTC train / test / dev CSV splits, relative to this notebook.
TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'
DEV_PATH = '../dataset/SciHTC/dev_title_abstract_keywords.csv'

# Fixed number of token ids per example; used as `maxlen` when padding sequences.
MAX_LEN = 350

In [3]:
# Load the training split through the project-local helper.
# NOTE(review): util.read_data is not visible here; later cells index the result
# by the 'Abstract' and 'Keywords' columns, so it is presumably a pandas DataFrame.
df = util.read_data(TRAIN_PATH)


In [4]:
# Run util.clean_keywords over every row of the 'Keywords' column.
df['Keywords'] = df['Keywords'].apply(util.clean_keywords)

# Smoke test on the first 5 rows: tokenized abstracts and their cleaned keywords.
abstract = util.tokenize_sentence(df['Abstract'].head(5))
keywords = df['Keywords'].head(5)

print(abstract, keywords, sep='\n')

[['The', 'present', 'paper', 'discusses', 'how', 'clone', 'sets', 'can', 'be', 'generated', 'from', 'an', 'very', 'large', 'amount', 'of', 'source', 'code', '.', 'The', 'knowledge', 'of', 'clone', 'sets', 'can', 'help', 'to', 'manage', 'software', 'asset', '.', 'For', 'example', ',', 'we', 'can', 'figure', 'out', 'the', 'state', 'of', 'the', 'asset', 'easier', ',', 'or', 'we', 'can', 'build', 'more', 'useful', 'libraries', 'based', 'on', 'the', 'knowledge', '.'], ['Based', 'on', 'the', 'important', 'progresses', 'made', 'in', 'information', 'retrieval', '(', 'IR', ')', 'in', 'terms', 'of', 'theoretical', 'models', 'and', 'evaluations', ',', 'more', 'and', 'more', 'attention', 'has', 'recently', 'been', 'paid', 'to', 'the', 'research', 'in', 'domain', 'specific', 'IR', ',', 'as', 'evidenced', 'by', 'the', 'organization', 'of', 'Genomics', 'and', 'Legal', 'tracks', 'in', 'TREC', '(', 'Text', 'REtrieval', 'Conference', ')', '.', 'We', 'think', 'that', 'now', 'is', 'the', 'right', 'time', 

In [7]:
# Build the vocabulary from the pre-tokenized sample abstracts and map every
# token to its integer id. `Tokenizer` is the class imported at the top of the
# notebook (same class as tf.keras.preprocessing.text.Tokenizer).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(abstract)
sequences = tokenizer.texts_to_sequences(abstract)

# Pad or cut every sequence to exactly MAX_LEN ids. Both padding and truncation
# are applied at the END so that token i of an abstract always stays in column i.
# pad_sequences truncates from the FRONT by default, which would shift any
# abstract longer than MAX_LEN relative to labels indexed from the sequence start.
train_x = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=MAX_LEN, padding='post', truncating='post'
)

# Per-token keyword labels. MAX_LEN is passed instead of a literal 350 so the
# label length always follows the padded input length.
# NOTE(review): util.keywords_marking is project-local and not visible here; its
# printed output suggests labels are indexed by token position from the start
# of each sequence - confirm against util.
train_y = util.keywords_marking(keywords, sequences, MAX_LEN, tokenizer)


keyword_phrase: code clone, reengineering for libraries
phrases: ['code clone', 'reengineering for libraries']
phrase_tokens: [[16, 77], [10, 151]]
token: 16
at: 17
token: 77
at: 5
token: 10
at: 31
token: 151
at: 51
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [6]:
# Sanity checks on the tokenizer output.
print(sequences)

# Length of the longest encoded abstract
# (tokenizer.word_index holds the full vocabulary if it needs inspecting).
max_length = max(map(len, sequences))
print(max_length)

# Round-trip lookups: word -> id, then id -> word.
print(tokenizer.word_index['present'])
print(tokenizer.index_word[51])

# First encoded abstract next to its label vector.
print(sequences[0], train_y[0], sep='\n')

[[1, 76, 51, 138, 139, 77, 78, 19, 29, 140, 79, 22, 80, 36, 141, 4, 142, 16, 5, 1, 52, 4, 77, 78, 19, 81, 7, 143, 144, 82, 5, 10, 145, 2, 9, 19, 146, 83, 1, 147, 4, 1, 82, 148, 2, 23, 9, 19, 149, 24, 150, 151, 53, 30, 1, 52, 5], [53, 30, 1, 84, 152, 153, 3, 54, 85, 11, 25, 12, 3, 154, 4, 155, 156, 6, 86, 2, 24, 6, 24, 157, 37, 158, 26, 159, 7, 1, 17, 3, 55, 87, 25, 2, 13, 160, 38, 1, 88, 4, 161, 6, 162, 163, 3, 89, 11, 164, 85, 165, 12, 5, 9, 166, 14, 167, 15, 1, 168, 90, 7, 169, 83, 36, 170, 86, 30, 171, 172, 3, 31, 7, 173, 1, 17, 3, 20, 25, 3, 174, 6, 20, 91, 25, 3, 92, 5, 175, 2, 9, 39, 1, 88, 4, 8, 20, 25, 32, 3, 89, 3, 31, 7, 93, 1, 176, 3, 20, 6, 91, 25, 5, 3, 40, 177, 51, 2, 9, 76, 1, 17, 178, 9, 56, 93, 3, 1, 57, 32, 2, 41, 179, 180, 4, 1, 57, 32, 2, 6, 1, 181, 4, 42, 33, 9, 39, 10, 1, 32, 5, 9, 182, 30, 1, 58, 4, 8, 94, 20, 59, 42, 27, 183, 4, 60, 184, 2, 95, 2, 20, 59, 42, 6, 20, 59, 185, 42, 5], [96, 1, 97, 60, 186, 2, 8, 187, 188, 4, 17, 3, 189, 190, 191, 11, 43, 12, 37, 26