In [1]:
import tensorflow as tf
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import util as util

In [2]:
TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'
DEV_PATH = '../dataset/SciHTC/dev_title_abstract_keywords.csv'

MAX_LEN = 350

In [3]:
df = util.read_data(TRAIN_PATH)


In [4]:
print(df.head(5)['Abstract'])

0    The present paper discusses how clone sets can...
1    Based on the important progresses made in info...
2    Over the past two decades, a growing body of r...
3     Refactoring is an important way to improve th...
4    Productivity is the main concern of today's IT...
Name: Abstract, dtype: object


In [5]:
# s1 = "i am cat"
# s2 = "i am dog"
# s3 = "i am human"
# sentences = [s1, s2, s3]    
# tokenizer = tf.keras.preprocessing.text.Tokenizer()
# tokenizer = tf.keras.preprocessing.text.Tokenizer(sentences)
# tokenizer.fit_on_texts(sentences)
# sequences = tokenizer.texts_to_sequences(sentences)


In [6]:
abstract = util.tokenize_sentence(df.head(5)['Abstract'])

In [7]:
print(abstract)

[['The', 'present', 'paper', 'discusses', 'how', 'clone', 'sets', 'can', 'be', 'generated', 'from', 'an', 'very', 'large', 'amount', 'of', 'source', 'code', '.', 'The', 'knowledge', 'of', 'clone', 'sets', 'can', 'help', 'to', 'manage', 'software', 'asset', '.', 'For', 'example', ',', 'we', 'can', 'figure', 'out', 'the', 'state', 'of', 'the', 'asset', 'easier', ',', 'or', 'we', 'can', 'build', 'more', 'useful', 'libraries', 'based', 'on', 'the', 'knowledge', '.'], ['Based', 'on', 'the', 'important', 'progresses', 'made', 'in', 'information', 'retrieval', '(', 'IR', ')', 'in', 'terms', 'of', 'theoretical', 'models', 'and', 'evaluations', ',', 'more', 'and', 'more', 'attention', 'has', 'recently', 'been', 'paid', 'to', 'the', 'research', 'in', 'domain', 'specific', 'IR', ',', 'as', 'evidenced', 'by', 'the', 'organization', 'of', 'Genomics', 'and', 'Legal', 'tracks', 'in', 'TREC', '(', 'Text', 'REtrieval', 'Conference', ')', '.', 'We', 'think', 'that', 'now', 'is', 'the', 'right', 'time', 

In [8]:

# tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(abstract)
sequences = tokenizer.texts_to_sequences(abstract)
train_x = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

In [9]:
print(type(df.head(5)['Keywords'][1]))
# remove [, ], ', and split by comma
# keywords = [keywords.replace('[', '').replace(']', '').replace('\'', '').split(', ') for keywords in df.head(5)['Keywords']]


<class 'str'>


In [10]:
# print(keywords)

# apply clean_keywords to each row of df['Keywords']
df['Keywords'] = df['Keywords'].apply(util.clean_keywords)

In [11]:

keywords = df.head(5)['Keywords']
print(keywords)

# convert keywords to list of list
for each in keywords:
    print(each)

0     [[code, clone], [reengineering, for, libraries]]
1    [[chemical, information, retrieval], [chemistr...
2    [[ambulatory, assessment], [borderline, person...
3    [[refactoring], [software, evolution], [vector...
4    [[information, context], [productivity], [task...
Name: Keywords, dtype: object
[['code', 'clone'], ['reengineering', 'for', 'libraries']]
[['chemical', 'information', 'retrieval'], ['chemistry'], ['patent', 'retrieval'], ['prior', 'art', 'search']]
[['ambulatory', 'assessment'], ['borderline', 'personality', 'disorder'], ['clinical', 'psychology'], ['everyday', 'life'], ['interactive', 'feedback'], ['real-time', 'data', 'interpretation'], ['unobtrusive', 'objective', 'assessment']]
[['refactoring'], ['software', 'evolution'], ['vector-based', 'representation']]
[['information', 'context'], ['productivity'], ['task'], ['task', 'modeling']]


In [12]:
kw = [['code', 'clone'], ['reengineering', 'for', 'libraries']]
kw_seq = tokenizer.texts_to_sequences(kw)

In [13]:
print(kw_seq)

[[16, 77], [10, 151]]


In [21]:

def keywords_marking_exact(keyword_phrases: list, sequence: list, max_len: int, tokenizer: Tokenizer):
    '''
    Mark a sequence of tokens for the exact keyword phrases. 
    If the keyword phrase as a whole is not in the sequence, it will not be marked.
    
    params:
        keyword_phrases: a list of keyword phrases, each phrase is a list of tokens
    return:
        a list of binary labels of the same length as input sequence, 1 for keyword phrases, 0 for others
    '''
    print("phrases", keyword_phrases)
    print(">>>")
    binary_labels = [0] * max_len

    # for phrase in keyword_phrases:
    #     if phrase == []:
    #         continue
    #     print("phrase", phrase)
    #     # convert phrase tokens to sequence tokens
    #     phrase_tokens = tokenizer.texts_to_sequences(phrase) # [0] because texts_to_sequences returns a list of lists
    #     print("phrase tokens", phrase_tokens)
    #     print()
    #     # see if the phrase is in the sequence
    #     for i in range(len(sequence) - len(phrase_tokens) + 1):
    #         if sequence[i:i+len(phrase_tokens)] == phrase_tokens: # matching the whole phrase
    #             binary_labels[i:i+len(phrase_tokens)] = [1] * len(phrase_tokens)

    phrase_tokens = tokenizer.texts_to_sequences(keyword_phrases)
    print("phrase tokens", phrase_tokens)

    return binary_labels

def keywords_marking(keywords: list, sequences: list, max_len: int, tokenizer: Tokenizer):
    binary_labels = []
    for i in range(len(keywords)):
        binary_labels.append(keywords_marking_exact(keywords[i], sequences[i], max_len, tokenizer))
    return binary_labels

In [22]:
print(sequences)

[[1, 76, 51, 138, 139, 77, 78, 19, 29, 140, 79, 22, 80, 36, 141, 4, 142, 16, 5, 1, 52, 4, 77, 78, 19, 81, 7, 143, 144, 82, 5, 10, 145, 2, 9, 19, 146, 83, 1, 147, 4, 1, 82, 148, 2, 23, 9, 19, 149, 24, 150, 151, 53, 30, 1, 52, 5], [53, 30, 1, 84, 152, 153, 3, 54, 85, 11, 25, 12, 3, 154, 4, 155, 156, 6, 86, 2, 24, 6, 24, 157, 37, 158, 26, 159, 7, 1, 17, 3, 55, 87, 25, 2, 13, 160, 38, 1, 88, 4, 161, 6, 162, 163, 3, 89, 11, 164, 85, 165, 12, 5, 9, 166, 14, 167, 15, 1, 168, 90, 7, 169, 83, 36, 170, 86, 30, 171, 172, 3, 31, 7, 173, 1, 17, 3, 20, 25, 3, 174, 6, 20, 91, 25, 3, 92, 5, 175, 2, 9, 39, 1, 88, 4, 8, 20, 25, 32, 3, 89, 3, 31, 7, 93, 1, 176, 3, 20, 6, 91, 25, 5, 3, 40, 177, 51, 2, 9, 76, 1, 17, 178, 9, 56, 93, 3, 1, 57, 32, 2, 41, 179, 180, 4, 1, 57, 32, 2, 6, 1, 181, 4, 42, 33, 9, 39, 10, 1, 32, 5, 9, 182, 30, 1, 58, 4, 8, 94, 20, 59, 42, 27, 183, 4, 60, 184, 2, 95, 2, 20, 59, 42, 6, 20, 59, 185, 42, 5], [96, 1, 97, 60, 186, 2, 8, 187, 188, 4, 17, 3, 189, 190, 191, 11, 43, 12, 37, 26

In [23]:
# keywords = util.tokenize_sentence(df.head(5)['Keywords'])
print(len(df.head(5)['Keywords']))

train_y = keywords_marking(keywords, sequences, 350, tokenizer)
print(train_y[0])
# 

5
phrases [['code', 'clone'], ['reengineering', 'for', 'libraries']]
>>>
phrase tokens [[16, 77], [10, 151]]
phrases [['chemical', 'information', 'retrieval'], ['chemistry'], ['patent', 'retrieval'], ['prior', 'art', 'search']]
>>>
phrase tokens [[20, 54, 85], [171], [91, 85], [42]]
phrases [['ambulatory', 'assessment'], ['borderline', 'personality', 'disorder'], ['clinical', 'psychology'], ['everyday', 'life'], ['interactive', 'feedback'], ['real-time', 'data', 'interpretation'], ['unobtrusive', 'objective', 'assessment']]
>>>
phrase tokens [[44, 21], [189, 190, 191], [284], [201, 202], [222, 107], [213, 98], [215, 102, 21]]
phrases [['refactoring'], ['software', 'evolution'], ['vector-based', 'representation']]
>>>
phrase tokens [[18], [144], []]
phrases [['information', 'context'], ['productivity'], ['task'], ['task', 'modeling']]
>>>
phrase tokens [[54, 403], [74], [27], [27]]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [31]:
print(len(sequences))
print(df.head(5))

5
        id                                              Title  \
0  1808918  Toward identifying inter-project clone sets fo...   
1  1458577  A proposal for chemical information retrieval ...   
2  2307865  What does psychology and psychiatry need from ...   
3  2610394  Scalable detection of missed cross-function re...   
4  1838054  Exploiting information context to improve prod...   

                  Subtitle                                           Abstract  \
0                      NaN  The present paper discusses how clone sets can...   
1                      NaN  Based on the important progresses made in info...   
2  an end-user perspective  Over the past two decades, a growing body of r...   
3                      NaN   Refactoring is an important way to improve th...   
4                      NaN  Productivity is the main concern of today's IT...   

                                            Keywords  
0      ['code clone', 'reengineering for libraries']  
1  ['chemi

In [29]:
print(sequences)
print(tokenizer.word_index)
# find the max length of the sequences
max_length = max([len(seq) for seq in sequences])
print(max_length)
# find index of the word 
print(tokenizer.word_index['present'])
# find the word at index 51
print(tokenizer.index_word[51])

[[1, 76, 51, 138, 139, 77, 78, 19, 29, 140, 79, 22, 80, 36, 141, 4, 142, 16, 5, 1, 52, 4, 77, 78, 19, 81, 7, 143, 144, 82, 5, 10, 145, 2, 9, 19, 146, 83, 1, 147, 4, 1, 82, 148, 2, 23, 9, 19, 149, 24, 150, 151, 53, 30, 1, 52, 5], [53, 30, 1, 84, 152, 153, 3, 54, 85, 11, 25, 12, 3, 154, 4, 155, 156, 6, 86, 2, 24, 6, 24, 157, 37, 158, 26, 159, 7, 1, 17, 3, 55, 87, 25, 2, 13, 160, 38, 1, 88, 4, 161, 6, 162, 163, 3, 89, 11, 164, 85, 165, 12, 5, 9, 166, 14, 167, 15, 1, 168, 90, 7, 169, 83, 36, 170, 86, 30, 171, 172, 3, 31, 7, 173, 1, 17, 3, 20, 25, 3, 174, 6, 20, 91, 25, 3, 92, 5, 175, 2, 9, 39, 1, 88, 4, 8, 20, 25, 32, 3, 89, 3, 31, 7, 93, 1, 176, 3, 20, 6, 91, 25, 5, 3, 40, 177, 51, 2, 9, 76, 1, 17, 178, 9, 56, 93, 3, 1, 57, 32, 2, 41, 179, 180, 4, 1, 57, 32, 2, 6, 1, 181, 4, 42, 33, 9, 39, 10, 1, 32, 5, 9, 182, 30, 1, 58, 4, 8, 94, 20, 59, 42, 27, 183, 4, 60, 184, 2, 95, 2, 20, 59, 42, 6, 20, 59, 185, 42, 5], [96, 1, 97, 60, 186, 2, 8, 187, 188, 4, 17, 3, 189, 190, 191, 11, 43, 12, 37, 26