In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import util as util

# Width of each word vector. Passed to util.get_embeddings_matrix and
# util.create_input_array in a later cell, so it must agree with the
# dimensionality of the GloVe model that the notebook loads.
EMBEDDING_DIM = 50


# TRAIN_PATH = '../dataset/SciHTC/train_title_abstract_keywords.csv'
# TEST_PATH = '../dataset/SciHTC/test_title_abstract_keywords.csv'

# # read the first 3 rows from a csv file into a dataframe
# df = pd.read_csv(TRAIN_PATH, nrows=3)

# # get the column names
# columns = ['Title', 'Abstract']

# # Concatenate two columns using the apply function
# # df['Concatenated'] = df[columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1 )
# df['input_tokens'] = df[columns].agg(' '.join, axis=1)

# # preprocess and tokenize the combined column
# df['input_tokens'] = df['input_tokens'].apply(lambda row: util.clean_text(row))

# for index, row in df.iterrows():
#     print(row['Title'])
#     print(row['Abstract'])
#     print(row['Keywords'])
#     # print(row['Concatenated'])
#     print('------------------')

# print(type(df['input_tokens']))

# # convert the input_tokens column to a numpy array
# input_tokens = df['input_tokens'].to_numpy()
# print(type(input_tokens))
# print(input_tokens.shape)
# print(df['input_tokens'][0])


[nltk_data] Downloading package punkt to /Users/juneechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juneechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load pretrained GloVe word vectors through gensim's downloader API.
import gensim.downloader

# Build the model name from EMBEDDING_DIM instead of repeating the literal 50,
# so the loaded vectors always have the width that later cells pass to the
# util helpers. gensim-data publishes this family in 50, 100, 200 and 300
# dimensions; any other value makes load() fail instead of silently mismatching.
GLOVE_MODEL_NAME = f'glove-wiki-gigaword-{EMBEDDING_DIM}'

# Downloads the model on first use; later runs read gensim's local cache.
glove_vectors = gensim.downloader.load(GLOVE_MODEL_NAME)

In [3]:
# Passed as the last argument to util.create_input_array in a later cell.
# Presumably the fixed per-sample sequence length (the recorded X.shape has 10
# on its second axis) — TODO confirm against util.
MAX_LEN = 10

In [4]:
# Three-row toy corpus with 'Title', 'Abstract' and 'Keywords' columns, used to
# exercise the util pipeline end to end without reading any dataset file.
titles = ["first sample", "second sample", "third sample"]
abstracts = [
    "He likes dogs. So do I.",
    "Hi, how are you? I'm fine",
    "Cactus is a plant. It is green. It has thorns.",
]
keywords = ["likes dog, cat", "hi, bye", "cactus, plant"]
test_df = pd.DataFrame(
    {"Title": titles, "Abstract": abstracts, "Keywords": keywords}
)

# Column names shared by the util calls below.
TEXT_COLUMNS = ["Title", "Abstract"]
TOKEN_COLUMN = "input_tokens"
KEYPHRASE_COLUMN = "clean_kp"

# The return value is not captured: the next calls read TOKEN_COLUMN and
# KEYPHRASE_COLUMN from test_df, so this call is relied on to add them in place.
util.preprocess_data(test_df, TEXT_COLUMNS, "Keywords")

# Fit the tokenizer on both the tokenized text and the cleaned keyphrases.
tokenizer = util.setup_tokenizer(test_df, [TOKEN_COLUMN, KEYPHRASE_COLUMN])

# Build the embeddings matrix for the tokenizer vocabulary from the GloVe vectors.
embeddings_matrix = util.get_embeddings_matrix(
    tokenizer, glove_vectors, EMBEDDING_DIM
)

print(f"embeddings_matrix shape: {embeddings_matrix.shape}")
print(embeddings_matrix[0])

# Turn the dataframe into the X / Y arrays inspected in the following cell.
X, Y = util.create_input_array(
    test_df,
    TOKEN_COLUMN,
    KEYPHRASE_COLUMN,
    embeddings_matrix,
    tokenizer,
    EMBEDDING_DIM,
    MAX_LEN,
)

embeddings_matrix shape: (17, 50)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [5]:
# For each array returned by util.create_input_array (X first, then Y),
# print its shape followed by its first entry.
for arr in (X, Y):
    print(arr.shape)
    print(arr[0])

(2, 10, 50)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [6]:
# Inspect the fitted vocabulary: its size first, then the full word -> index map.
# In the recorded run the map contains a '<PAD>': 0 entry, so the reported size
# counts the padding token as well.
word_index = tokenizer.word_index
print(f"tokenizer vocab size: {len(word_index)}")
print(word_index)

tokenizer vocab size: 17
{'sample': 1, 'hi': 2, 'cactus': 3, 'plant': 4, 'first': 5, 'like': 6, 'dog': 7, 'second': 8, 'im': 9, 'fine': 10, 'third': 11, 'green': 12, 'thorn': 13, 'like dog': 14, 'cat': 15, 'bye': 16, '<PAD>': 0}


In [7]:
# Print the whole toy frame (only three rows). The recorded output shows the
# 'input_tokens' and 'clean_kp' columns alongside the original three, which
# presumably were added by the earlier util.preprocess_data call.
print(test_df)

           Title                                        Abstract  \
0   first sample                         He likes dogs. So do I.   
1  second sample                       Hi, how are you? I'm fine   
2   third sample  Cactus is a plant. It is green. It has thorns.   

         Keywords                                  input_tokens  \
0  likes dog, cat                    [first, sample, like, dog]   
1         hi, bye                [second, sample, hi, im, fine]   
2   cactus, plant  [third, sample, cactus, plant, green, thorn]   

          clean_kp  
0  [like dog, cat]  
1        [hi, bye]  
2  [cactus, plant]  
