In [1]:
import os
from os.path import join
import pickle as pkl

import torch

# Load processed data

In [2]:
def pkl_load(path):
    """Read the pickle file at ``path`` and return the object it contains.

    The file is opened in binary mode and is closed again before the
    deserialized object is handed back to the caller.

    Note: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    with open(path, mode='rb') as pickle_file:
        loaded = pkl.load(pickle_file)
    return loaded

All processed data are stored in the "processed_data" folder, which lives in the parent of the current working directory.

In [3]:
# Resolve the data folder: it sits in the parent of the current working directory.
parent_directory = os.path.dirname(os.getcwd())
processed_data_path = join(parent_directory, "processed_data")


# --- vocabulary ---
vocab_path = join(processed_data_path, "remove-stopwords-punct-25000.vocab")
vocab = torch.load(vocab_path)


# --- training / validation / test splits ---
# Build the three file paths in one pass, then unpickle each (X, y) pair.
train_path, val_path, test_path = (
    join(processed_data_path, file_name)
    for file_name in ("train.pickle", "val.pickle", "test.pickle")
)

X_train, y_train = pkl_load(train_path)
X_val, y_val = pkl_load(val_path)
X_test, y_test = pkl_load(test_path)


# --- pre-trained embedding matrices ---
word2vec_path, glove_path = (
    join(processed_data_path, file_name)
    for file_name in ("word2vec.pickle", "glove.pickle")
)

embed_matrix_word2vec = pkl_load(word2vec_path)
embed_matrix_glove = pkl_load(glove_path)

The X's are nested lists of the form list[list[int]]. <br>
Each inner list is one tokenized text, with every token represented by its index in the vocabulary.

In [4]:
# Peek at the first five token indices of the second training example.
X_train[1][:5]

[11, 52, 1450, 54, 910]

In [5]:
# Map those same five indices back to their token strings via the vocabulary.
vocab.lookup_tokens(X_train[1][:5])

['new', 'law', 'proposed', 'that', 'will']

The y's are lists of labels.

In [6]:
# Show the first five training labels.
y_train[:5]

[1, 2, 3, 1, 3]

# Get pre-trained embeddings

The embedding matrices have shape (len(vocab), embed_dim). Row i holds the pre-trained embedding of the token with index i.

In [7]:
# Print the matrix shape, then display the matrix itself as the cell's output.
print(embed_matrix_word2vec.shape)
embed_matrix_word2vec

(25002, 300)


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.25727392e-04, -1.01302849e-03, -1.04770563e-02, ...,
        -2.76348446e-02,  6.10916867e-02,  3.80460705e-02],
       [ 8.00781250e-02,  1.04980469e-01,  4.98046875e-02, ...,
         3.66210938e-03,  4.76074219e-02, -6.88476562e-02],
       ...,
       [ 3.03955078e-02,  3.32031250e-01,  5.27954102e-03, ...,
        -3.18359375e-01,  3.04687500e-01,  1.67968750e-01],
       [ 6.87500000e-01, -1.20605469e-01,  1.08642578e-02, ...,
        -2.20703125e-01,  2.85156250e-01, -1.57226562e-01],
       [-7.71484375e-02,  2.59765625e-01, -8.49609375e-02, ...,
        -7.71484375e-02, -1.57226562e-01,  1.22558594e-01]])

Row 0 is the embedding vector for \<PAD>. I use a zero vector to represent it since it does not carry any meaning.<br>
Row 1 is the embedding vector for \<UNK>. I use the mean of all pre-trained vectors to represent it since it could stand for any word.

In [8]:
# Display row 0 of the word2vec matrix (the row the markdown describes as <PAD>).
embed_matrix_word2vec[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.