In [1]:
import torch
from torch.autograd import Variable

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

%load_ext autoreload
%autoreload 2

In [4]:
import functions as fn
import torch_utils as utils


In [2]:
df = pd.read_csv('clean_data/clean_data.csv', index_col=0)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1.0,Thousands,NNS,O
1,1.0,of,IN,O
2,1.0,demonstrators,NNS,O
3,1.0,have,VBP,O
4,1.0,marched,VBN,O


In [3]:
for r in df.values[:10]:
    print(r[1])

Thousands
of
demonstrators
have
marched
through
London
to
protest
the


In [4]:
# we'll first turn our dataframe to a list of lists of tuples, since this is the infinitely
# more convenient data structure for neural nets and torch

df.Word.to_list()

def df_to_torch_list(df):
    """Function takes in dataframe with four columns:
    Sentence #; Word; POS; Tag.
    -------------------------------------------------
    Returns: 
    - input_data as a list of lists (each a sentence) of tuples
    where each tuple is (word; POS)
    - target_data - list of lists (each a sentence) of Named 
    Entity Tags (e.g. 'O', 'B-geo', 'I-art', etc)
    """
    
    input_data = []
    target_data = []
    data = df.copy()
    for sent_ind in range(1,len(data['Sentence #'].unique().astype(int))):
        sent_df = data.loc[data['Sentence #'] == sent_ind]
        sent_lst = []
        sent_target_lst = []
        for row in sent_df.values:
            sent_lst.append((row[1], row[2]))
            sent_target_lst.append(row[3])
        input_data.append(sent_lst)
        target_data.append(sent_target_lst)
    return input_data, target_data
        
        

In [5]:
%time
input_data, target_data = utils.df_to_torch_list(df)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.11 µs


In [6]:
input_data[1]
# target_data[0]

[('Families', 'NNS'),
 ('of', 'IN'),
 ('soldiers', 'NNS'),
 ('killed', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('conflict', 'NN'),
 ('joined', 'VBD'),
 ('the', 'DT'),
 ('protesters', 'NNS'),
 ('who', 'WP'),
 ('carried', 'VBD'),
 ('banners', 'NNS'),
 ('with', 'IN'),
 ('such', 'JJ'),
 ('slogans', 'NNS'),
 ('as', 'IN'),
 ('"', '.'),
 ('Bush', 'NNP'),
 ('Number', 'NN'),
 ('One', 'CD'),
 ('Terrorist', 'NN'),
 ('"', '.'),
 ('and', 'CC'),
 ('"', '.'),
 ('Stop', 'VB'),
 ('the', 'DT'),
 ('Bombings', 'NNS'),
 ('.', '.'),
 ('"', '.')]

In [7]:
len(target_data)

2998

In [8]:
len(input_data)

2998

In [9]:
len(input_data[2548])

70

We know what lenght of our longest sentence is. This is important since our neural net will require all inputs to be of equal length and we'll pad shorter sentences to length. 

## Train test split

Before any preprocessing occurs we will split our data into training, validation and test datasets. This might seem strange to do before creating the vocabulary, but this way, when we do get to validation and testing stage, we'll be able to see just how much 'unknown' words will impact on the model's performance. 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_sents, test_sents, train_labels, test_labels = train_test_split(input_data, target_data, test_size=.2, shuffle=False)
train_sents, valid_sents, train_labels, valid_labels = train_test_split(train_sents, train_labels, test_size=.15, shuffle=False)

### Preprocessing into OHE
Workflow for data preprocessing for pytorch (based on the following [resource posted by Andrew Ng and Stanford University's CS Department](https://cs230.stanford.edu/blog/namedentity/)). 

1. Create a unique vocabulary dict
2. Create an NE tags dict
3. Turning text data into lists of ints
4. Using a batch generator to turn lists into Torch Tensors
5. Specifying a lookup table for turning tensors to embedded arrays. 

In [44]:
def generate_int_vocab(input_data : list):
    """Function takes in list (corpus) of lists (sentences) of dicts (words) of tuples (word, POS) and returns
    a single vocabulary dict.
    Returns:
    vocab_dict - (dict) word - unique integer pairs."""
    vocab_dict = {}
    i=1
    for sentence in input_data:
        for word_pos_tuple in sentence:
            if word_pos_tuple not in vocab_dict.keys():
                vocab_dict[word_pos_tuple[0]] = i
                i +=1
                continue
            else: 
                continue
        vocab_dict['UNK'] = i+1
        vocab_dict['PAD'] = 0
    return vocab_dict

In [12]:
vocab_test = utils.generate_int_vocab(train_sents[:5])
vocab_test

{'Thousands': 1,
 'of': 106,
 'demonstrators': 3,
 'have': 4,
 'marched': 56,
 'through': 6,
 'London': 7,
 'to': 62,
 'protest': 85,
 'the': 101,
 'war': 11,
 'in': 100,
 'Iraq': 13,
 'and': 48,
 'demand': 15,
 'withdrawal': 17,
 'British': 19,
 'troops': 20,
 'from': 57,
 'that': 22,
 'country': 23,
 '.': 108,
 'UNK': 110,
 'PAD': 0,
 'Families': 25,
 'soldiers': 27,
 'killed': 28,
 'conflict': 31,
 'joined': 32,
 'protesters': 34,
 'who': 35,
 'carried': 36,
 'banners': 37,
 'with': 38,
 'such': 39,
 'slogans': 40,
 'as': 41,
 '"': 54,
 'Bush': 43,
 'Number': 44,
 'One': 45,
 'Terrorist': 46,
 'Stop': 50,
 'Bombings': 52,
 'They': 55,
 'Houses': 59,
 'Parliament': 61,
 'a': 63,
 'rally': 64,
 'Hyde': 66,
 'Park': 67,
 'Police': 69,
 'put': 70,
 'number': 72,
 'marchers': 74,
 'at': 75,
 '10,000': 76,
 'while': 77,
 'organizers': 78,
 'claimed': 79,
 'it': 80,
 'was': 81,
 '1,00,000': 82,
 'The': 84,
 'comes': 86,
 'on': 87,
 'eve': 89,
 'annual': 92,
 'conference': 93,
 'Britain': 9

In [13]:
%time
vocabulary_dict = utils.generate_int_vocab(train_sents)
len(vocabulary_dict)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


7117

Having previously run the vocab generator on the entire dataset, the number of unique terms was 8766. Now reduced to 7115 we can gauge that at least 1.5k words will be be assigned the unknown tag across the validation and test set. 

In [46]:
%time

def generate_tag_dict(targets_list):
    """Function takes in a list of NE tags (which should include at least one instance of 
    every possible NE tag) and returns a dict matching each NE tag to a unique int.
    Returns:
    tag_map - (dict) of NE tag - associated int pairs"""
    ne_dict = {}
    i = 0
    for sublist in targets_list:
        for ne in sublist:
            if ne in ne_dict.keys():
                continue
            else:
                ne_dict[ne] = i
                i += 1
    return ne_dict

In [15]:
ne_dict = utils.generate_tag_dict(train_labels)
len(ne_dict)

17

In [47]:
def sent_to_ints(feature_list : list, targets_list : list, vocab : dict, ne_dict : dict, incl_POS = False, POS_dict = None):
    """Function takes in list (corpus) of lists (sentences) of dicts (words) of tuples (word, POS) and returns
    a list (corpus) of lists (sentences) of integers (representing words).
    Returns:
    list_data."""
    int_sentences = []        
    int_label_sentences = []
    
    for sentence in feature_list:     
        #replace each token by its index if it is in vocab
        #else use index of UNK
        sentence_ints = [vocab[token[0].lower()] if token[0].lower() in vocab.keys() 
             else vocab['UNK']
             for token in sentence]
        int_sentences.append(sentence_ints)
        
    for sentence in targets_list:
        #replace each label by its index
        label_sent = [ne_dict[label] for label in sentence]
        int_label_sentences.append(label_sent) 
        
        
    if incl_POS:
        int_sentences_POS = []
        for sentence in feature_list:     
        #replace each token by its index if it is in vocab
        #else use index of UNK
            sentence_POS_int = [vocab[token[1]] if token[1] in POS_dict.keys() 
                 else POS_dict['UNK_POS']
                 for token in sentence]
            int_sentences_POS.append(sentence_POS_int)
        return int_sentences, int_sentences_POS, int_label_sentences
        
    else:     
        return int_sentences, int_label_sentences

In [16]:
%time
# for now, we will only be generating the feature data w/out the POS tags
train_int_sentences, train_int_label_sentences = utils.sent_to_ints(train_sents, train_labels, vocabulary_dict, ne_dict)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


And now, a quick visual inspection to make sure everything has worked according to plan. 

In [17]:
train_sents[0][:4]

[('Thousands', 'NNS'), ('of', 'IN'), ('demonstrators', 'NNS'), ('have', 'VBP')]

In [18]:
print(vocabulary_dict['Thousands'])
print(vocabulary_dict['of'])
print(vocabulary_dict['demonstrators'])

44346
45240
44310


In [19]:
train_int_sentences[0][:4]

[43858, 45240, 44310, 45232]

### Batch generation
We'll be defining a new object class of batch generators which will section our data and turn several sentences at a time to Torch Tensor objects. 

In [20]:
import preprocessing_for_torch as prep

In [115]:
class DataGenerator():
    
    def __init__(self, vocab, ne_dict):
        self.vocab = vocab
        self.ne_dict = ne_dict
        self.vocab_size = len(vocab)
        

    def create_train_data_loader(self, batch_sentences : list, batch_sentences_labels : list, word_vect_dim = 50):
        """Function takes in a list of lists (each sublist a sentence of n-dimension
        numpy arrays), the associated list of lists of NE labels and a vocabulary (dict)"""
        vocab = self.vocab
        
        #compute length of longest sentence in batch
        batch_max_len = max([len(sentence) for sentence in batch_sentences_labels])
        self.batch_max_len = batch_max_len
        
        #prepare a numpy array with the data, initializing the data with 'PAD' 
        #and all labels with -1; initializing labels to -1 differentiates tokens 
        #with tags from 'PAD' tokens
        #note the dimensional change here as we are effectively about to 
        # concatenate the sentences along the 2nd dimension
        batch_data = vocab['PAD']*np.ones((len(batch_sentences), batch_max_len))
        batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))
        
        #copy the data to the numpy array
        for j in range(len(batch_sentences)):
            cur_len = len(batch_sentences[j])
            batch_data[j][:cur_len] = batch_sentences[j]
            batch_labels[j][:cur_len] = batch_sentences_labels[j]

        #since all data are indices, we convert them to torch LongTensors
#         batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)
        
        # Turn the input pandas dataframe into tensors
        batch_data = Variable(torch.from_numpy(batch_data).float().squeeze())
        batch_labels = Variable(torch.from_numpy(batch_labels).long())

        # Build the dataset
        train_sample_ds = torch.utils.data.TensorDataset(batch_data, batch_labels)
        # Build the dataloader
        train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

        #convert Tensors to Variables
#         batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
#         yield batch_data, batch_labels
        return train_sample_dl
    

In [116]:
data_gen = DataGenerator(vocabulary_dict, ne_dict)

And now, a quick visual inspection will show us that our sentences and associated labels have been transformed into equal length tensors, where 0 refers to padding and -1 is the corresponding output tag for the padding. This labelling is important since the model needs to know to distinguish non-NE words (tag : 0) from the artificial padding (tag : -1). 

In [117]:
next(data_gen.prep_batch(train_int_sentences[0:3], train_int_label_sentences[0:3],))

TypeError: 'DataLoader' object is not an iterator

In [118]:
data_loader = (data_gen.prep_batch(train_int_sentences[0:3], train_int_label_sentences[0:3],))

In [127]:
for batch in data_loader:
#     print(batch[0])
    print(batch[1])

tensor([[ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
          2,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  0, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])


In [28]:
%time
data_batch_generator = data_gen.prep_batch(train_int_sentences[:10], train_int_label_sentences[:10],)
train_batch_1_sent, train_batch_1_labels = next(data_batch_generator)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs


In [29]:
train_batch_1_labels

tensor([[ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
          2,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  0, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  5,  6,  0,  0,
          0,  2,  0,  0,  0,  1,  0, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
          2,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1],
        [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,
          0,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  5,  6,  6,  6

## Adding glove embedding vectors

To do:
* create input layer, matching incoming words to their respective glove_dict arrays
* 

In [146]:
def load_glove_vects(file = 'glove/glove.6B.50d.txt', vdim=None):
    """Function that loads the Global representation Vectors
    and returns them as a dictionary. 
    -----------------
    Returns:
    glove_dict - (dict) key - word (str), value - n-dimensional np array """
    glove_dict = {}
#     total_vocab = vocab
    if type(vdim)==int:
        file = f'glove/glove.6B.{vdim}d.txt'
    avg_vect = np.zeros((vdim,))
    with open(file, 'rb') as f:
        for line in f:
            parts = line.split()
            word = parts[0].decode('utf-8')
            vector = np.array(parts[1:], dtype=np.float32)
            glove_dict[word] = vector
            avg_vect += vector
        # creating the vector for new, UNKnown words in the vocabulary
        # NOTE, this is NOT the same as the word "unk", which is 
        # present in glove's vocabulary
        glove_dict['UNK'] = avg_vect/len(glove_dict)
        glove_dict['PAD'] = np.zeros((vdim,))
    return glove_dict
    

In [32]:
%time
glove_dict = prep.load_glove_vects(vdim=50);

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


In [33]:
glove_dict['protesters']

array([-0.3021  , -0.59405 ,  0.68421 , -0.64191 , -0.32535 , -0.86085 ,
        0.012886,  0.14725 , -1.2244  , -0.14167 , -0.51624 , -2.0182  ,
        0.19923 , -0.12294 ,  0.11817 , -0.28932 ,  0.11917 , -0.14441 ,
       -0.40252 , -1.2541  ,  0.50118 ,  0.61998 , -0.19566 , -0.13393 ,
       -0.17067 , -1.3423  ,  1.0381  ,  0.33942 , -0.22235 , -0.60382 ,
        2.3121  ,  0.47781 , -1.1405  , -0.62335 , -1.2281  ,  0.17603 ,
        0.81585 , -1.3752  , -0.65037 ,  1.0504  , -0.36525 ,  0.38238 ,
        0.15291 , -0.11214 ,  1.6126  , -0.47582 , -1.3869  , -0.93827 ,
       -0.097893, -1.8234  ], dtype=float32)

## Defining our neural network

In [156]:
glove_dict['america']

array([-0.13124  ,  0.46555  , -0.10921  ,  0.18759  ,  0.073319 ,
       -0.40072  , -1.1418   , -0.52592  ,  0.20455  ,  0.22532  ,
        0.19891  ,  0.21863  , -0.14053  ,  0.026534 ,  0.35482  ,
       -0.27559  , -0.14433  ,  0.14208  , -0.23811  , -0.0045941,
       -0.14462  , -0.10607  , -0.23974  ,  0.44399  , -0.033788 ,
       -1.774    , -0.97388  , -0.33887  ,  0.29913  , -0.21471  ,
        2.9346   ,  0.47296  , -0.069746 , -0.42937  , -1.0228   ,
       -1.1021   , -1.149    , -0.39353  , -0.46068  , -0.63748  ,
       -0.38899  , -0.50266  ,  0.9211   , -0.40483  , -0.19845  ,
        0.9402   , -0.59246  , -0.33818  , -0.54872  ,  0.41818  ],
      dtype=float32)

In [190]:
a,b = weights_matrix.shape
print(a, b)

7117 50


In [186]:
len(vocabulary_dict)

7117

In [187]:
import torch.nn as nn
import torch.nn.functional as F

matrix_len = len(vocabulary_dict)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(vocabulary_dict):
    try: 
        weights_matrix[i] = glove_dict[word.lower()]
        words_found += 1
    except KeyError:
        weights_matrix[i] = glove_dict['UNK']

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    weights_matrix = Variable(torch.from_numpy(weights_matrix)).type(torch.LongTensor)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim


class NER_Net(nn.Module):
    def __init__(self, input_dim : int, vocab, embedding_dim : int, lstm_hidden_dim : int, ne_tags : dict, pretrained_weights : dict, is_embed_trainable = False):
        # inherits attributes of the super class `Net` 
        super(NER_Net, self).__init__()
        self.vocab_size = len(vocab)+1
        self.vocab = vocab
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.ne_tags = ne_tags
        self.number_of_tags = len(ne_tags)
        self.pretrained_weights = pretrained_weights
        
        
        #maps each token to an embedding_dim vector
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)

#         self.embedding.load_state_dict(self.pretrained_weights)
#         if is_embed_trainable==False:
#             self.embedding.requires_grad = False
            
        # initialize the word vectors, pretrained_weights is a 
        # numpy array of size (vocab_size, vector_size) and 
        # pretrained_weights[i] retrieves the word vector of
        # i-th word in the vocabulary
#         self.embedding.weight.data.copy_(torch.fromnumpy(self.pretrained_weights))
        
        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(self.embedding_dim, self.lstm_hidden_dim, batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(self.lstm_hidden_dim, self.number_of_tags)
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        try:
            s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim
        except IndexError:
            print ("Shape of input data is:", s.shape)
            print ("Shape expected by this layer is :", self.embedding)
            s = self.embedding(s)

        #run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        #reshape the Variable so that each row contains one token
        s = s.view(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags
    
    def loss_fn(self, outputs, labels):
        #reshape labels to give a flat vector of length batch_size*seq_len
        labels = labels.view(-1)  

        #mask out 'PAD' tokens
        mask = (labels >= 0).float()

        #the number of tokens is the sum of elements in mask
        num_tokens = int(torch.sum(mask).data[0])

        #pick the values corresponding to labels and multiply by mask
        outputs = outputs[range(outputs.shape[0]), labels]*mask

        #cross entropy loss for all non 'PAD' tokens
        return -torch.sum(outputs)/num_tokens
    
    

In [160]:
%time
data_loader = data_gen.prep_batch(train_int_sentences[:1000], train_int_label_sentences[:1000],)
# train_batch_1_sent, train_batch_1_labels = next(data_batch_generator)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


In [188]:
ner_classif = NER_Net(62, vocabulary_dict, 50, 50, ne_dict, glove_dict)

In [189]:

epochs=100

for epoch in range(epochs):
    for batch in data_loader:
        x = batch[0]
        y = batch[1]
        
        x = x.type(torch.LongTensor)
        y = y.type(torch.LongTensor)
        
        #pass through model, perform backpropagation and updates
        output_batch = ner_classif.forward(x)

        loss = ner_classif.loss_fn(output_batch, y)


        optimizer.zero_grad()  # clear previous gradients
        loss.backward()        # compute gradients of all variables wrt loss

        optimizer.step()  

Shape of input data is: torch.Size([50, 62])
Shape expected by this layer is : Embedding(7117, 50)


IndexError: index out of range in self

In [136]:

num_training_steps=100

for _ in range(num_training_steps):
#     batch_sentences, batch_labels = next(train_iterator)

    #pass through model, perform backpropagation and updates
    output_batch = ner_classif.forward(train_batch_1_sent)
    
    loss = ner_classif.loss_fn(output_batch, train_batch_1_labels)
    

    optimizer.zero_grad()  # clear previous gradients
    loss.backward()        # compute gradients of all variables wrt loss

    optimizer.step()    
    


Shape of input data is: torch.Size([1000, 62])
Shape expected by this layer is : Embedding(62, 50, padding_idx=0)


IndexError: index out of range in self

In [312]:
train_vect_sent , train_label_sent = sent_to_vect(train_sents, train_labels, glove_dict, ne_dict)

In [313]:
len(train_label_sent[200]) == len(train_vect_sent[200])

True

In [314]:
glove_dict['PAD']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [357]:
glove_dict['thousands']

array([ 1.1515e+00, -3.9703e-01,  9.7350e-01, -8.3455e-01, -1.4785e-01,
       -4.7469e-01, -9.8629e-01,  4.4072e-01,  1.0985e-01,  7.3914e-03,
       -4.5690e-01, -1.2794e+00,  1.0253e+00, -5.3370e-01,  1.0906e+00,
       -3.6994e-01, -1.7323e-03, -1.2934e-02, -2.0921e-01, -8.0484e-01,
        3.0218e-01,  2.9622e-01,  4.3949e-02, -6.2642e-02, -1.1756e-02,
       -1.2806e+00, -2.3914e-01, -5.0524e-01,  2.8103e-01, -3.1305e-01,
        3.0938e+00,  6.8201e-01, -3.8915e-01, -5.9624e-01, -6.8694e-01,
        7.9195e-01, -1.5878e-01, -7.9453e-01, -2.0664e-01,  4.5275e-01,
       -4.2613e-01,  3.5096e-01,  5.5050e-01,  2.5910e-01,  7.1832e-01,
       -5.3633e-02, -1.0610e+00, -4.6405e-01, -9.2481e-01, -1.6236e+00],
      dtype=float32)

In [358]:
train_batch_1_sent[0][0]

tensor([ 1.1515e+00, -3.9703e-01,  9.7350e-01, -8.3455e-01, -1.4785e-01,
        -4.7469e-01, -9.8629e-01,  4.4072e-01,  1.0985e-01,  7.3914e-03,
        -4.5690e-01, -1.2794e+00,  1.0253e+00, -5.3370e-01,  1.0906e+00,
        -3.6994e-01, -1.7323e-03, -1.2934e-02, -2.0921e-01, -8.0484e-01,
         3.0218e-01,  2.9622e-01,  4.3949e-02, -6.2642e-02, -1.1756e-02,
        -1.2806e+00, -2.3914e-01, -5.0524e-01,  2.8103e-01, -3.1305e-01,
         3.0938e+00,  6.8201e-01, -3.8915e-01, -5.9624e-01, -6.8694e-01,
         7.9195e-01, -1.5878e-01, -7.9453e-01, -2.0664e-01,  4.5275e-01,
        -4.2613e-01,  3.5096e-01,  5.5050e-01,  2.5910e-01,  7.1832e-01,
        -5.3633e-02, -1.0610e+00, -4.6405e-01, -9.2481e-01, -1.6236e+00])

In [359]:
train_batch_1_labels[0]

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0., -1., -1., -1., -1.,
        -1., -1.])