# Entity Detection Printout

In [None]:
# Mount Google Drive so the project files stored there are reachable under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Work from the entity-detection directory: later cells use paths relative to it
# (e.g. '../../data/...'), and presumably the local modules are imported from here too.
%cd '/content/drive/My Drive/BuboQA/entity_detection/nn'
!pwd

/content/drive/My Drive/BuboQA/entity_detection/nn
/content/drive/My Drive/BuboQA/entity_detection/nn


In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import os
import numpy as np
from torchtext import data
from sq_entity_dataset import SQdataset
from entity_detection import EntityDetection
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
# Mini-batch size used when building the training iterator.
batch_size = 32

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TEXT holds the (lower-cased) question tokens, ED the per-token entity tags.
TEXT = data.Field(lower=True)
ED = data.Field()
train, dev, test = SQdataset.splits(TEXT, ED, '../../data/processed_simplequestions_dataset')
# Both vocabularies are built over all three splits, so dev/test tokens also get an index.
TEXT.build_vocab(train, dev, test)
# NOTE(review): the ED vocabulary printed in the next cell contains entries such as
# "['O'," and "'O']" next to 'O'/'I', which looks like some label lists reaching the
# field as un-parsed strings -- verify the dataset loader / processed data files.
ED.build_vocab(train, dev, test)

In [None]:
# List the tag vocabulary (position in the list = tag index).
print(ED.vocab.itos)

['<unk>', '<pad>', 'O', 'I', "'O',", "'O']", "['O',"]


In [None]:
# Inspect the first training example: its field names, then its tokens and tags.
print(train[0].__dict__.keys())
print(train[0].text, train[0].ed)

dict_keys(['text', 'ed'])
['what', 'is', 'the', 'book', 'e', 'about'] ['O', 'O', 'O', 'O', 'I', 'O']


In [None]:
# Training iterator: shuffled, single pass per iteration (repeat=False), and no
# length-based sorting either across or within batches.
train_iter = data.Iterator(train, batch_size=batch_size, device=device, train=True, repeat=False,
                           sort=False, shuffle=True, sort_within_batch=False)

In [None]:
# Draw ONE batch and show both of its fields. Calling next(iter(train_iter)) once per
# print starts a fresh shuffled pass each time, so the question ids and the tags shown
# would belong to two unrelated batches.
sample_batch = next(iter(train_iter))
print(sample_batch.text)    # size is (seq_length, batch_size)
print(sample_batch.ed)      # same layout, one tag index per token

tensor([[   16,     2,     2,    11,    10,     2,    11,    11,     2,     2,
             2,     2,    16,     2,     2,    10,     2,     2,     2,    11,
             2,    14,     2,    16,     2,     2,     2,    16,     2,     2,
             2,     2],
        [   25,    77,     3,    71,   271,   236,     3,    12,    84,    17,
            23,    48,     4,     3,     3,   325,    33,    20,    42,     3,
            42,     9,     3,    25,     3,     8,   351,    25,     3,     3,
             3,    29],
        [   30,   102,     4,     4,    95,   103,     8,     8,   126,  2237,
             3,     9,   414,     4,     8,   201,   120,     5,     3,     4,
             3, 33147,     4,    30,  1055,   135,     5,   614,     8,     8,
             8,     5],
        [   27,     3,    74,    42,   111,   426,   108,   158,     3, 36064,
           841,    95,     5,   167,   334,    32,     4,   812,    18,   129,
            18, 45021,    16,    11, 25519,     7,    15,  

In [None]:
print(next(iter(train_iter)).text.shape)    # (seq_length, batch_size); seq_length varies from batch to batch

torch.Size([14, 32])


In [None]:
print(TEXT.vocab.itos[1])    # index 1 is the padding token; the Iterator pads shorter sentences in a batch with it so all have the same length

<pad>


In [None]:
# Build the embedding matrix for the question vocabulary from the cached GloVe file.
# match_embedding ends up holding the number of vocabulary tokens found in that file.
match_embedding = 0
# NOTE(review): torch.load unpickles the file -- only load caches from a trusted source.
stoi, vectors, dim = torch.load("../../data/sq_glove300d.pt")
TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
for i, token in enumerate(TEXT.vocab.itos):
    wv_index = stoi.get(token, None)
    if wv_index is None:
        # Token has no pretrained vector: fall back to a small random one.
        TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
        continue
    # Token is covered by the pretrained vectors: copy its row across.
    TEXT.vocab.vectors[i] = vectors[wv_index]
    match_embedding += 1

In [None]:
# Show the embedding row stored for index 1 (the <pad> token).
print(TEXT.vocab.vectors[1])

tensor([ 0.0426, -0.2213, -0.2220, -0.2001,  0.0584, -0.1067, -0.1727,  0.1268,
         0.2000,  0.1579,  0.2372,  0.2498,  0.2490,  0.1709,  0.1333,  0.2330,
         0.1084, -0.1157,  0.1470, -0.0827, -0.1413,  0.2493,  0.0308,  0.1164,
         0.2342,  0.0146,  0.0522, -0.2376, -0.1731,  0.2024,  0.1887,  0.1307,
         0.1119,  0.2215, -0.0367,  0.1972, -0.1527, -0.0288, -0.0703, -0.2170,
         0.0727,  0.0417,  0.0939, -0.0116, -0.1355,  0.2096,  0.1424, -0.1125,
         0.0218, -0.2323,  0.0130,  0.0049, -0.0021,  0.0152, -0.1243,  0.1537,
        -0.1372,  0.1097,  0.0812,  0.1921, -0.2278,  0.0635,  0.2027, -0.0200,
        -0.0269,  0.2299, -0.1547, -0.1350, -0.2070, -0.0787, -0.0911,  0.1904,
        -0.2022, -0.0587,  0.2461, -0.0384, -0.0026, -0.0959, -0.0634,  0.1953,
         0.2323, -0.0278, -0.1485,  0.2023, -0.1282,  0.0521,  0.1043,  0.1681,
         0.0394, -0.1613, -0.0189, -0.2188,  0.2456,  0.1920, -0.0028,  0.1452,
        -0.1370, -0.2309,  0.1053, -0.09

In [None]:
class TransformerModel(nn.Module):
    """Transformer-encoder sequence tagger.

    Token indices are embedded, combined with positional encodings, passed through a
    stack of Transformer encoder layers and projected to one score per tag for every
    position.

    Args:
        words_num: Size of the input vocabulary (rows of the embedding table).
        target_size: Number of tags scored for each token.
        d_model: Embedding / hidden width. torch's multi-head attention requires it to
            be divisible by ``nhead``; the defaults (300 and 8) do not satisfy that, so
            callers have to override one of them (e.g. ``nhead=6``).
        nhead: Number of attention heads in every encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Width of the feed-forward sublayer of each encoder layer.
        transformer_dropout: Dropout probability shared by the positional encoding and
            the encoder layers.
    """

    def __init__(self, words_num, target_size, d_model=300, nhead=8, num_encoder_layers=6,
                 dim_feedforward=2048, transformer_dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(words_num, d_model)
        self.pos_encoder = PositionalEncoding(d_model, transformer_dropout)
        encoderlayer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, transformer_dropout)
        encoder_norm = nn.LayerNorm(d_model)
        self.transformer_encoder = TransformerEncoder(encoderlayer, num_encoder_layers, encoder_norm)
        self.decoder = nn.Linear(d_model, target_size)
        self.init_weights()

    def _generate_square_subsequent_mask(self, seq_len):
        """Return a (seq_len, seq_len) float mask: 0.0 on and below the diagonal, -inf above it.

        Not used by ``forward`` (the tagger attends in both directions); kept for callers
        that want a causal mask.
        """
        # triu(..., diagonal=1) keeps the -inf entries strictly above the diagonal and
        # zero-fills the rest, which is exactly the additive causal mask.
        return torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1)

    def init_weights(self):
        """Xavier-initialise every parameter with more than one dimension (weight matrices)."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, src_key_padding_mask=None):
        """Score every token of a batch of questions.

        Args:
            src: LongTensor of token indices with shape (seq_len, batch_size).
            src_key_padding_mask: Optional bool tensor of shape (batch_size, seq_len)
                that is True at padded positions, e.g. ``(src == pad_index).t()``.
                Those positions are then excluded as attention keys. ``None`` (the
                default) lets every position attend to every other one, padding included.

        Returns:
            Log-probabilities of shape (seq_len * batch_size, target_size); rows follow
            the flattened (seq_len, batch_size) order of ``src``.
        """
        # Scale embeddings by sqrt(d_model) before adding the positional signal.
        embedded = self.embed(src) * math.sqrt(self.d_model)
        embedded = self.pos_encoder(embedded)
        encoded = self.transformer_encoder(embedded, src_key_padding_mask=src_key_padding_mask)
        logits = self.decoder(encoded)
        # Flatten (seq_len, batch_size, target_size) so each row is one token's tag distribution.
        return F.log_softmax(logits.view(-1, logits.size(2)), dim=1)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first input, then apply dropout.

    Args:
        d_model: Feature width of the inputs (odd widths are supported).
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        # Even feature columns carry the sine, odd columns the cosine of the same angle.
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so drop the
        # surplus angle instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch dimension.
        # A buffer follows the module across devices without being a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (seq_len, batch_size, d_model)."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Size the model from the two vocabularies.
words_num = len(TEXT.vocab)
label = len(ED.vocab)
# nhead=6 because the model's default width (d_model=300) is not divisible by the
# default of 8 heads.
model = TransformerModel(words_num=words_num, target_size=label, nhead=6)
# Start the embedding table from the pretrained vectors; no_grad() performs the in-place
# copy without going through the legacy `.data` attribute.
with torch.no_grad():
    model.embed.weight.copy_(TEXT.vocab.vectors)
# Keep the model on the same device the iterator places its batches on; otherwise the
# forward pass fails with a device mismatch whenever a GPU is available.
model = model.to(device)

tensor([[ 0.0486,  0.2480,  0.0440,  ..., -0.0969,  0.1431,  0.1702],
        [ 0.0426, -0.2213, -0.2220,  ...,  0.2227, -0.1773, -0.0225],
        [-0.0385,  0.5425, -0.2184,  ...,  0.1180,  0.2459,  0.2287],
        ...,
        [-0.1808,  0.0146,  0.1793,  ...,  0.2361,  0.2088, -0.1002],
        [ 0.1748,  0.1168, -0.0339,  ..., -0.0711, -0.0333,  0.0273],
        [ 0.0680, -0.2074, -0.2466,  ..., -0.1844, -0.1593,  0.0322]])

In [None]:
# Push a single training batch through the (still untrained) model as a smoke test.
batch = next(iter(train_iter))
scores = model(batch.text)

In [None]:
# scores is flattened to (seq_length * batch_size, num_tags), while the gold tags keep
# their (seq_length, batch_size) layout.
print(scores.shape)
print(batch.ed.shape)

torch.Size([352, 7])
torch.Size([11, 32])


In [None]:
# torch.max over dim 1 yields (values, indices); element [1] is the predicted tag index
# of every row, reshaped here to the (seq_length, batch_size) layout of the gold tags.
predicted_tags = torch.max(scores, 1)[1].view(batch.ed.size())
# Per question (column): how many positions received the gold tag.
batch_comp = (predicted_tags.data == batch.ed.data).sum(dim=0)
# A question only counts as correct when all of its positions match.
n_correct = (batch_comp == batch.ed.size(0)).sum().item()
print(batch_comp)

tensor([6, 4, 4, 5, 4, 5, 4, 8, 4, 4, 2, 4, 7, 3, 3, 2, 5, 3, 1, 5, 3, 5, 4, 5,
        4, 3, 4, 4, 2, 2, 3, 4])


In [None]:
## in top_retrieval.py
# Vocabularies as numpy arrays, so whole index arrays can be decoded by fancy indexing.
index2tag = np.array(ED.vocab.itos)
index2word = np.array(TEXT.vocab.itos)

# Predicted tag index of every position, laid out like the gold tags and then
# transposed so that each row is one question.
predicted_idx = torch.max(scores, 1)[1].view(batch.ed.size())
index_tag = predicted_idx.cpu().data.numpy().T
tag_array = index2tag[index_tag]
print(tag_array)
print(index_tag.shape)
print(tag_array.shape)

[['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' "['O'," '<pad>' 'O']
 ["'O'," 'O' 'O' '<pad>' '<pad>' "'O'," 'O' '<pad>' 'O' 'O' 'O']
 ['O' '<pad>' '<pad>' 'O' '<pad>' '<pad>' 'O' '<pad>' '<pad>' 'O' '<pad>']
 ["'O'," 'O' 'O' 'O' 'O' 'O' 'O' 'O' "'O'," 'O' "'O',"]
 ['O' 'O' '<unk>' 'O' "'O'," 'O' 'O' 'O' 'O' 'O' 'O']
 ['O' 'O' "'O'," "'O'," 'O' 'O' 'O' 'O' 'O' 'O' 'O']
 ['O' 'O' '<pad>' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
 ['O' 'O' 'O' 'O' 'O' 'O' 'O' '<pad>' 'O' 'O' 'O']
 ['<pad>' 'O' 'O' 'O' 'O' "'O'," "'O'," "'O'," '<pad>' "'O'," "'O',"]
 ["'O'," 'O' '<unk>' 'O' 'O' '<unk>' 'O' "'O'," 'O' "'O'," '<pad>']
 ['O' 'O' 'O' 'O' '<pad>' 'O' 'O' 'O' 'O' 'O' 'O']
 ["'O'," 'O' 'O' '<pad>' "'O'," '<pad>' "'O'," '<pad>' '<unk>' '<pad>'
  'O']
 ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
 ['O' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' "'O'," '<pad>' "'O',"
  "'O'," '<pad>']
 ['O' 'O' 'O' 'O' '<pad>' 'O' '<pad>' 'O' 'O' 'O' 'O']
 ['<pad>' '<pad>' "'O'," '<pad>' '<pad>' 'O' "'O'," "'O'," '<pad>' 'O'
  'O']
 ['O' '

In [None]:
# Token ids of the batch with one question per row, and the same array decoded to words.
index_question = batch.text.cpu().data.numpy().T
question_array = index2word[index_question]
# Show the first five questions in both forms.
print(index_question[:5])
print(question_array[:5])

[[    2    23     3     4 23047  6789 24900  2426     7     1     1]
 [    2    19     3   384   180  1689     6     1     1     1     1]
 [    2     3     4   286   310     5 15942 13537  3531 14575     1]
 [    2    34     3    22     4    30  4084 47535     6     1     1]
 [   11     3     8  8691  4371   505    75    30     1     1     1]]
[['what' 'country' 'is' 'the' 'abyad' 'wa' 'aswad' 'distributed' 'in'
  '<pad>' '<pad>']
 ['what' 'genre' 'is' 'get' 'it' 'together' '?' '<pad>' '<pad>' '<pad>'
  '<pad>']
 ['what' 'is' 'the' 'legal' 'status' 'of' 'grafco' 'applicator' '45' 'cm'
  '<pad>']
 ['what' 'track' 'is' 'by' 'the' 'artist' 'arturo' 'ofarrill' '?' '<pad>'
  '<pad>']
 ['who' 'is' 'a' 'grammy' 'winning' 'punk' 'rock' 'artist' '<pad>'
  '<pad>' '<pad>']]


In [None]:
# Look up the word behind index 2, which opens most of the questions printed above.
print(TEXT.vocab.itos[2])

what
