# Relation Prediction Printout

In [1]:
# Switch the working directory to the project's relation_prediction/nn folder so
# that the local modules imported below and the relative '../../data/...' paths
# resolve. NOTE(review): the absolute path looks like a Colab Google Drive mount
# and is machine-specific - confirm before running elsewhere.
%cd '/content/drive/My Drive/BuboQA/relation_prediction/nn'

/content/drive/My Drive/BuboQA/relation_prediction/nn


In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import os
import numpy as np
from torchtext import data
from sq_relation_dataset import SQdataset
from relation_prediction import RelationPrediction
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [3]:
# Number of examples per mini-batch; passed to the training iterator below.
batch_size = 32

In [4]:
# Special-token settings: each question gets `cls_tag` prepended as its first
# token (via `init_token` on the TEXT field).
with_tag = 'cls'
cls_tag = '<cls>'

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# TEXT holds the lower-cased question tokens; RELATION is a single,
# non-sequential label per example.
TEXT = data.Field(init_token=cls_tag, lower=True)
RELATION = data.Field(sequential=False)

# Load the three dataset splits from the processed SimpleQuestions directory.
train, dev, test = SQdataset.splits(TEXT, RELATION, '../../data/processed_simplequestions_dataset')

# The text vocabulary is built from all three splits; the relation vocabulary
# only from train and dev.
TEXT.build_vocab(train, dev, test)
RELATION.build_vocab(train, dev)

In [5]:
# Vocabulary sizes, one per line: text tokens first, then relation labels.
print(len(TEXT.vocab), len(RELATION.vocab), sep='\n')

61334
1698


In [6]:
# Debug aid (disabled): uncomment to print the relation vocabulary's
# index-to-label list.
# print(RELATION.vocab.itos)

In [7]:
# Look at the first training example: its attribute names, then its token list
# and relation label.
first_example = train[0]
print(vars(first_example).keys())
print(first_example.text, first_example.relation)

dict_keys(['relation', 'text'])
['what', 'is', 'the', 'book', 'e', 'about'] fb:book.written_work.subjects


In [8]:
# Training iterator: shuffled batches, a single pass per epoch (repeat=False),
# and no sorting either across or within batches.
train_iter = data.Iterator(
    train,
    batch_size=batch_size,
    device=device,
    train=True,
    repeat=False,
    shuffle=True,
    sort=False,
    sort_within_batch=False,
)

In [9]:
# batch = train_iter.init_epoch()
batch = next(iter(train_iter))
print(batch.text)    # size is (seq_length, batch_size)
print(batch.relation)

tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2],
        [   12,     3,     3,     3,    15,     3,     3,    95,     3,    11,
            11,     3,    11,    15,    15,     3,    11,     3,     3,     3,
             3,     3,    15,  9688,     3,     3,     3,     3,     3,     3,
            12,    11],
        [    4,     4,     4,     4,     8,    14,    30,    10,    88,    87,
            14,    10,   127,    10,    10,   105,    20,    13,    20,    32,
            63,     4,    10,     4,    34,    21,    47,    51,    59,    16,
             4,    31],
        [    5, 10765,    40,     5,  1565,    22,     6,  1688,     4,    96,
            10,    57,     4,  7056, 35658,    10,     4,    26,     4,    90,
            20,  2866, 52588,     9,    10,     6,     4,  

In [10]:
# Shapes of the sampled batch. The text tensor is (seq_length, batch_size) and
# seq_length differs from batch to batch; the relation tensor is (batch_size,).
print(batch.text.size(), batch.relation.size(), sep='\n')

torch.Size([16, 32])
torch.Size([32])


In [11]:
# Show which token sits at vocabulary index 1. The iterator pads shorter
# sentences so that every sentence in a batch has the same length.
print(TEXT.vocab.itos[1])

<pad>


In [12]:
# Reverse lookup: the vocabulary index assigned to the '<pad>' token.
print(TEXT.vocab.stoi['<pad>'])

1


In [13]:
# Show which token sits at vocabulary index 2 (the value that fills the first
# row of batch.text printed earlier).
print(TEXT.vocab.itos[2])

<cls>


In [14]:
# Build TEXT.vocab.vectors, one row per vocabulary token.
# The saved file unpacks into a token -> row mapping, the vector table and the
# vector dimension (presumably 300-d GloVe, going by the file name).
match_embedding = 0  # how many vocabulary tokens were found in the pretrained table
stoi, vectors, dim = torch.load("../../data/sq_glove300d.pt")
TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)  # uninitialised; every row is filled below

for i, token in enumerate(TEXT.vocab.itos):
    wv_index = stoi.get(token)
    if wv_index is None:
        # Token is missing from the pretrained table: draw a random vector
        # uniformly from [-0.25, 0.25).
        TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
        continue
    TEXT.vocab.vectors[i] = vectors[wv_index]
    match_embedding += 1

# The classification tag gets a fixed embedding: every component equals 0.1.
if with_tag == 'cls':
    cls_idx = TEXT.vocab.stoi[cls_tag]
    TEXT.vocab.vectors[cls_idx] = torch.ones(dim).float() / 10

In [15]:
# Sanity-check the '<pad>' embedding. `stoi` here is the pretrained mapping
# loaded from disk, so `pad_idx` reports whether '<pad>' has a pretrained
# vector (None means it does not). Row 1 of the vocabulary table is '<pad>'.
pad_idx = stoi.get('<pad>')
pad_embed = TEXT.vocab.vectors[1]
pad_mean = pad_embed.mean()
pad_var = pad_embed.var()
# Compare the statistics with what the uniform(-0.25, 0.25) initialisation
# should give.
print(pad_idx, pad_embed, pad_mean, pad_var)

None tensor([-0.0818, -0.0355,  0.1636,  0.2031, -0.2350,  0.0071,  0.2113,  0.0619,
        -0.1128,  0.0702, -0.2104,  0.0555, -0.2467,  0.0658, -0.1642, -0.1523,
         0.1738,  0.1379, -0.0552,  0.0916,  0.1958, -0.1674,  0.0053, -0.0755,
        -0.0756,  0.0592,  0.0143,  0.1777,  0.2478,  0.0799, -0.1392, -0.2003,
         0.2317, -0.1138,  0.2494,  0.1531, -0.1733,  0.2257, -0.0599,  0.1640,
         0.0041, -0.1554, -0.2044,  0.1649,  0.0637, -0.2348,  0.0362, -0.0677,
        -0.1688,  0.0644,  0.1545,  0.0130,  0.0924, -0.1434,  0.0752,  0.2487,
         0.0255,  0.1433, -0.0459, -0.0839,  0.0057, -0.1296,  0.2484, -0.1878,
        -0.0149, -0.0290, -0.0464, -0.1129, -0.2374,  0.0272, -0.0330,  0.1610,
         0.1253, -0.1074,  0.0941,  0.0243, -0.1418,  0.0971,  0.1724, -0.0681,
         0.1032, -0.2295, -0.0591,  0.0328, -0.0941, -0.2001, -0.0844,  0.1446,
        -0.2465,  0.1063, -0.0096, -0.1720,  0.0201, -0.1410, -0.1146, -0.2183,
        -0.0093,  0.1951,  0.2237, 

In [16]:
# Sanity-check the '<cls>' embedding. `stoi` is the pretrained mapping, so this
# lookup reports whether '<cls>' has a pretrained vector (None means it does not).
# NOTE(review): this rebinds `cls_idx`, which the embedding-loading cell set to
# the vocabulary index of `cls_tag` - confirm nothing later relies on that value.
cls_idx = stoi.get('<cls>')
cls_embed = TEXT.vocab.vectors[2]  # row 2 of the vocabulary table is '<cls>'
cls_mean = cls_embed.mean()
cls_var = cls_embed.var()
print(cls_idx, cls_embed, cls_mean, cls_var)

None tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1

In [27]:
class TransformerModel(nn.Module):
    """Transformer-encoder classifier that scores relations from the first token.

    The input tokens are embedded, given sinusoidal positional encodings and
    passed through a stack of ``num_encoder_layers - 1`` encoder layers followed
    by one further encoder layer. The representations of position 0 (the token
    prepended to every question) from both stages are concatenated and fed to a
    small feed-forward head that outputs log-probabilities over the targets.

    Args:
        words_num: Size of the input vocabulary (rows of the embedding table).
        target_size: Number of output classes.
        d_model: Embedding / model width. Must be divisible by ``nhead``.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Total encoder depth; the last layer is kept as a
            separate module so that its input can be reused in ``forward``.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        transformer_dropout: Dropout probability used in the positional
            encoding, the encoder layers and the classification head.
    """

    def __init__(self, words_num, target_size, d_model=300, nhead=6, num_encoder_layers=6,
                 dim_feedforward=2048, transformer_dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(words_num, d_model)
        self.pos_encoder = PositionalEncoding(d_model, transformer_dropout)

        encoderlayer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, transformer_dropout)
        # NOTE(review): the same LayerNorm instance is handed to both encoder
        # stacks, so its parameters are shared between them - confirm intended.
        encoder_norm = nn.LayerNorm(d_model)
        self.transformer_encoder = TransformerEncoder(encoderlayer, num_encoder_layers - 1, encoder_norm)
        self.last_encoder_layer = TransformerEncoder(encoderlayer, 1, encoder_norm)

        self.dropout = nn.Dropout(p=transformer_dropout)
        self.relu = nn.ReLU()
        self.hidden2tag = nn.Sequential(
            # forward() concatenates two d_model-wide representations, so the
            # head must accept 2 * d_model input features.
            nn.Linear(2 * self.d_model, self.d_model),
            nn.BatchNorm1d(self.d_model),
            self.relu,
            self.dropout,
            nn.Linear(self.d_model, target_size)
        )
        self.init_weights()

    def _generate_square_subsequent_mask(self, seq_len):
        """Return a (seq_len, seq_len) float mask for causal attention.

        Entries on and below the diagonal are 0.0; entries above it are -inf.
        The mask is not used by ``forward``.
        """
        return torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1)

    def init_weights(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src):
        """Compute log-probabilities over the targets for a batch of questions.

        Args:
            src: Either a batch object exposing a ``text`` attribute or the token
                index tensor itself, of shape (seq_length, batch_size).

        Returns:
            Tensor of shape (batch_size, target_size) with log-softmax scores.
        """
        # Accept both a torchtext-style batch (`.text`) and a raw index tensor.
        tokens = getattr(src, "text", src)

        # No sqrt(d_model) scaling is applied to the embeddings.
        embedded = self.pos_encoder(self.embed(tokens))

        # NOTE(review): no attention or key-padding mask is passed, so padded
        # positions take part in self-attention.
        output1 = self.transformer_encoder(embedded)
        output2 = self.last_encoder_layer(output1)

        # Keep only position 0 of each stage and join them on the feature
        # axis: (batch_size, 2 * d_model).
        cls_features = torch.cat([output1[0], output2[0]], dim=-1)
        logits = self.hidden2tag(cls_features)
        return F.log_softmax(logits, dim=1)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    Args:
        d_model: Feature width of the inputs (odd widths are supported).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length the precomputed table covers.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to the number of cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Shape (max_len, 1, d_model) so it broadcasts over the batch axis of
        # (seq_length, batch_size, d_model) inputs.
        pe = pe.unsqueeze(1)
        # Registered as a buffer: follows the module across devices, not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_length])`` for x of shape (seq_length, batch, d_model)."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [29]:
# Size the model from the two vocabularies and build it with default
# hyper-parameters.
words_num, rel_label = len(TEXT.vocab), len(RELATION.vocab)
model = TransformerModel(words_num, rel_label)

# Overwrite the freshly initialised embedding table with the vectors prepared
# in TEXT.vocab.vectors; the copied tensor is this cell's displayed output.
# NOTE(review): the model is not moved to `device` here - confirm that happens
# before batches from the iterator are fed to it.
model.embed.weight.data.copy_(TEXT.vocab.vectors)

tensor([[-0.0351, -0.0331, -0.1351,  ...,  0.1418,  0.1129,  0.0408],
        [-0.0818, -0.0355,  0.1636,  ...,  0.1114, -0.0619, -0.0171],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        ...,
        [-0.1571, -0.1284,  0.1103,  ..., -0.0433,  0.0433,  0.2278],
        [ 0.1509, -0.2341,  0.1313,  ...,  0.2178,  0.0563,  0.2417],
        [-0.1843, -0.0851,  0.1242,  ..., -0.1547, -0.0273,  0.2128]])

In [30]:
# Print the module tree of the model to inspect its layers and their sizes.
print(model)

TransformerModel(
  (embed): Embedding(61334, 300)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
        )
     

In [25]:
# Inspect the encoder layer at index 4 of the first stack (with the default
# num_encoder_layers=6 that stack has five layers, so this is its last one).
# NOTE(review): this cell's execution count (25) is lower than that of the cell
# defining the model class (27), so the recorded output may come from an
# earlier model instance - re-run after a kernel restart to confirm.
print(model.transformer_encoder.layers[4])

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (linear1): Linear(in_features=300, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=300, bias=True)
  (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)
