# Model Construction


**Agenda:**

1. Initialize model parameters / 初始化模型参数

2. Forward pass / 前向传播

    2.1. Lexical features / 词汇特征
    
    2.2. Sentence features / 句子特征
    
    2.3. Fully connected softmax classifier / 全连接分类


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init


class CNN(nn.Module):
    """CNN classifier that combines lexical and sentence-level features.

    The forward pass builds two feature groups and concatenates them before a
    bias-free linear classifier:

    * lexical features: the word embeddings of six word ids per sample,
      flattened to ``6 * word_dim`` values;
    * sentence features: per-token word + two position embeddings, one
      convolution over the sequence, masked max-pooling over the sequence,
      then a linear layer followed by tanh.

    Args:
        word_vec: pre-trained embedding matrix handed to
            ``nn.Embedding.from_pretrained`` (fine-tuned, ``freeze=False``).
            Its second dimension has to equal ``config.word_dim`` because the
            convolution kernel width is ``word_dim + 2 * pos_dim``.
        class_num: number of output classes (width of the returned logits).
        config: object exposing the attributes ``max_len``, ``word_dim``,
            ``pos_dim``, ``pos_dis``, ``dropout``, ``filter_num``, ``window``
            and ``hidden_size``.

    Raises:
        ValueError: if ``config.window`` is smaller than 1.
    """

    def __init__(self, word_vec, class_num, config):
        super().__init__()
        self.word_vec = word_vec
        self.class_num = class_num

        # hyper parameters and others
        self.max_len = config.max_len
        self.word_dim = config.word_dim
        self.pos_dim = config.pos_dim
        self.pos_dis = config.pos_dis

        self.dropout_value = config.dropout
        self.filter_num = config.filter_num
        self.window = config.window
        self.hidden_size = config.hidden_size

        if self.window < 1:
            raise ValueError(
                'config.window must be >= 1, got {}'.format(self.window)
            )

        # width of one token vector: word embedding + two position embeddings
        self.dim = self.word_dim + 2 * self.pos_dim

        # "Same" padding along the sequence axis needs window - 1 zero rows in
        # total so that the convolution output keeps max_len rows (conv_layer
        # relies on that when it reshapes to B*C*max_len). A hard-coded
        # padding of 1 is only correct for window == 3, so derive it instead.
        # The symmetric share goes into the conv layer itself; an even window
        # leaves one row over, which conv_layer appends after the sequence.
        total_pad = self.window - 1
        self._seq_pad = total_pad // 2
        self._extra_pad = total_pad - 2 * self._seq_pad

        # load the pre-train model for word features
        self.word_embedding = nn.Embedding.from_pretrained(
            embeddings=self.word_vec,
            freeze=False,
        )
        # embedding for features for position distance relative to entity 1
        self.pos1_embedding = nn.Embedding(
            num_embeddings=2 * self.pos_dis + 3,
            embedding_dim=self.pos_dim
        )
        # embedding for features for position distance relative to entity 2
        self.pos2_embedding = nn.Embedding(
            num_embeddings=2 * self.pos_dis + 3,
            embedding_dim=self.pos_dim
        )

        # Each filter spans `window` consecutive tokens and the full feature
        # width, so the output width collapses to 1.
        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=self.filter_num,
            kernel_size=(self.window, self.dim),
            stride=(1, 1),
            bias=False,
            padding=(self._seq_pad, 0),  # same padding (see above)
            padding_mode='zeros'
        )

        # one pooling window covering the whole (padded) sequence
        self.maxpool = nn.MaxPool2d((self.max_len, 1))
        self.tanh = nn.Tanh()

        self.dropout = nn.Dropout(self.dropout_value)

        self.linear = nn.Linear(
            in_features=self.filter_num,
            out_features=self.hidden_size,
            bias=False
        )
        # classifier input = sentence features + six flattened word embeddings
        self.dense = nn.Linear(
            in_features=self.hidden_size + 6 * self.word_dim,
            out_features=self.class_num,
            bias=False
        )

        # initialize weight (every layer is bias-free, so weights only)
        init.xavier_normal_(self.pos1_embedding.weight)
        init.xavier_normal_(self.pos2_embedding.weight)
        init.xavier_normal_(self.conv.weight)
        init.xavier_normal_(self.linear.weight)
        init.xavier_normal_(self.dense.weight)

    def encoder_layer(self, token, pos1, pos2):
        """Embed tokens and both position channels and concatenate them.

        Args:
            token: B*L word ids.
            pos1: B*L position ids looked up in ``pos1_embedding``.
            pos2: B*L position ids looked up in ``pos2_embedding``.

        Returns:
            B*L*D tensor with D = word_dim + 2 * pos_dim.
        """
        word_emb = self.word_embedding(token)  # B*L*word_dim
        pos1_emb = self.pos1_embedding(pos1)  # B*L*pos_dim
        pos2_emb = self.pos2_embedding(pos2)  # B*L*pos_dim
        emb = torch.cat(tensors=[word_emb, pos1_emb, pos2_emb], dim=-1)
        return emb  # B*L*D, D=word_dim+2*pos_dim

    def conv_layer(self, emb, mask):
        """Convolve the embedded sentence and mask out padded positions.

        Args:
            emb: B*L*D embedded sentence with L == max_len.
            mask: B*L tensor; positions equal to 0 are treated as 'PAD'.

        Returns:
            B*C*L*1 tensor of convolution outputs in which masked positions
            hold ``-inf`` so that the following max-pooling ignores them.
            A sample whose mask is all zeros therefore pools to ``-inf``.
        """
        emb = emb.unsqueeze(dim=1)  # B*1*L*D
        if self._extra_pad:
            # even window: add the left-over zero row after the sequence so
            # the output length is still L
            emb = F.pad(emb, (0, 0, 0, self._extra_pad))
        conv = self.conv(emb)  # B*C*L*1

        # mask, remove the effect of 'PAD'
        conv = conv.view(-1, self.filter_num, self.max_len)  # B*C*L
        mask = mask.unsqueeze(dim=1)  # B*1*L
        mask = mask.expand(-1, self.filter_num, -1)  # B*C*L
        conv = conv.masked_fill_(mask.eq(0), float('-inf'))  # B*C*L
        conv = conv.unsqueeze(dim=-1)  # B*C*L*1
        return conv

    def single_maxpool_layer(self, conv):
        """Max-pool over the whole sequence: B*C*L*1 -> B*C."""
        pool = self.maxpool(conv)  # B*C*1*1
        pool = pool.view(-1, self.filter_num)  # B*C
        return pool

    def forward(self, data):
        """Compute class logits for a batch.

        Args:
            data: indexable pair. ``data[0]`` is an integer tensor whose
                dimension 1 holds, in order, word ids, position ids for
                ``pos1_embedding``, position ids for ``pos2_embedding`` and
                the mask, each reshaped to B*max_len. ``data[1]`` holds six
                word ids per sample.

        Returns:
            B*class_num tensor of unnormalised logits.
        """
        token = data[0][:, 0, :].view(-1, self.max_len)
        pos1 = data[0][:, 1, :].view(-1, self.max_len)
        pos2 = data[0][:, 2, :].view(-1, self.max_len)
        mask = data[0][:, 3, :].view(-1, self.max_len)
        lexical = data[1].view(-1, 6)

        # lexical features: six word embeddings flattened per sample
        lexical_emb = self.word_embedding(lexical)
        lexical_emb = lexical_emb.view(-1, self.word_dim * 6)

        # sentence features: embed -> dropout -> conv -> masked max-pool
        emb = self.encoder_layer(token, pos1, pos2)
        emb = self.dropout(emb)
        conv = self.conv_layer(emb, mask)
        pool = self.single_maxpool_layer(conv)

        sentence_feature = self.linear(pool)
        sentence_feature = self.tanh(sentence_feature)
        sentence_feature = self.dropout(sentence_feature)

        features = torch.cat((lexical_emb, sentence_feature), 1)
        logits = self.dense(features)
        return logits

In [2]:
# Named tuple type that bundles the model hyper-parameters; the field order
# below is the positional order expected when building an instance.
from collections import namedtuple

conf = namedtuple(
    'conf',
    [
        'max_len',
        'word_dim',
        'pos_dim',
        'pos_dis',
        'dropout',
        'filter_num',
        'window',
        'hidden_size',
    ],
)

In [57]:
# Hyper-parameters used by the step-by-step walkthrough.
word_dim = 300     # word embedding dimension
max_len = 64       # maximum (padded) length of each input sentence
pos_dim = 300      # position embedding dimension
pos_dis = 32       # position distance used for the relative position encoding
dropout = 0.5      # dropout probability
filter_num = 16    # number of convolution feature maps
window = 3         # convolution kernel size along the sequence
hidden_size = 100  # width of the sentence feature vector
num_words = 10     # number of rows in the toy word embedding matrix

In [58]:
# Random stand-in for a pre-trained embedding matrix: num_words rows of
# word_dim values each.
word_vec = torch.rand((num_words, word_dim))
class_num = 19  # number of output classes

In [59]:
# Bundle the hyper-parameters; keywords make the field mapping explicit.
config = conf(
    max_len=max_len,
    word_dim=word_dim,
    pos_dim=pos_dim,
    pos_dis=pos_dis,
    dropout=dropout,
    filter_num=filter_num,
    window=window,
    hidden_size=hidden_size,
)

In [60]:
model=CNN(word_vec, class_num, config)

In [61]:
print(model)

CNN(
  (word_embedding): Embedding(10, 300)
  (pos1_embedding): Embedding(67, 300)
  (pos2_embedding): Embedding(67, 300)
  (conv): Conv2d(1, 16, kernel_size=(3, 900), stride=(1, 1), padding=(1, 0), bias=False)
  (maxpool): MaxPool2d(kernel_size=(64, 1), stride=(64, 1), padding=0, dilation=1, ceil_mode=False)
  (tanh): Tanh()
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=16, out_features=100, bias=False)
  (dense): Linear(in_features=1900, out_features=19, bias=False)
)


word features

![](./Imgs/)

position features

![image.png](attachment:image.png)

In [102]:
# Fake inputs for a batch of two samples; every id is drawn from [0, 10).
import numpy as np

# Lexical input: six word ids per sample. Per the original note these are
# (left of entity 1, entity 1, right of entity 1); presumably the remaining
# three are the same triple for entity 2 -- TODO confirm.
lexical = torch.from_numpy(
    np.random.randint(0, 10, size=(2, 6))
).long()

# Sentence input: four rows of max_len ids per sample, in the order
# [word ids, position encoding relative to entity 1,
#  position encoding relative to entity 2, mask].
sentence = torch.from_numpy(
    np.random.randint(0, 10, size=(2, 4, max_len))
).long()

In [103]:
lexical.type()

'torch.LongTensor'

In [104]:
data = (sentence,lexical)

In [105]:
# Split the packed sentence tensor into its four channels (word ids, two
# position channels, mask), each reshaped to (-1, max_len).
token, pos1, pos2, mask = (
    data[0][:, channel, :].view(-1, max_len) for channel in range(4)
)
# six lexical word ids per sample
lexical = data[1].view(-1, 6)

In [106]:
# Word embedding initialised from word_vec and left trainable (not frozen).
word_embedding = nn.Embedding.from_pretrained(word_vec, freeze=False)

In [107]:
lexical

tensor([[1, 5, 3, 1, 4, 9],
        [2, 4, 2, 9, 2, 6]])

In [108]:
lexical_emb = word_embedding(lexical)

In [110]:
lexical_emb.shape

torch.Size([2, 6, 300])

In [111]:
lexical_emb = lexical_emb.view(-1,word_dim * 6)

In [112]:
# Two independent position embedding tables, each with 2 * pos_dis + 3 rows
# of pos_dim values: one for distances relative to entity 1 ...
pos1_embedding = nn.Embedding(2 * pos_dis + 3, pos_dim)
# ... and one for distances relative to entity 2.
pos2_embedding = nn.Embedding(2 * pos_dis + 3, pos_dim)

In [114]:
def encoder_layer(token, pos1, pos2):
    """Embed the word ids and both position channels, then join them.

    Each input is B*L; the result is B*L*D with D = word_dim + 2 * pos_dim,
    laid out as [word embedding | pos1 embedding | pos2 embedding].
    """
    pieces = [
        word_embedding(token),  # B*L*word_dim
        pos1_embedding(pos1),   # B*L*pos_dim
        pos2_embedding(pos2),   # B*L*pos_dim
    ]
    return torch.cat(pieces, dim=-1)

In [119]:
emb= encoder_layer(token, pos1, pos2)

In [122]:
emb.shape # (batch_size,max_len,[word_emd_feature + pos1_feaures + pos2_features ])

torch.Size([2, 64, 900])

In [121]:
# emb = self.dropout(emb)

In [125]:
dim = word_dim + 2 * pos_dim

In [129]:
dim

900

![image.png](attachment:image.png)

In [149]:
# Convolution over the embedded sentence. Each filter spans `window`
# consecutive tokens and the full feature width `dim` (here (3, 900)), so
# the output width collapses to 1.
conv = nn.Conv2d(
    in_channels=1,
    out_channels=filter_num,
    kernel_size=(window, dim),
    stride=(1, 1),
    bias=False,
    # "Same" padding along the sequence axis for an odd window:
    # (window - 1) // 2 zero rows are added at BOTH the top and the bottom,
    # so the output keeps max_len rows. This equals 1 for window == 3; a
    # hard-coded 1 would break the later view(-1, filter_num, max_len) for
    # any other window size.
    padding=((window - 1) // 2, 0),
    padding_mode='zeros'
)

In [150]:
emb_unsqueeze = emb.unsqueeze(dim=1) # expand_dims 

In [151]:
emb_unsqueeze.shape

torch.Size([2, 1, 64, 900])

![image.png](attachment:image.png)

In [152]:
conv_output = conv(emb_unsqueeze)  # B*C*L*1

In [153]:
conv_output.shape

torch.Size([2, 16, 64, 1])

In [155]:
conv_output= conv_output.view(-1,filter_num,max_len)  # B*C*L

In [156]:
conv_output.shape

torch.Size([2, 16, 64])

In [158]:
mask.shape

torch.Size([2, 64])

In [162]:
mask.shape

torch.Size([2, 64])

In [163]:
x = torch.tensor([[1], [2], [3]])
x.size()

torch.Size([3, 1])

In [164]:
mask = mask.unsqueeze(dim=1)  # B*1*L

In [166]:
mask.shape

torch.Size([2, 1, 64])

In [167]:
mask = mask.expand(-1,filter_num, -1)  # B*C*L

In [168]:
mask.shape

torch.Size([2, 16, 64])

In [171]:
mask

tensor([[[4, 4, 6,  ..., 2, 4, 0],
         [4, 4, 6,  ..., 2, 4, 0],
         [4, 4, 6,  ..., 2, 4, 0],
         ...,
         [4, 4, 6,  ..., 2, 4, 0],
         [4, 4, 6,  ..., 2, 4, 0],
         [4, 4, 6,  ..., 2, 4, 0]],

        [[4, 4, 7,  ..., 2, 0, 4],
         [4, 4, 7,  ..., 2, 0, 4],
         [4, 4, 7,  ..., 2, 0, 4],
         ...,
         [4, 4, 7,  ..., 2, 0, 4],
         [4, 4, 7,  ..., 2, 0, 4],
         [4, 4, 7,  ..., 2, 0, 4]]])

In [170]:
conv_output.shape

torch.Size([2, 16, 64])

In [172]:
conv_output = conv_output.masked_fill_(mask.eq(0), float('-inf'))  # B*C*L

In [173]:
conv_output = conv_output.unsqueeze(dim=-1)  # B*C*L*1

In [174]:
conv_output.shape

torch.Size([2, 16, 64, 1])

```python
def conv_layer(self, emb, mask):
    """Convolve the embedded sentence and mask out padded positions.

    Args:
        emb: embedded sentence, B*L*D per the shape comments below.
        mask: B*L tensor; positions equal to 0 are treated as 'PAD'.

    Returns:
        B*C*L*1 tensor of convolution outputs in which masked positions
        are overwritten (in place) with -inf.
    """
    emb = emb.unsqueeze(dim=1)  # B*1*L*D
    conv = self.conv(emb)  # B*C*L*1

    # mask, remove the effect of 'PAD'
    conv = conv.view(-1, self.filter_num, self.max_len)  # B*C*L
    mask = mask.unsqueeze(dim=1)  # B*1*L
    mask = mask.expand(-1, self.filter_num, -1)  # B*C*L
    conv = conv.masked_fill_(mask.eq(0), float('-inf'))  # B*C*L
    conv = conv.unsqueeze(dim=-1)  # B*C*L*1
    return conv
```

![image.png](attachment:image.png)

In [180]:
maxpool = nn.MaxPool2d((max_len, 1))

In [181]:
def single_maxpool_layer(conv):
    """Max-pool the B*C*L*1 convolution output and flatten it to B*C."""
    pooled = maxpool(conv)  # B*C*1*1
    return pooled.view(-1, filter_num)  # B*C

In [183]:
pool = single_maxpool_layer(conv_output)

In [184]:
# Bias-free projection of the pooled filter responses to hidden_size values.
linear = nn.Linear(filter_num, hidden_size, bias=False)

In [185]:
sentence_feature = linear(pool)

In [186]:
sentence_feature.shape

torch.Size([2, 100])

In [187]:
# ignore
# sentence_feature = self.tanh(sentence_feature)
# sentence_feature = self.dropout(sentence_feature)

In [189]:
lexical_emb.shape

torch.Size([2, 1800])

In [190]:
features = torch.cat((lexical_emb, sentence_feature), 1)

In [191]:
# Bias-free classifier over the concatenated features: hidden_size sentence
# values plus six flattened word embeddings (6 * word_dim).
dense = nn.Linear(hidden_size + 6 * word_dim, class_num, bias=False)

In [192]:
logits = dense(features)

In [193]:
logits.shape

torch.Size([2, 19])

In [194]:
logits

tensor([[ 0.1016,  0.3743, -0.4437, -0.4014, -0.2638, -0.3386,  0.7015, -0.2964,
         -0.1179,  0.0546,  0.0197, -0.0047,  0.1042, -0.1399, -0.6228,  0.0522,
         -0.1237,  0.1739, -0.0660],
        [-0.0889,  0.2659, -0.0040, -0.4476, -0.2312, -0.3862,  0.9071, -0.3338,
         -0.2121,  0.0182, -0.2821,  0.2306, -0.0267,  0.1076, -0.4381,  0.0658,
          0.0440,  0.3762, -0.1850]], grad_fn=<MmBackward>)