In [1]:
# Transformer with self-attention and 2D positional embeddings for reading form-like documents



import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import pandas as pd
from ast import literal_eval

class SelfAttentionWide(nn.Module):
    """Multi-head scaled dot-product self-attention with full-width heads.

    Each head projects the input to keys, queries and values of the full
    embedding size ``emb``; the heads' outputs are concatenated and projected
    back down to ``emb`` by ``unifyheads``.
    """

    def __init__(self, emb, heads=8, mask=False):
        """
        :param emb: size of the input (and output) embedding dimension
        :param heads: number of attention heads
        :param mask: if True, entries above the diagonal of the attention
            matrix are masked out, so a position attends only to itself and
            to earlier positions
        """

        super().__init__()

        self.emb = emb
        self.heads = heads
        self.mask = mask

        self.tokeys = nn.Linear(emb, emb * heads, bias=False)
        self.toqueries = nn.Linear(emb, emb * heads, bias=False)
        self.tovalues = nn.Linear(emb, emb * heads, bias=False)

        self.unifyheads = nn.Linear(heads * emb, emb)

    def forward(self, x):
        """
        :param x: tensor of shape (batch, tokens, emb)
        :return: the attention output of the FIRST batch element only,
            i.e. a tensor of shape (tokens, emb)
        """
        b, t, e = x.size()
        h = self.heads
        assert e == self.emb, f'Input embedding dim ({e}) should match layer embedding dim ({self.emb})'

        keys = self.tokeys(x).view(b, t, h, e)
        queries = self.toqueries(x).view(b, t, h, e)
        values = self.tovalues(x).view(b, t, h, e)

        # compute scaled dot-product self-attention

        # - fold heads into the batch dimension
        keys = keys.transpose(1, 2).contiguous().view(b * h, t, e)
        queries = queries.transpose(1, 2).contiguous().view(b * h, t, e)
        values = values.transpose(1, 2).contiguous().view(b * h, t, e)

        # - Instead of dividing the dot products by sqrt(e), we scale the
        #   queries and keys by e^(1/4) each, which has the same effect.
        queries = queries / (e ** (1 / 4))
        keys = keys / (e ** (1 / 4))

        # - get dot product of queries and keys
        dot = torch.bmm(queries, keys.transpose(1, 2))

        assert dot.size() == (b * h, t, t)

        if self.mask:
            # Mask out the upper half of the dot matrix, excluding the
            # diagonal. Built inline so the layer does not depend on an
            # external `mask_` helper.
            future = torch.triu(
                torch.ones(t, t, dtype=torch.bool, device=dot.device),
                diagonal=1,
            )
            dot = dot.masked_fill(future, float('-inf'))

        dot = F.softmax(dot, dim=2)
        # - dot now has row-wise self-attention probabilities

        # apply the self attention to the values
        out = torch.bmm(dot, values).view(b, h, t, e)

        # swap h, t back, unify heads
        out = out.transpose(1, 2).contiguous().view(b, t, h * e)

        # NOTE: only the first batch element is returned; callers pass a
        # batch of size one and expect a (tokens, emb) tensor back.
        return self.unifyheads(out)[0]


def make_context_vector(sentence, vocab):
    """Map each word of ``sentence`` to its vocabulary id.

    Words missing from ``vocab`` are replaced by the id stored under the
    ``'<RARE/>'`` key. The ids are returned as a 1-D ``torch.long`` tensor.
    """
    token_ids = [
        vocab[token] if token in vocab else vocab['<RARE/>']
        for token in sentence
    ]
    return torch.tensor(token_ids, dtype=torch.long)


class EMB(torch.nn.Module):
    """Score how well a candidate matches a target field.

    The candidate's tokens are embedded, concatenated with a learned encoding
    of their 2D positions, passed through self-attention and max-pooled over
    tokens. The pooled vector is joined with an encoding of the candidate's
    own position, projected to ``embedding_dim`` and compared to a learned
    field embedding via cosine similarity followed by a sigmoid.

    Several layers created in ``__init__`` (the dropouts, the attention
    expand/shrink pair, ``linear1``/``linear2``, ``linear_emb1``/``linear_emb2``,
    ``linear3``) are not used by ``forward``; they are kept, in their original
    construction order, so parameter initialisation and ``state_dict`` keys
    stay the same.
    """

    def __init__(self, vocab_size, embedding_dim, NUM_Field):
        super().__init__()

        half_dim = int(embedding_dim / 2)
        wide_dim = 2 * embedding_dim

        # --- encoding of each token's (x, y) position ---
        self.activation_pos_x = nn.ReLU()
        self.activation_pos_y = nn.ReLU()
        self.dropout_x = nn.Dropout(p=0.1)
        self.dropout_y = nn.Dropout(p=0.1)
        self.linear_pos_x = nn.Linear(1, half_dim)
        self.linear_pos_y = nn.Linear(1, half_dim)

        # --- single-head self-attention over [token embedding ; position] ---
        self.self_attention = SelfAttentionWide(wide_dim, 1)
        self.activation_att_expand = nn.ReLU()
        self.activation_att_shrink = nn.ReLU()
        self.linear_att_expand = nn.Linear(wide_dim, 4 * wide_dim)
        self.linear_att_shrink = nn.Linear(4 * wide_dim, wide_dim)

        # parameter-free layer norms
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
        self.norm2 = nn.LayerNorm(wide_dim, elementwise_affine=False)

        # --- encoding of the candidate's own position ---
        self.linear_box = nn.Linear(2, embedding_dim)
        self.activation_box = nn.ReLU()
        self.linear_last = nn.Linear(3 * embedding_dim, embedding_dim)

        # --- token embeddings ---
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.activation_function1 = nn.ReLU()
        self.linear1 = nn.Linear(embedding_dim, 256)
        self.linear2 = nn.Linear(256, embedding_dim)

        # --- field embedding and similarity scoring ---
        self.cos = nn.CosineSimilarity(dim=1)
        self.sig = nn.Sigmoid()
        self.emb = nn.Embedding(NUM_Field, embedding_dim)
        self.linear_emb1 = nn.Linear(embedding_dim, 256)
        self.linear_emb2 = nn.Linear(256, embedding_dim)
        self.linear3 = nn.Linear(embedding_dim, NUM_Field)

    def forward(self, inputs_tokens, field_tensor, cand_pos, inputs_pos):
        """Return the sigmoid of the cosine similarity between candidate and field.

        :param inputs_tokens: token ids of the candidate's tokens
        :param field_tensor: id(s) of the field to compare against
        :param cand_pos: candidate position; fed to a ``Linear(2, embedding_dim)``
        :param inputs_pos: per-token positions; column 0 and column 1 are
            encoded separately
        """
        # Positions are negated before the ReLU, then each coordinate gets
        # its own linear projection.
        flipped = -inputs_pos
        x_coord = flipped[:, 0].unsqueeze(1)
        y_coord = flipped[:, 1].unsqueeze(1)
        x_code = self.linear_pos_x(self.activation_pos_x(x_coord))
        y_code = self.linear_pos_y(self.activation_pos_y(y_coord))
        token_positions = torch.cat((x_code, y_code), 1)

        box_code = self.norm(self.linear_box(cand_pos))

        # Attend over [token embedding ; position code], then max-pool over
        # the token dimension to a single row vector.
        token_features = torch.cat(
            (self.embeddings(inputs_tokens), token_positions), 1
        )
        attended = self.self_attention(token_features.unsqueeze(0))
        pooled = torch.max(attended, 0).values.view(1, -1)
        pooled = self.norm2(pooled)

        joint = torch.cat((pooled, box_code), 1)
        joint = self.linear_last(self.activation_box(joint))

        field_vector = self.emb(field_tensor)
        similarity = self.cos(joint, field_vector)
        return self.sig(similarity)



# --- vocabulary and experiment configuration ---
# Use a context manager so the vocabulary file handle is closed promptly.
with open('vocab.json') as vocab_file:
    vocab = json.load(vocab_file)
vocab_size = len(vocab)
num_labels = 2
num_field = 2
labels = {"Positive": 1, "Negative": 0}
field_id = {"date": 0, "total": 1}
num_dim = 10
num_epochs = 50


# --- training data ---
# `sep` must be passed by keyword: recent pandas versions accept only the
# path positionally.
train_data = pd.read_csv('small_sample_training.csv', sep='\t')
# These columns are stored as Python-literal strings; parse them back.
train_data['cand_pos'] = train_data['cand_pos'].apply(literal_eval)
train_data['tokens_pos'] = train_data['tokens_pos'].apply(literal_eval)
train_data['can_tokens'] = train_data['can_tokens'].apply(literal_eval)

print(f'Training has very small number of {len(train_data):,} samples')

# Index 0 holds the fraction of positive samples and index 1 the fraction of
# negative samples. The training loop indexes this tensor by label id
# (Negative -> 0, Positive -> 1), so each sample's loss is scaled by the
# frequency of the *other* class.
positive_fraction = sum(train_data['can_targets'] == 'Positive') / len(train_data)
negative_fraction = sum(train_data['can_targets'] == 'Negative') / len(train_data)
weight_classes = torch.FloatTensor([positive_fraction, negative_fraction])


# --- model, loss and optimiser ---
model = EMB(vocab_size, num_dim, num_field)
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total
print(f'The model has {count_parameters(model):,} trainable parameters')

#TRAINING
# Trains the model one sample at a time (no batching) for `num_epochs` epochs
# and prints the mean weighted loss and the accuracy on the training data
# after each epoch.
model.train()
logs = {}
for epoch in range(num_epochs):
    # Per-epoch accumulators for the training metrics.
    train_total_loss = 0
    train_correct=0
    # The test_* accumulators are reset every epoch but never updated in this
    # loop -- presumably consumed by evaluation code elsewhere; TODO confirm.
    test_total_loss_total = 0
    test_correct_total=0
    test_pred_total=[]
    test_target_total=[]
    test_total_loss_date = 0
    test_correct_date=0
    test_pred_date=[]
    test_target_date=[]
    model.train() 
    for i,row in train_data.iterrows():
        optimizer.zero_grad()
        # Build the model inputs for this candidate from the row's columns.
        context_vector = make_context_vector(row['can_tokens'], vocab)
        inputs_pos=torch.tensor(row['tokens_pos'], dtype=torch.float32)
        field_tensor = torch.tensor([field_id[row['can_labels']]], dtype=torch.long)
        cand_tensor=torch.tensor(row['cand_pos'], dtype=torch.float32).view(1, -1)
        target=torch.tensor([labels[row['can_targets']]]).to(torch.float32)
        # NOTE: despite the name, `log_probs` is the model's sigmoid output
        # (what BCELoss consumes), not a log-probability.
        log_probs = model(context_vector,field_tensor,cand_tensor,inputs_pos)
        # Scale the sample's loss by the class weight looked up by label id.
        loss=loss_function(log_probs, target)*(weight_classes[labels[row['can_targets']]])
        train_total_loss += loss.item()
        loss.backward()
        optimizer.step() 
        # A prediction counts as correct when the rounded output equals the target.
        train_correct += (log_probs.round() == target).float().sum()

    # Accuracy as a percentage and mean weighted loss per sample.
    train_accuracy = 100 * train_correct / len(train_data)
    train_loss=train_total_loss/len(train_data)
    print("Epoch {}/{}, Train_loss: {:.3f}, Train_accuracy: {:.3f}".format(epoch+1, num_epochs,train_loss, train_accuracy))







Training has very small number of 100 samples
The model has 21,204 trainable parameters
Epoch 1/50, Train_loss: 0.270, Train_accuracy: 26.000
Epoch 2/50, Train_loss: 0.266, Train_accuracy: 26.000
Epoch 3/50, Train_loss: 0.264, Train_accuracy: 30.000
Epoch 4/50, Train_loss: 0.263, Train_accuracy: 38.000
Epoch 5/50, Train_loss: 0.261, Train_accuracy: 45.000
Epoch 6/50, Train_loss: 0.260, Train_accuracy: 51.000
Epoch 7/50, Train_loss: 0.260, Train_accuracy: 54.000
Epoch 8/50, Train_loss: 0.259, Train_accuracy: 61.000
Epoch 9/50, Train_loss: 0.258, Train_accuracy: 61.000
Epoch 10/50, Train_loss: 0.257, Train_accuracy: 66.000
Epoch 11/50, Train_loss: 0.256, Train_accuracy: 67.000
Epoch 12/50, Train_loss: 0.255, Train_accuracy: 68.000
Epoch 13/50, Train_loss: 0.255, Train_accuracy: 68.000
Epoch 14/50, Train_loss: 0.254, Train_accuracy: 68.000
Epoch 15/50, Train_loss: 0.252, Train_accuracy: 67.000
Epoch 16/50, Train_loss: 0.251, Train_accuracy: 66.000
Epoch 17/50, Train_loss: 0.249, Train_acc