The system has two LSTM layers with 50 units, <br>
one for each context side, which concatenates the <br>
outputs and passes that to a feedforward layer <br>
with 64 neurons, followed by a dropout layer at <br>
rate 0.5, and a final one-neuron output layer of <br>
sigmoid activation.

In [25]:
import torch
from torch import tensor
from torch import nn
from torch import optim
import random

In [26]:
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('roberta.base', checkpoint_file='model.pt')
roberta.eval();  # disable dropout (or leave in train mode to finetune)

In [27]:
from json import loads

def load_data(file_name):
    data = []
    with open(file_name) as file:
            for line in file.readlines():
                data.append(loads(line))
    return data
                
def data2input(point,pad=48):
    word = point["word"]
    s1 = point["sentence1"]
    s2 = point["sentence2"]
    t1 = roberta.encode(word,s1)
    t2 = roberta.encode(word,s2)
    p1 = torch.zeros(pad-len(t1))
    t1 = torch.cat((t1,p1))
    p2 = torch.zeros(pad-len(t2))
    t2 = torch.cat((t2,p2))
    return t1,t2


In [28]:
class NeuralNet(nn.Module):
    
    def __init__(self, dim0, dim1, output_dim): # output = number tags
        super().__init__()
        
        self.lstm = nn.LSTM(dim0, dim1, 1, bias=False)
        #self.hidden_layer = nn.Linear(2*dim1,dim2)
        self.dropout = nn.Dropout(p=0.5)
        self.output_layer = nn.Linear(2*dim1, output_dim)
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, t1,t2):

        _,(hidden_rep1,_) = self.lstm(t1.unsqueeze(0).unsqueeze(0))

        _,(hidden_rep2,_) = self.lstm(t2.unsqueeze(0).unsqueeze(0))

        hidden_rep = torch.cat((hidden_rep1.squeeze(0).squeeze(0),hidden_rep2.squeeze(0).squeeze(0)))
        
        #final_hidden_rep = self.hidden_layer(hidden_rep)
        
        drop = self.dropout(hidden_rep)
        
        output = self.output_layer(drop.squeeze(0))
        #output = self.relu(output)
        output = self.sigmoid(output)
        
        return output

In [29]:
train_data = load_data('train.jsonl')
test_data = load_data('test.jsonl')
val_data = load_data('val.jsonl')

In [30]:
our_wic = NeuralNet(48,127,2)

In [None]:
# Model Train 

epochs = 1000
ce = nn.CrossEntropyLoss()
softmax = nn.Softmax(dim=0)
optimizer = optim.SGD(our_wic.parameters(), lr=0.02)

optimizer = optim.SGD([
                {'params': our_wic.parameters()},
                {'params': roberta.parameters(), 'lr': 0.02}
            ], lr=0.02)

for i in range(epochs):
    random.shuffle(train_data)
    print("Epoch:",i)
    total_loss = 0
    for point in train_data:
        our_wic.train()
        roberta.train()
        optimizer.zero_grad()
        
        # a) calculate probs / get an output
        t1,t2 = data2input(point)
        y_raw = our_wic(t1,t2)
        #y_hat = softmax(y_raw)
        
        y = tensor(int(point["label"]))
        # b) compute loss
        loss = ce(y_raw.unsqueeze(0),y.unsqueeze(0))
        total_loss += loss
        # c) get the gradient
        loss.backward()

        # d) update the weights
        optimizer.step()
        
    print(total_loss/len(train_data))
    
    our_wic.eval()
    roberta.eval()

    score = 0
    for point in train_data:
        t1,t2 = data2input(point)
        output = our_wic(t1,t2).argmax()
        if bool(output) == point["label"]:
            score += 1

    print(score/len(train_data))
    
    score = 0
    for point in val_data:
        t1,t2 = data2input(point)
        output = our_wic(t1,t2).argmax()
        if bool(output) == point["label"]:
            score += 1

    print(score/len(val_data))

Epoch: 0
tensor(0.6940, grad_fn=<DivBackward0>)
0.5442151805453206
0.4952978056426332
Epoch: 1
tensor(0.6887, grad_fn=<DivBackward0>)
0.5514001473839352
0.49216300940438873
Epoch: 2
tensor(0.6880, grad_fn=<DivBackward0>)
0.5484524686809138
0.48589341692789967
Epoch: 3
tensor(0.6877, grad_fn=<DivBackward0>)
0.5442151805453206
0.5047021943573667
Epoch: 4
tensor(0.6862, grad_fn=<DivBackward0>)
0.5473470891672808
0.5109717868338558
Epoch: 5
tensor(0.6860, grad_fn=<DivBackward0>)
0.5490051584377302
0.5062695924764891
Epoch: 6
tensor(0.6840, grad_fn=<DivBackward0>)
0.5342667649226235
0.49843260188087773
Epoch: 7
tensor(0.6871, grad_fn=<DivBackward0>)
0.5464259395725866
0.512539184952978
Epoch: 8
tensor(0.6867, grad_fn=<DivBackward0>)
