In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, AutoTokenizer, BertModel, DistilBertModel
from sklearn.metrics import f1_score

import csv
import re
import pathlib
import sys
import numpy as np

In [5]:
# Tokenizer configuration
# MAX_LEN is the max_length handed to tokenizer.encode for every string.
MAX_LEN = 50
# WordPiece tokenizer for the uncased BERT base checkpoint (lower-casing enabled).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenizerfnc(str):
    """Encode one string with the module-level BERT ``tokenizer``.

    Args:
        str: text to encode (the name shadows the builtin; kept because it is
            part of the existing signature).

    Returns:
        Whatever ``tokenizer.encode`` returns for the text when special tokens
        are added, ``max_length`` is ``MAX_LEN`` and padding to that maximum
        length is requested.
    """
    encode_options = dict(
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True,
    )
    token_ids = tokenizer.encode(str, **encode_options)
    return token_ids

In [6]:
# data loader - for validation and test
class ConceptLoader(Dataset):
    """In-memory dataset of tokenized concept pairs read from a CSV file.

    Each non-empty CSV row is interpreted as
    ``concept_0, concept_1, feature, feature, ...``. Both concepts are
    tokenized and every remaining column is parsed as an integer.

    Every item is a list of five ``torch.LongTensor`` objects::

        [ids_0, attn_mask_0, ids_1, attn_mask_1, feats]

    Args:
        file_name: path of the CSV file to load.
        tokenize_fn: optional callable mapping a string to a list of token
            ids. Defaults to the module-level ``tokenizerfnc``.

    Raises:
        ValueError: if a feature column cannot be parsed by ``int``.
    """

    def __init__(self, file_name, tokenize_fn=None):
        if tokenize_fn is None:
            tokenize_fn = tokenizerfnc

        self.data = []
        with open(file_name, newline='') as file:
            # The reader pulls lines lazily from the file handle, so the rows
            # must be consumed while the file is still open.
            for row in csv.reader(file, delimiter=','):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to load.
                    continue

                tokenized_concept_0 = tokenize_fn(row[0])
                tokenized_concept_1 = tokenize_fn(row[1])
                # Each mask is derived from its OWN token ids.
                attn_masks_0 = self._attention_mask(tokenized_concept_0)
                attn_masks_1 = self._attention_mask(tokenized_concept_1)
                # CSV cells are strings; LongTensor needs actual integers.
                feats = [int(value) for value in row[2:]]

                self.data.append([
                    torch.LongTensor(tokenized_concept_0),
                    torch.LongTensor(attn_masks_0),
                    torch.LongTensor(tokenized_concept_1),
                    torch.LongTensor(attn_masks_1),
                    torch.LongTensor(feats),
                ])

    @staticmethod
    def _attention_mask(token_ids):
        """Return 1 for every non-zero token id and 0 for id 0.

        Id 0 is treated as padding (it is the [PAD] id of the
        bert-base-uncased vocabulary).
        """
        return [int(token_id != 0) for token_id in token_ids]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

In [6]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# ---------------------------------------------------------------
# Loss
criterion = nn.CrossEntropyLoss()

# ---------------------------------------------------------------
# Hyper-parameters
NUM_EPOCHS, BATCH_SIZE = 5, 64
LEARNING_RATE = 2e-5

In [10]:
class ConceptEmbedding(nn.Module):
    """Embed two tokenized concepts with a shared BERT encoder and two LSTMs.

    Both inputs go through the same ``bert_layer``; the first concept is then
    summarised by ``lstm1`` and the second by ``lstm2``. ``forward`` returns
    the last-time-step output of the top LSTM layer for each concept.
    """

    def __init__(self):
        super(ConceptEmbedding, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        # Read the embedding width from the loaded checkpoint instead of
        # hard-coding it (768 for bert-base-uncased).
        self.embed_size = self.bert_layer.config.hidden_size
        self.hidden_size = 256
        self.nlayers = 2
        self.out_size = 2  # not used yet; reserved for the planned classifier head
        self.lstm1 = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, num_layers=self.nlayers)
        self.lstm2 = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, num_layers=self.nlayers)

        # TODO, include other features
        # self.mlp =  ADD IN MLP

    def _encode(self, token_ids, attn_mask, lstm):
        """Run one concept through BERT and ``lstm``; return the last LSTM output."""
        # Index [0] instead of tuple-unpacking: it selects the per-token hidden
        # states both when BERT returns a plain tuple and when it returns a
        # dict-like ModelOutput (whose iteration yields key names, not tensors).
        hidden = self.bert_layer(token_ids, attention_mask=attn_mask)[0]
        # Drop the first position (the leading special token), then
        # BxLxH -> LxBxH because the LSTMs are not batch_first.
        hidden = hidden[:, 1:].permute(1, 0, 2)
        outputs, _ = lstm(hidden)
        # NOTE(review): if inputs are padded to a fixed length, this last time
        # step is usually a padding position -- consider packing by length.
        return outputs[-1]

    def forward(self, x1, x2, attn_mask1, attn_mask2, others):
        """Return one embedding per concept.

        Args:
            x1, x2: token-id tensors for the two concepts, batch first (B x L).
            attn_mask1, attn_mask2: attention masks matching ``x1`` / ``x2``.
            others: extra reference features; accepted but not used yet.

        Returns:
            Tuple ``(h1, h2)``, each of shape (B, hidden_size).
        """
        h1 = self._encode(x1, attn_mask1, self.lstm1)
        h2 = self._encode(x2, attn_mask2, self.lstm2)
        # TODO
        # Bring in reference features
        return h1, h2

In [11]:
#############################
# HELPER FUNCTIONS

def train(model, train_loader, epoch):
    '''
    Run one training epoch and print the mean batch loss and the F1 score.

    Relies on the module-level ``DEVICE``, ``criterion`` and ``optimizer``
    (``optimizer`` must be created before this function is called).

    Args:
        model: network invoked as ``model(feats, attn_mask)``. Its output is
            passed to ``criterion`` and arg-maxed over dim 1, so it is
            presumably a (batch, num_classes) score tensor -- TODO confirm.
        train_loader: iterable whose batches unpack into six tensors
            ``(feats1, label1, attn_mask1, feats2, label2, attn_mask2)``.
            Only the first triple is used for the loss and the score.
            NOTE(review): confirm whether the second triple should be
            trained on as well.
        epoch: zero-based epoch index, used only in the log line.

    Ret:
        None. Progress, loss and score are printed.
    '''
    all_labels = []
    all_predictions = []
    running_loss = 0
    model.train()
    for batch_num, (feats1, label1, attn_mask1, feats2, label2, attn_mask2) in enumerate(train_loader):
        feats = feats1.to(DEVICE)
        label = label1.to(DEVICE)
        attn_mask = attn_mask1.to(DEVICE)

        optimizer.zero_grad()
        out = model(feats, attn_mask)

        # task loss -- computed against the label copy that lives on DEVICE
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if batch_num % 50 == 0:
            print("Batch ", batch_num, " done. Loss is ", loss.item())

        # getting predictions: argmax of the scores equals argmax of their
        # softmax, and they are scored against the same labels as the loss
        pred_labels = torch.argmax(out.detach(), dim=1)
        all_labels.extend(label.cpu().tolist())
        all_predictions.extend(pred_labels.cpu().tolist())

        del feats, label, attn_mask

    running_loss /= len(train_loader)
    print("Loss of epoch ", (epoch+1), " is ", running_loss)
    score = f1_score(all_labels, all_predictions)
    print("Train Score: ", score)
    #print(all_predictions)
    return


def validate(model, valid_loader, epoch):
    '''
    Score the model on a held-out loader and print the F1 score.

    Relies on the module-level ``DEVICE``.

    Args:
        model: network to evaluate. If it defines ``evaluate(feats, attn_mask)``
            that method is used; otherwise the model itself is called with the
            same arguments. The output is arg-maxed over dim 1, so it is
            presumably a (batch, num_classes) score tensor -- TODO confirm.
        valid_loader: iterable whose batches unpack into three tensors
            ``(feats, label, attn_mask)``.
        epoch: accepted for symmetry with ``train``; not used.

    Ret:
        None. The score is printed.
    '''

    all_predictions = []
    all_labels = []

    model.eval()
    # A plain nn.Module has no evaluate(); fall back to the forward call.
    predict = getattr(model, 'evaluate', model)

    # No gradients are needed for scoring; skipping them saves memory.
    with torch.no_grad():
        for feats, label, attn_mask in valid_loader:
            feats = feats.to(DEVICE)
            label = label.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            out = predict(feats, attn_mask)

            # argmax of the scores equals argmax of their softmax
            pred_labels = torch.argmax(out, dim=1)
            all_labels.extend(label.cpu().tolist())
            all_predictions.extend(pred_labels.cpu().tolist())

            del feats, label, attn_mask, out
            torch.cuda.empty_cache()

    # calculate f1-scores
    score = f1_score(all_labels, all_predictions)
    print("Test score: ", score)

    return