In [22]:
import math
import random
import numpy as np
import json
import torch
from torch import nn
from collections import defaultdict
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModel, AutoConfig
from datasets import load_dataset
from torch.optim import AdamW

from sklearn.metrics import f1_score

In [23]:
def _read_jsonl(path):
    """Read a JSON Lines file and return a list with one parsed object per line.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Whitespace-only lines (e.g. an extra newline at the end of the file)
        # are skipped because json.loads would reject them.
        return [json.loads(line) for line in file if line.strip()]


# Locations of the three SciCite splits, relative to the working directory.
file_path_train = 'scicite/train.jsonl'
file_path_dev = 'scicite/dev.jsonl'
file_path_test = 'scicite/test.jsonl'

# Each split is a list of parsed records; later cells read their 'string' and
# 'label' fields.
train_data = _read_jsonl(file_path_train)
dev_data = _read_jsonl(file_path_dev)
test_data = _read_jsonl(file_path_test)

In [24]:
class CitationsDatasetWithoutInputExample():
    """Map-style view over citation records yielding ``(text, label_id)`` pairs.

    Every record must provide a ``'string'`` entry (the citation text) and a
    ``'label'`` entry whose value is one of the keys of ``label_to_id``.
    """

    # Integer class id assigned to each intent label.
    label_to_id = {'background': 0, 'method': 1, 'result': 2}

    def __init__(self, data):
        # Indexable collection of record dicts, e.g. parsed JSON lines.
        self.data = data

    def __len__(self):
        """Number of records in the underlying collection."""
        return len(self.data)

    def __getitem__(self, item):
        """Return ``(citation_text, label_id)`` for the record at position ``item``."""
        record = self.data[item]
        text = record['string']
        label_id = CitationsDatasetWithoutInputExample.label_to_id[record['label']]
        return text, label_id

In [25]:
# Training split: shuffled, served in mini-batches of `train_batch_size`.
train_batch_size = 16
train_dataset = CitationsDatasetWithoutInputExample(train_data)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)

In [26]:
# Development split: kept in file order, served in mini-batches of `dev_batch_size`.
dev_batch_size = 16
dev_dataset = CitationsDatasetWithoutInputExample(dev_data)
dev_dataloader = DataLoader(
    dataset=dev_dataset,
    batch_size=dev_batch_size,
    shuffle=False,
)

In [27]:
class CitationIntentEncoder(nn.Module):
    """Sentence encoder: a transformer backbone followed by a dense + tanh head.

    The hidden state at sequence position 0 of the backbone's
    ``last_hidden_state`` is projected by a 768 -> 768 linear layer and
    squashed with tanh.
    """

    def __init__(self, sciBert):
        """Wrap ``sciBert``, a module whose output exposes ``last_hidden_state``."""
        super().__init__()
        hidden_size = 768
        # Attribute names double as state_dict key prefixes; keep them stable.
        self.sentence_transformer = sciBert
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return one tanh-activated vector per sequence."""
        backbone_output = self.sentence_transformer(input_ids, attention_mask)
        # Slice out position 0 of every sequence in the batch.
        first_position = backbone_output.last_hidden_state[:, 0]
        return self.activation(self.dense(first_position))

def load_CLModel(save_directory):
    """Rebuild a ``CitationIntentEncoder`` from a saved checkpoint directory.

    Args:
        save_directory (str): Directory holding the transformer configuration
            readable by ``AutoConfig.from_pretrained`` and the encoder weights
            in ``CLModel_state_dict.bin``.

    Returns:
        CitationIntentEncoder: The encoder with the saved weights loaded.
    """
    # The backbone is instantiated from its configuration only; all weights
    # are filled in from the state dict loaded below.
    config = AutoConfig.from_pretrained(save_directory)
    sciBert = AutoModel.from_config(config)
    CL_model = CitationIntentEncoder(sciBert)

    # map_location='cpu' lets a checkpoint saved from an accelerator be read on
    # a machine without one; load_state_dict then copies the values into the
    # freshly built model's own parameters.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(save_directory + '/CLModel_state_dict.bin', map_location='cpu')
    CL_model.load_state_dict(state_dict)
    return CL_model

In [28]:
class CitationIntentClassifier(nn.Module):
    """Citation-intent classifier: tokenizer -> sentence encoder -> linear head.

    Raw strings go in, one row of ``num_labels`` logits per string comes out.
    """

    def __init__(self, model_path, num_labels):
        """Build the classifier.

        Args:
            model_path: Checkpoint directory handed to ``load_CLModel``.
            num_labels (int): Number of output classes of the linear head.
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
        self.sentence_transformer = load_CLModel(model_path)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_texts):
        """Tokenize ``input_texts`` and return the class logits.

        Args:
            input_texts: A single string or an iterable of strings.

        Returns:
            torch.Tensor: Output of the linear head applied to the encoder output.
        """
        # Normalise to a concrete list so tuples, generators and a lone string
        # are all handled the same way.
        texts = [input_texts] if isinstance(input_texts, str) else list(input_texts)
        # NOTE(review): every batch is padded to the full 256 tokens; padding to
        # the longest sequence in the batch would probably be cheaper - confirm
        # before changing.
        tokenised = self.tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=256)

        # The tokenizer returns its tensors independently of where the model
        # lives; move them next to the weights so the module also works after
        # `.to(device)`.
        device = self.classifier.weight.device
        embeddings = self.sentence_transformer(
            input_ids=tokenised['input_ids'].to(device),
            attention_mask=tokenised['attention_mask'].to(device),
        )
        return self.classifier(embeddings)

def train_epoch(model, dataloader, loss_func, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(inputs)`` to obtain the outputs.
        dataloader: Iterable of ``(inputs, labels)`` batches, ``labels`` a tensor.
        loss_func: Callable ``loss_func(output, labels)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        float: Mean loss per batch, or ``nan`` if ``dataloader`` yielded nothing.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for input_texts, labels in dataloader:
        optimizer.zero_grad()
        output = model(input_texts)
        # Labels arrive wherever the loader produced them; the loss needs them
        # on the same device as the model output.
        loss = loss_func(output, labels.to(output.device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Counting batches (instead of len(dataloader)) supports length-less
    # iterables and avoids a ZeroDivisionError on an empty one.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Training loss: {mean_loss}")
    return mean_loss

def evaluate(model, dataloader, loss_func):
    """Compute mean loss and accuracy over ``dataloader`` without gradient tracking.

    Args:
        model: Module called as ``model(inputs)``; the outputs must carry the
            class scores along dimension 1.
        dataloader: Iterable of ``(inputs, labels)`` batches, ``labels`` a tensor.
        loss_func: Callable ``loss_func(output, labels)`` returning a scalar tensor.

    Returns:
        tuple[float, float]: ``(mean loss per batch, accuracy)``; both are
        ``nan`` if ``dataloader`` yielded nothing.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    num_batches = 0
    num_samples = 0
    with torch.no_grad():
        for input_texts, labels in dataloader:
            output = model(input_texts)
            # Keep labels on the same device as the output for both the loss
            # and the comparison below.
            labels = labels.to(output.device)
            total_loss += loss_func(output, labels).item()
            total_correct += (output.argmax(1) == labels).sum().item()
            num_batches += 1
            num_samples += labels.size(0)

    # Divide by what was actually seen, so the figures stay correct when the
    # loader serves fewer samples than its dataset holds, and guard against an
    # empty loader.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / num_samples if num_samples else float('nan')
    print(f"Evaluation loss: {mean_loss}")
    print(f"Evaluation accuracy: {accuracy}")
    return mean_loss, accuracy


In [30]:
# Test split: kept in file order, served in mini-batches of `test_batch_size`.
test_batch_size = 16
test_dataset = CitationsDatasetWithoutInputExample(test_data)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
)

In [31]:
def test(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for input_texts, labels in dataloader:
            output = model(input_texts)
            _, predicted_labels = torch.max(output, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return predictions, true_labels


def train_test_loop(model_name, learning_rate=2e-5, num_epochs=5, num_labels=3):
    """Fine-tune a ``CitationIntentClassifier`` and report macro-F1 on the test split.

    Reads the module-level ``train_dataloader``, ``dev_dataloader`` and
    ``test_dataloader``. After every epoch the development split is evaluated;
    after the last epoch the test split is scored.

    Args:
        model_name: Checkpoint directory passed to ``CitationIntentClassifier``.
        learning_rate (float): Adam learning rate.
        num_epochs (int): Number of passes over the training data.
        num_labels (int): Number of intent classes.

    Returns:
        float: Macro-averaged F1 score on the test split.
    """
    citation_intent_classifier = CitationIntentClassifier(model_name, num_labels)

    optimizer = torch.optim.Adam(citation_intent_classifier.parameters(), lr=learning_rate)
    loss_func = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        train_epoch(citation_intent_classifier, train_dataloader, loss_func, optimizer)
        evaluate(citation_intent_classifier, dev_dataloader, loss_func)

    predictions, true_labels = test(citation_intent_classifier, test_dataloader)
    # Macro averaging weights every class equally, whatever its frequency.
    f1 = f1_score(true_labels, predictions, average='macro')
    print(f"F1 Score: {f1}")
    return f1

In [32]:
# Run the full train / dev-evaluate / test pipeline on the selected checkpoint
# directory. The commented-out call targets the alternative checkpoint.
train_test_loop(model_name='./sectionPaper_mlp_without_hard')
# train_test_loop(model_name='./sectionPaper_without_hard')

Epoch 1/5
{'input_ids': tensor([[ 102,  111, 4234,  ...,    0,    0,    0],
        [ 102,  188,  121,  ...,    0,    0,    0],
        [ 102,  147, 2100,  ...,    0,    0,    0],
        ...,
        [ 102,  190,  111,  ...,    0,    0,    0],
        [ 102,  185, 1058,  ...,    0,    0,    0],
        [ 102,  260, 3291,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
loss caluclated
Training loss: 0.002158372670181038
{'input_ids': tensor([[  102,   407,   545,  ...,     0,     0,     0],
        [  102,   101, 10890,  ...,     0,     0,  