In [2]:
import pandas as pd
import json

data_path_aclarc = "./acl-arc/scaffolds/sections-scaffold-train.jsonl"
data_path_scicite = "./scicite/scaffolds/sections-scaffold-train.jsonl"

# The scaffold file is JSON Lines: parse every line as one standalone record.
with open(data_path_aclarc, encoding='utf-8') as data_file:
    data = list(map(json.loads, data_file))

# Build the frame once the file is closed and discard exact duplicate rows.
df = pd.DataFrame(data).drop_duplicates()


#### Positive Sampling

In [3]:
# Candidate grouping keys for positive sampling: rows sharing these column
# values are grouped together by the sampling functions.
sort_cols_section_paper = [
    'section_name',
    'cited_paper_id',
]
sort_cols_section = ['section_name']

# Grouping key currently in use (an alias of the section + cited-paper list).
sort_cols = sort_cols_section_paper

# Output column names applied by get_pos_samples_rj.
final_cols = [
    'text',
    'text_pos',
    'section_name',
    'citing_paper_id',
    'cited_paper_id',
]

def split_and_concatenate(group):
    """Pair the first half of a group with its second half, row by row.

    The group is cut at ``len(group) // 2``. The ``text`` of the rows before
    the cut becomes the ``text`` column; the rows from the cut onwards keep
    all of their columns, with their own ``text`` renamed to ``text_pos``.
    Both parts are re-indexed from 0 and joined side by side, so when the
    second part is longer its extra rows have a missing ``text``.

    Args:
        group: DataFrame containing at least a ``text`` column.

    Returns:
        DataFrame with ``text`` followed by the second half's columns.
    """
    midpoint = len(group) // 2

    anchors = group.iloc[:midpoint]['text'].reset_index(drop=True)
    positives = (
        group.iloc[midpoint:]
        .reset_index(drop=True)
        .rename(columns={'text': 'text_pos'})
    )

    # Horizontal concat aligns on the fresh 0..n-1 indexes of both parts.
    return pd.concat([anchors, positives], axis=1)

def get_pos_samples_concat(df):
    """Build anchor/positive pairs by halving each group and pairing rows by position.

    Rows are grouped by the module-level ``sort_cols`` and each group is
    handed to ``split_and_concatenate``. The input frame is deep-copied first,
    so it is not modified.

    Args:
        df: DataFrame containing ``text`` and every column in ``sort_cols``.

    Returns:
        DataFrame of all groups' pairs stacked together with a fresh 0..n-1 index.
    """
    # Group on duplicated key columns: with `include_groups=False` the grouping
    # columns are withheld from the applied function, so the copies are dropped
    # while the original key columns stay in the output.
    shadow_keys = [f'{col}_drop' for col in sort_cols]

    working = df.copy(deep=True)
    working[shadow_keys] = working[sort_cols]

    paired = working.groupby(shadow_keys).apply(split_and_concatenate, include_groups=False)
    return paired.reset_index(drop=True)


In [4]:
def get_pos_samples_rj(df):
    """Build anchor/positive pairs by right-joining each group's first part onto its second part.

    Rows are sorted and grouped by the module-level ``sort_cols``. Inside a
    group, rows positioned before the cutoff are anchors and the remaining
    rows are positives. The join uses only the group keys, so every anchor of
    a group is combined with every positive of that group. Because it is a
    right join, positives whose group has no anchor are kept with a missing
    ``text``.

    Args:
        df: DataFrame containing ``text``, ``citing_paper_id`` and the
            columns listed in ``sort_cols``.

    Returns:
        DataFrame whose columns are named by the module-level ``final_cols``.
    """
    ordered = df.sort_values(sort_cols).reset_index(drop=True)

    # Position of each row inside its group, and the group size derived from it.
    ordered['WithinGroupID'] = ordered.groupby(sort_cols).cumcount()
    ordered['group_sizes'] = ordered.groupby(sort_cols)['WithinGroupID'].transform('max') + 1

    # NOTE(review): the midpoint relies on Series.round(); how exact .5 values
    # round decides which side an odd-sized group's middle row lands on - confirm.
    ordered['cutoff'] = (ordered['group_sizes'] / 2).round().astype(int)

    is_anchor = ordered['WithinGroupID'] < ordered['cutoff']
    is_positive = ordered['WithinGroupID'] >= ordered['cutoff']

    anchors = ordered[is_anchor]
    positives = ordered[is_positive].reset_index(drop=True)
    # Suffix every positive-side column so nothing collides in the merge.
    positives.columns = [f'{col}_pos' for col in positives.columns]

    pos_keys = [f'{col}_pos' for col in sort_cols]
    joined = anchors.merge(positives, how='right', left_on=sort_cols, right_on=pos_keys)

    # Anchor text plus the positive row's text and identifying columns.
    kept = joined[['text', 'text_pos', 'section_name_pos', 'citing_paper_id_pos', 'cited_paper_id_pos']]
    return kept.set_axis(final_cols, axis=1)

In [7]:
# Build anchor/positive pair tables from the loaded frame with both sampling strategies.
concat = get_pos_samples_concat(df)
rj = get_pos_samples_rj(df)

In [41]:
def handle_na(df):
    """Fill missing anchors with their own positive text, in place.

    Wherever ``text`` is missing it is overwritten with the value of
    ``text_pos`` from the same row. Per the original author's note, feeding
    the same sentence twice leaves RoBERTa's dropout as the only difference
    between the two views, so such rows are treated as unsupervised pairs.

    Args:
        df: DataFrame with ``text`` and ``text_pos`` columns; modified in place.

    Returns:
        None.
    """
    missing = df['text'].isna()
    df.loc[missing, 'text'] = df.loc[missing, 'text_pos']

# Fill missing anchors in place in both pair tables.
handle_na(concat)
handle_na(rj)

# Only the concat-based pairs are exported; the tokenisation step reads this CSV back.
concat[['text', 'text_pos']].to_csv('data_file.csv', index=False)

#### Hard Negative

#### Tokenise data

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the exported pair table; its rows are accessed below through the 'train' split.
dataset = load_dataset("csv", data_files="data_file.csv")

# Tokenizer for the roberta-base checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Text columns to tokenize; they are also the columns removed after mapping.
column_names = dataset['train'].column_names

# Testing
def tokenize(examples, max_length=256):
    """Tokenize every text column of a batch and regroup the encodings row by row.

    Each column named in the module-level ``column_names`` is passed through
    the module-level ``tokenizer``, truncated and padded to ``max_length``.
    The per-column results are then regrouped so that each row carries the
    encodings of all its columns together.

    Args:
        examples: batch mapping column name -> list of texts.
        max_length: length every sequence is truncated/padded to.

    Returns:
        Dict mapping each tokenizer feature to a list with one entry per row;
        each entry lists that feature's values in ``column_names`` order.
    """
    # One encoding per column, in column order.
    encoded_columns = [
        tokenizer(examples[name], truncation=True, padding='max_length', max_length=max_length)
        for name in column_names
    ]

    # The first column determines how many rows the batch holds.
    num_rows = len(examples[column_names[0]])

    # Feature names are taken from the first column's encoding.
    return {
        feature: [
            [encoded[feature][row] for encoded in encoded_columns]
            for row in range(num_rows)
        ]
        for feature in encoded_columns[0]
    }

# Layout per feature (input_ids / attention_mask): [#rows, #text columns (anchor, pos[, neg]), max_length].
# The raw text columns are removed, so only the tokenizer features remain.
tokenized = dataset['train'].map(tokenize, batched=True, remove_columns=column_names)

  from .autonotebook import tqdm as notebook_tqdm


#### Create batch of examples

In [2]:
from torch.utils.data import DataLoader

# Return torch tensors when rows are read through the DataLoaders.
tokenized.set_format("torch")

# 80/20 train/test split sizes.
train_size = int(0.8 * len(tokenized))
test_size = len(tokenized) - train_size

# Shuffle once with a fixed seed and carve both splits out of the same
# permutation, so the two index ranges cannot overlap.
shuffled = tokenized.shuffle(seed=42)
small_train_dataset = shuffled.select(range(train_size))
test_dataset = shuffled.select(range(train_size, train_size + test_size))

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=32)
# Keep evaluation order fixed: predictions are scored against the other rows
# of the same batch, so a reshuffled test loader would hand every evaluated
# model different batches and make their scores incomparable.
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=32)

In [19]:
# Number of batches the test loader yields (it was built with batch_size=32).
len(test_dataloader)

215

#### NT-Xent Loss 

In [3]:
import torch
import torch.nn as nn

def contrastive_loss(embeddings, temperature=0.1, train=True):
    """In-batch contrastive (NT-Xent style) objective over sentence embeddings.

    Every anchor is compared by cosine similarity with the positives of all
    samples in the batch (and with all hard negatives when present). The
    correct match for anchor ``i`` is positive ``i``, i.e. the diagonal of the
    anchor/positive similarity matrix.

    Args:
        embeddings: tensor indexed as [batch, sentence, hidden]. Sentence 0 is
            the anchor, 1 the positive and, optionally, 2 a hard negative.
        temperature: divisor applied to the similarities before the softmax.
        train: if True return the loss; otherwise return predictions.

    Returns:
        When ``train`` is True, the scalar cross-entropy loss. Otherwise a
        tuple ``(predicted, target)`` of index tensors of length ``batch``.

    Raises:
        ValueError: if the sentence dimension is not 2 or 3.
    """
    sents_per_vector = embeddings.size(1)

    if sents_per_vector not in (2, 3):
        raise ValueError("Unexpected number of sentences per sample received. Expected: 2/3")

    cos_sim = nn.CosineSimilarity(dim=-1)

    # Reshape to 3D so broadcasting yields all anchor/positive combinations.
    anchor = embeddings[:, 0].unsqueeze(1)
    positive = embeddings[:, 1].unsqueeze(0)

    # Pairwise cosine similarity, shape = [batch_size, batch_size]
    pairwise_sim = cos_sim(anchor, positive)

    # Index of the positive for each anchor (matrix diagonal). Created on the
    # same device as the similarities so the loss also works off-CPU.
    target = torch.arange(pairwise_sim.size(0), device=pairwise_sim.device)

    # Horizontally concatenate hard-negative similarities (if any); the
    # targets still point into the first batch_size columns.
    if sents_per_vector == 3:
        hard_neg = embeddings[:, 2].unsqueeze(0)
        hard_neg_sim = cos_sim(anchor, hard_neg)
        pairwise_sim = torch.cat([pairwise_sim, hard_neg_sim], 1)

    # Out-of-place scaling: avoids modifying a tensor that is part of the
    # autograd graph.
    pairwise_sim = pairwise_sim / temperature

    if train:
        return nn.CrossEntropyLoss()(pairwise_sim, target)

    predicted = torch.argmax(pairwise_sim, dim=1)
    return predicted, target

In [4]:
from torch.optim import AdamW
from transformers import RobertaModel

def encoder(batch, model):
    """Encode every sentence of a batch and return one vector per sentence.

    Args:
        batch: mapping with ``input_ids`` and ``attention_mask`` tensors, each
            indexed as [batch, sentence, token].
        model: callable accepting ``input_ids``/``attention_mask`` keyword
            arguments and returning an object with ``last_hidden_state``.

    Returns:
        Tensor indexed as [batch, sentence, hidden] holding the hidden state
        at token position 0 of every sentence.
    """
    ids, mask = batch['input_ids'], batch['attention_mask']
    n_samples, n_sents, seq_len = ids.shape

    # Collapse the sample/sentence dims so the model sees one flat batch.
    flat_ids = torch.reshape(ids, (-1, seq_len))
    flat_mask = torch.reshape(mask, (-1, seq_len))

    # Original author's note: augmentation comes from the model's own dropout;
    # an additional dropout layer here is an open idea.
    hidden = model(input_ids=flat_ids, attention_mask=flat_mask).last_hidden_state

    # First-token ([CLS]-style) representation of each sentence.
    first_token = hidden[:, 0]

    # Restore the per-sample nesting.
    return torch.reshape(first_token, (n_samples, n_sents, -1))

In [None]:
# Fine-tune roberta-base with the contrastive objective.
# Training mode is set explicitly before the loop starts.
model = RobertaModel.from_pretrained('roberta-base')
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2

for epoch in range(epochs):
    # Running sum of batch losses, reset at the start of every epoch.
    total_loss = 0
    # Each batch maps a feature name to a tensor that `encoder` unpacks as
    # [batch_size, sentences_per_sample, tensor_length].
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()

        embeddings = encoder(batch, model)
        loss = contrastive_loss(embeddings)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

        # Report the running mean loss of this epoch every 10 batches.
        if i % 10 == 0:
            print(f"Batch: {i+1}/{len(train_dataloader)}, Loss: {total_loss/(i+1)}")
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}")

# Save tokenizer and fine-tuned weights side by side so the directory can be
# loaded again by path.
save_directory = './pretrained'  # Specify your save directory
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [11]:
from datasets import load_metric

def evaluate(data_loader, model, max_batches=6, verbose=True):
    """Score in-batch positive retrieval with macro F1.

    For every batch the model's embeddings are passed to ``contrastive_loss``
    in prediction mode, which returns, per anchor, the index of the most
    similar candidate and the index of its true positive. These index pairs
    are accumulated into an F1 metric.

    Args:
        data_loader: iterable of batches accepted by ``encoder``.
        model: encoder model; switched to eval mode by this function.
        max_batches: number of leading batches to score. Defaults to 6, the
            amount previously hard-coded; ``None`` scores the whole loader.
        verbose: if True, print each batch's predictions and targets.

    Returns:
        The result of the metric's ``compute(average='macro')`` call.
    """
    # Local import keeps this block self-contained.
    from itertools import islice

    model.eval()

    f1_metric = load_metric('f1')

    # islice stops after `max_batches` without fetching an extra batch.
    for batch in islice(data_loader, max_batches):
        # Inference only: no gradients are needed for scoring.
        with torch.no_grad():
            outputs = encoder(batch, model)
            y_pred_batch, y_batch = contrastive_loss(outputs, train=False)

        if verbose:
            print(y_pred_batch, y_batch)

        f1_metric.add_batch(predictions=y_pred_batch, references=y_batch)

    return f1_metric.compute(average='macro')

In [12]:
# Compare the stock checkpoint with the weights saved to './pretrained'.
model_names = ['roberta-base', './pretrained']

# NOTE: each iteration rebinds the notebook-level `model` name.
for name in model_names:
    model = RobertaModel.from_pretrained(name)
    f1 = evaluate(test_dataloader, model)
    print(f'f1 for {name}: {f1}')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


tensor([ 0,  1,  2,  8, 15,  5,  6,  7, 24,  6, 10, 11,  7, 13, 23, 15, 23, 17,
        10, 19, 20, 21,  3, 20, 24, 25,  6, 27, 15, 29,  8, 31]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([ 0,  1,  2,  3,  4, 18, 16,  7, 15,  9, 10, 11, 12,  3, 14, 15, 16, 17,
        18, 19,  4,  4, 22, 23, 24, 25, 26,  6, 28, 29, 16, 18]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([ 0, 12,  2,  0,  4,  5,  6,  7,  5, 30, 20, 11, 12, 13, 12, 15, 16, 17,
        18, 17, 28, 21, 22, 23, 24, 25, 26, 27, 28, 18, 30, 31]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([ 0,  1, 18, 13,  4,  5,  6,  7,  8, 28, 10, 18, 12, 13, 14, 15,  5, 17,
        18, 19, 30, 21, 22, 23, 24, 25, 26, 27, 28

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


tensor([ 0,  1,  2, 20,  4,  5,  6,  7,  7,  9, 10, 11, 12,  9,  0, 15,  4, 17,
        18, 18, 20,  8, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([ 0,  1,  2,  3, 27,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         8, 19, 20, 21, 22, 23, 24, 25,  3, 27,  3, 29, 30, 31]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([25,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 21, 16, 17,
        16, 19, 20, 21, 22, 17, 24, 25, 17, 27, 28, 29, 14, 16]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 31,  8, 11, 12, 13, 14, 15,  0, 17,
        18, 19, 20, 21, 22, 23, 24,  8, 26, 27, 28

In [7]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the tokenizer and model
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Prepare the input text
# The same sentence is tokenized twice, so both rows have the same length.
input_text = "Hello, world!"
texts = [input_text, input_text]
inputs = tokenizer(texts, return_tensors="pt")

# Encode the input
# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Extract the sentence embedding
# Takes the hidden state at token position 0 for every row of the batch.
sentence_embedding = outputs.last_hidden_state[:, 0]

print(sentence_embedding)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-5.6807e-02,  8.1177e-02, -1.0582e-02, -1.4226e-01,  5.8871e-02,
         -1.1665e-01, -1.6456e-02,  2.3507e-02,  7.6793e-02, -3.8536e-02,
         -2.0918e-02,  4.6355e-02,  3.7463e-02, -5.1474e-02,  6.1221e-02,
          2.5821e-02, -9.9584e-02, -1.1229e-02,  9.0533e-04, -3.7513e-02,
         -9.6354e-02,  6.0888e-02, -4.3323e-02,  1.0892e-01, -1.0954e-02,
          4.6057e-02,  8.5546e-02,  6.1048e-02, -6.7207e-02, -1.8888e-03,
         -2.4986e-02, -4.2359e-02,  5.6012e-02, -5.0210e-02,  3.4153e-02,
          8.0668e-02,  3.2120e-02, -1.4985e-02, -9.2031e-02, -6.0763e-03,
          2.5994e-03,  5.9213e-02, -5.8606e-03, -3.4089e-03,  6.7560e-02,
          3.3174e-03,  1.6809e-02,  5.4562e-02, -3.3795e-02,  3.0461e-02,
         -2.6839e-03,  9.2151e-02, -3.4292e-02,  1.3671e-02, -8.9881e-02,
          1.9958e-02, -1.5046e-03,  1.0379e-01,  7.3402e-02, -4.9641e-02,
         -5.9235e-03, -1.0554e-01, -1.1929e-01, -4.0894e-02,  8.7989e-03,
         -1.3469e-02, -4.9082e-02,  2.

In [8]:
# One position-0 vector per entry of `texts`: first dim is the number of texts.
sentence_embedding.shape

torch.Size([1, 768])