In [2]:
import random
import itertools
import pandas as pd
import numpy as np
import math

from math import sqrt as msqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import torch
import torch.functional as F
from torch import nn
from torch.optim import Adadelta
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm

# BERT 

In [3]:
# ---- Sequence and vocabulary limits ----
# Maximum sequence length: two extents of up to 10 tokens plus [CLS] and two [SEP].
max_len = 10 * 2 + 3
# Number of tokens (objects or attributes).
max_vocab = 166
# Maximum number of masked tokens per sequence.
max_pred = 4

# ---- Transformer dimensions ----
# Dimension of keys and values; queries use the same dimension as keys.
d_k = d_v = 32
# Embedding dimension (equals n_heads * d_k).
d_model = 224
# Hidden dimension of the feed-forward layers.
d_ff = d_model * 4

# Number of attention heads.
n_heads = 7
# Number of encoder layers.
n_layers = 7
# Number of input sentences (segments).
n_segs = 2

p_dropout = .1

# Masking policy for a chosen token:
# 80% replaced by [MASK], 10% replaced by a random token, 10% left unchanged.
p_mask = .8
p_replace = .1
p_do_nothing = 1 - p_mask - p_replace

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

$$
\displaylines{
\operatorname{GELU}(x)=x P(X \leq x)= x \Phi(x)=x \cdot \frac{1}{2}[1+\operatorname{erf}(x / \sqrt{2})] \\
 \text{or} \\
0.5 x\left(1+\tanh \left[\sqrt{2 / \pi}\left( x+ 0.044715 x^{3}\right)\right]\right)
}
$$

In [4]:
def gelu(x):
    '''
    Gaussian Error Linear Unit, exact (erf-based) form.

    Two equivalent ways to implement GELU:
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))   (tanh approximation)
    or
    0.5 * x * (1. + torch.erf(x / math.sqrt(2)))                                            (exact, used here)
    '''
    erf_term = torch.erf(x / msqrt(2.))
    return .5 * x * (1. + erf_term)

# Build a boolean mask that marks the padding tokens of a batch of sequences.
def get_pad_mask(tokens, pad_idx=0):
    '''
    Assumes the padding index is zero unless pad_idx says otherwise.

    tokens: [batch, seq_len]
    returns: [batch, seq_len, seq_len] boolean tensor, True wherever the
             key position holds a padding token and False elsewhere.
    '''
    n_batch, n_tokens = tokens.size()
    # [batch, seq_len] -> [batch, 1, seq_len]: one row of key flags per sequence
    is_pad = (tokens.data == pad_idx)[:, None, :]
    # Repeat that row for every query position (expand creates a view, no copy).
    return is_pad.expand(n_batch, n_tokens, n_tokens)

In [5]:
# Turn input token indices into dense vectors before they reach the encoder.
class Embeddings(nn.Module):
    """Word + segment embeddings followed by LayerNorm and dropout."""

    def __init__(self):
        super().__init__()
        self.seg_emb = nn.Embedding(n_segs, d_model)
        # Converts token indices into embedding vectors.
        # max_vocab can be replaced by formal context object vectors or attribute vectors.
        self.word_emb = nn.Embedding(max_vocab, d_model)
        # Still registered as a parameter although forward() does not use it at the moment.
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, seg):
        '''
        x:   [batch, seq_len] token indices
        seg: segment indices, looked up in seg_emb
        returns: [batch, seq_len, d_model]
        '''
        token_vecs = self.word_emb(x)

        # Positional embedding is disabled (maybe it can be deleted):
        # pos = torch.arange(x.shape[1], dtype=torch.long, device=device) # .long: round down
        # pos = pos.unsqueeze(0).expand_as(x) # the shape is [1, seq_len]
        # pos_enc = self.pos_emb(pos)

        segment_vecs = self.seg_emb(seg)
        normed = self.norm(token_vecs + segment_vecs)
        return self.dropout(normed)

$$
\operatorname{Attention}(Q, K, V) = \operatorname{softmax}(\frac{QK^T}{\sqrt{d_k}})V
$$

$$
\begin{aligned}
\operatorname{MultiHead}(Q, K, V) &= \operatorname{Concat}(\text{head}_1, \text{head}_2, \dots, \text{head}_h)W^O \\
\text{where } \text{head}_i &= \operatorname{Attention}(QW^Q_i, KW^K_i, VW^V_i)
\end{aligned}
$$

In [6]:
class ScaledDotProductAttention(nn.Module):
    """softmax(Q K^T / sqrt(d_k)) V, with masked positions suppressed."""

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, attn_mask):
        # The 1/sqrt(d_k) scaling is applied to the transposed keys before the product.
        scaled_keys_t = K.transpose(-1, -2) / msqrt(d_k)
        # scores: [batch, n_heads, seq_len, seq_len]
        scores = torch.matmul(Q, scaled_keys_t)
        # Positions where attn_mask is True get a very large negative score (-1e9),
        # so they receive (almost) no weight after the softmax.
        scores.masked_fill_(attn_mask, -1e9)
        weights = torch.softmax(scores, dim=-1)
        # result: [batch, n_heads, seq_len, d_v]
        return torch.matmul(weights, V)

class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project back."""

    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)

    def forward(self, Q, K, V, attn_mask):
        '''
        Q, K, V: [batch, seq_len, d_model]
        attn_mask: [batch, seq_len, seq_len]
        returns: [batch, seq_len, d_model]
        '''
        n_batch = Q.size(0)

        def split_heads(projected, head_dim):
            # [batch, seq_len, n_heads * head_dim] -> [batch, n_heads, seq_len, head_dim]
            return projected.view(n_batch, -1, n_heads, head_dim).transpose(1, 2)

        per_Q = split_heads(self.W_Q(Q), d_k)
        per_K = split_heads(self.W_K(K), d_k)
        per_V = split_heads(self.W_V(V), d_v)

        # Give every head its own copy of the mask: [batch, n_heads, seq_len, seq_len]
        head_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        # context: [batch, n_heads, seq_len, d_v]
        attention = ScaledDotProductAttention()
        context = attention(per_Q, per_K, per_V, head_mask)

        # Concatenate the heads again: [batch, seq_len, n_heads * d_v]
        merged = context.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v)
        return self.fc(merged)

$$\operatorname{FFN}(x)=\operatorname{GELU}(xW_1+b_1)W_2+b_2$$

In [7]:
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: d_model -> d_ff -> d_model."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p_dropout)
        self.gelu = gelu

    def forward(self, x):
        # NOTE(review): dropout runs on the fc1 output, i.e. before GELU —
        # confirm this ordering is intended.
        hidden = self.dropout(self.fc1(x))
        return self.fc2(self.gelu(hidden))

In [8]:
# Encoder
# pre-LN is easier to train than post-LN, but when fully trained, post-LN gives better results than pre-LN.

class EncoderLayer(nn.Module):
    """One pre-norm encoder layer: self-attention and feed-forward, each with a residual."""

    def __init__(self):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.enc_attn = MultiHeadAttention()
        self.ffn = FeedForwardNetwork()

    def forward(self, x, pad_mask):
        '''
        pre-norm
        see more detail in https://openreview.net/pdf?id=B1x8anVFPr

        x: [batch, seq_len, d_model]
        '''
        # Sub-layer 1: normalize, self-attend, add the un-normalized input back.
        normed = self.norm1(x)
        attended = self.enc_attn(normed, normed, normed, pad_mask) + x

        # Sub-layer 2: normalize, feed-forward, add the attention output back.
        transformed = self.ffn(self.norm2(attended))
        return transformed + attended

In [9]:
# next sentence prediction
# pooled representation of the entire sequence as the [CLS] token representation.
'''
The fully connected linear layer improves the result while making the model harder to train.
'''
class Pooler(nn.Module):
    """Linear layer followed by tanh, applied to a single token representation."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(d_model, d_model)
        self.tanh = nn.Tanh()

    def forward(self, x):
        '''
        x: [batch, d_model] (first place output)
        '''
        return self.tanh(self.fc(x))

In [10]:
class BERT(nn.Module):
    """Encoder stack with a next-sentence head and a masked-language-model head."""

    def __init__(self, n_layers):
        super().__init__()
        self.embedding = Embeddings()
        encoder_stack = [EncoderLayer() for _ in range(n_layers)]
        self.encoders = nn.ModuleList(encoder_stack)

        self.pooler = Pooler()

        # Next sentence prediction: two output classes (0 or 1).
        self.next_cls = nn.Linear(d_model, 2)
        self.gelu = gelu

        # Weight sharing between fully connected layers, which makes training easier:
        # the MLM transform reuses the pooler's weight matrix (it keeps its own bias) ...
        self.fc = nn.Linear(d_model, d_model)
        self.fc.weight = self.pooler.fc.weight

        # ... and the output classifier reuses the word embedding matrix.
        self.word_classifier = nn.Linear(d_model, max_vocab, bias=False)
        self.word_classifier.weight = self.embedding.word_emb.weight

    def forward(self, tokens, segments, masked_pos):
        '''
        tokens, segments: [batch, seq_len]
        masked_pos: [batch, max_pred]
        returns: (logits_cls [batch, 2],
                  logits_lm [batch, max_pred, max_vocab],
                  hidden_pool [batch, d_model])
        '''
        hidden = self.embedding(tokens, segments)
        pad_mask = get_pad_mask(tokens)
        for encoder in self.encoders:
            hidden = encoder(hidden, pad_mask)
        # hidden: [batch, max_len, d_model]

        # NSP task: take the [CLS] (first) position, pool it, classify it.
        cls_state = hidden[:, 0]
        hidden_pool = self.pooler(cls_state)
        logits_cls = self.next_cls(hidden_pool)

        # MLM task: pick the representations at the masked positions,
        # pass them through the fully connected layer and GELU,
        # then score every vocabulary entry with the word classifier.
        # masked_pos: [batch, max_pred] -> [batch, max_pred, d_model]
        gather_index = masked_pos.unsqueeze(-1).expand(-1, -1, d_model)
        h_masked = torch.gather(hidden, dim=1, index=gather_index)
        h_masked = self.gelu(self.fc(h_masked))
        logits_lm = self.word_classifier(h_masked)

        return logits_cls, logits_lm, hidden_pool

# Data Preparation


In [12]:
def process_concepts_from_file(filename, vocab_size=None):
    """Read concept extents from a text file and convert them to token ids.

    Each line is split on four consecutive spaces; the second field is taken
    as the extent, a whitespace-separated list of integer object names.
    Lines without that separator are skipped.

    Id layout:
      * 0                      -> '[PAD]' (the value padding() and get_pad_mask() use)
      * 1 .. n_objects         -> objects, in ascending numeric order
      * vocab_size - 3 .. - 1  -> '[CLS]', '[SEP]', '[MASK]'

    Args:
        filename: path of the concept file (read as UTF-8).
        vocab_size: total number of ids; defaults to the module-level max_vocab.

    Returns:
        (extent_token_list, object2idx, idx2object) where extent_token_list
        holds one list of ids per extent.

    Raises:
        ValueError: if the objects do not fit below the special tokens.
    """
    if vocab_size is None:
        vocab_size = max_vocab

    extents = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Split the line on four blank spaces; the extent is the second part.
            parts = line.split('    ')
            if len(parts) >= 2:
                extents.append(parts[1].strip())

    object_ids = sorted({int(tok) for extent in extents for tok in extent.split()})
    print(len(object_ids))

    # Ids 1..n are objects, so the largest object id must stay below '[CLS]'.
    if len(object_ids) > vocab_size - 4:
        raise ValueError(
            f'{len(object_ids)} objects do not fit into a vocabulary of size '
            f'{vocab_size} with 4 special tokens'
        )

    object2idx = {str(obj): idx + 1 for idx, obj in enumerate(object_ids)}

    # '[PAD]' used to be vocab_size - 4, which is also the id of the last object
    # when the vocabulary is full; 0 is the value actually used for padding.
    object2idx.update({
        '[PAD]': 0,
        '[CLS]': vocab_size - 3,
        '[SEP]': vocab_size - 2,
        '[MASK]': vocab_size - 1,
    })

    idx2object = {idx: name for name, idx in object2idx.items()}

    extent_token_list = [
        [object2idx[s] for s in extent.split()]
        for extent in extents
    ]

    return extent_token_list, object2idx, idx2object

# Tokenize the extents of the 'before2015' file and of the 'all' file;
# each call builds and returns its own object<->index mappings.
extent_token_list, object2idx, idx2object = process_concepts_from_file('paper-keywords-before2015_concepts.txt')
extent_token_list_new, object2idx2, idx2object2 = process_concepts_from_file('paper-keywords-all_concepts.txt')

162
162


In [13]:
# Length of the longest tokenized extent.
maxlen = 0
for extent in extent_token_list:
    if len(extent) > maxlen:
        maxlen = len(extent)
print(maxlen)

10


In [14]:
# Pad token lists so that they all have the same length.
def padding(ids, n_pads, pad_symb=0):
    """Append n_pads copies of pad_symb to ids in place and return ids.

    The list is modified in place, so callers that ignore the return value
    keep working; previously the function returned None because it returned
    the result of list.extend. A non-positive n_pads leaves ids unchanged.
    """
    ids.extend([pad_symb for _ in range(n_pads)])
    return ids

def masking_procedure(cand_pos, input_ids, masked_symb='[MASK]',
                      mask_prob=None, replace_prob=None, max_object_idx=None):
    """Apply the masking policy to the chosen positions of input_ids, in place.

    For every position in cand_pos the original token is recorded, then ONE
    random number decides what happens to it:
      * with probability mask_prob it is replaced by masked_symb,
      * with probability replace_prob it is replaced by a random object id
        drawn from 1..max_object_idx (0 is excluded because it is the
        padding value),
      * otherwise it is left unchanged.

    Previously the second branch drew a fresh random number and compared it
    with `> p_mask + p_replace`, so random replacement happened in about 2%
    of the cases instead of 10%.

    Args:
        cand_pos: positions to process.
        input_ids: token id list, modified in place.
        masked_symb: value written for a masked token.
        mask_prob: defaults to the module-level p_mask.
        replace_prob: defaults to the module-level p_replace.
        max_object_idx: largest id eligible as a random replacement;
            defaults to max_vocab - 4.

    Returns:
        (masked_pos, masked_tokens): the processed positions and the tokens
        that were at those positions before any change.
    """
    if mask_prob is None:
        mask_prob = p_mask
    if replace_prob is None:
        replace_prob = p_replace
    if max_object_idx is None:
        max_object_idx = max_vocab - 4

    masked_pos = []
    masked_tokens = []
    for pos in cand_pos:
        masked_pos.append(pos)
        masked_tokens.append(input_ids[pos])

        draw = random.random()
        if draw < mask_prob:
            input_ids[pos] = masked_symb
        elif draw < mask_prob + replace_prob:
            input_ids[pos] = random.randint(1, max_object_idx)
        # else: keep the original token

    return masked_pos, masked_tokens

In [15]:
def get_neighbor_samples(extents, neg_prob=0.0018):
    """Build (i, j, is_neighbor) samples from a list of extents.

    Extent j counts as a neighbor of extent i when extent j is a subset of
    extent i and, among the subsets of i collected so far, no other one lies
    between them (the loop keeps only the largest subsets of i).

    For every i, each neighbor j yields [i, j, True]; every other j
    (including j == i) yields [i, j, False] with probability neg_prob.

    Args:
        extents: list of token lists.
        neg_prob: sampling probability for a negative pair.

    Returns:
        list of [i, j, bool] samples.
    """
    n = len(extents)
    # Build each set once instead of once per compared pair.
    extent_sets = [set(extent) for extent in extents]

    # dep[a][b] == 1  <=>  extent a is a subset of extent b
    dep = np.zeros(shape=(n, n), dtype=np.int32)
    for i in range(n):
        for j in range(i + 1, n):
            if extent_sets[i].issubset(extent_sets[j]):
                dep[i][j] = 1
            if extent_sets[j].issubset(extent_sets[i]):
                dep[j][i] = 1

    samples = []
    for i in range(n):
        covers = set()
        for j in range(n):
            if j == i or dep[j][i] != 1:
                continue

            handled = False
            # Iterate over a snapshot because `covers` changes inside the loop.
            for k in list(covers):
                if dep[k][j]:
                    # k is contained in j: j replaces k as the closer subset.
                    covers.remove(k)
                    covers.add(j)
                    handled = True
                if dep[j][k]:
                    # j is contained in an already kept subset: not a neighbor.
                    handled = True
            if not handled:
                covers.add(j)

        for j in range(n):
            if j in covers:
                samples.append([i, j, True])
            elif random.random() < neg_prob:
                samples.append([i, j, False])

    return samples

# Build the (i, j, is_neighbor) pairs from the tokenized extents; they feed make_data() for pre-training.
all_samples = get_neighbor_samples(extent_token_list)
print(len(all_samples))

8153


In [21]:
def make_data(extents, all_samples, word2idx, n_data, num_per_sample = 120):
    """Turn (i, j, label) samples into padded pre-training examples.

    For each sample the two extents are joined as
    [CLS] extent_i [SEP] extent_j [SEP], a few randomly chosen non-special
    positions are passed to masking_procedure, and all lists are padded with
    zeros to a common length.

    Args:
        extents: list of token-id lists; the sample indices refer to it.
        all_samples: iterable of [i, j, label].
        word2idx: mapping that contains '[CLS]', '[SEP]' and '[MASK]'.
        n_data: accepted for compatibility, not used.
        num_per_sample: accepted for compatibility, not used.

    Returns:
        shuffled list of
        [input_ids, segment_ids, masked_tokens, masked_pos, label].
    """
    cls_id = word2idx['[CLS]']
    sep_id = word2idx['[SEP]']
    mask_id = word2idx['[MASK]']

    # Two extents plus [CLS] and two [SEP].
    max_len = max((len(extent) for extent in extents), default=0) * 2 + 3

    batch_data = []
    for sample in all_samples:
        # Use the `extents` argument rather than a notebook-level global.
        tokens_a = extents[sample[0]]
        tokens_b = extents[sample[1]]

        input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (1 + len(tokens_b))

        # Number of positions to mask, based on the input sequence length.
        n_pred = min(max_pred, max(1, int(len(input_ids) * .15)))
        # Special tokens are never masked.
        cand_pos = [i for i, token in enumerate(input_ids)
                    if token != cls_id and token != sep_id]

        # Shuffle the candidates so the first n_pred are a random selection;
        # without this the same leading positions were masked every time.
        random.shuffle(cand_pos)
        masked_pos, masked_tokens = masking_procedure(
            cand_pos[:n_pred], input_ids, mask_id)

        # Zero padding so every sequence has length max_len.
        padding(input_ids, max_len - len(input_ids))
        padding(segment_ids, max_len - len(segment_ids))

        # Zero padding for the mask lists. The amount is derived from the
        # actual number of masked positions, which can be below n_pred when
        # there are fewer candidates.
        n_pads = max_pred - len(masked_pos)
        if n_pads > 0:
            padding(masked_pos, n_pads)
            padding(masked_tokens, n_pads)

        batch_data.append(
            [input_ids, segment_ids, masked_tokens, masked_pos, sample[2]])

    random.shuffle(batch_data)
    print(len(batch_data))
    return batch_data


class BERTDataset(Dataset):
    """Index-aligned view over the five pre-training fields."""

    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, is_next):
        super().__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens
        self.masked_pos = masked_pos
        self.is_next = is_next

    def __len__(self):
        # All fields are expected to have one entry per example.
        return len(self.input_ids)

    def __getitem__(self, index):
        fields = (
            self.input_ids,
            self.segment_ids,
            self.masked_tokens,
            self.masked_pos,
            self.is_next,
        )
        return tuple(field[index] for field in fields)

# Pre-Train BERT

In [22]:
# True: hold out 20% of the samples and run the NSP evaluation cell; False: pre-train on all samples.
DO_NSP_TEST = False

In [24]:
batch_size = 32 # must be an even number
# Learning rate passed to Adam for pre-training.
lr = 1.9e-5
# Number of passes over the pre-training data.
epochs = 1200

In [25]:
# Use the GPU when present; a hard-coded 'cuda' fails on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_samples, test_samples = [], []

if DO_NSP_TEST:
    train_samples, test_samples = train_test_split(all_samples, test_size=0.2, random_state=42)
else:
    train_samples = all_samples

batch_data = make_data(extent_token_list, train_samples, object2idx, n_data=len(all_samples))

# One LongTensor per field: input_ids, segment_ids, masked_tokens, masked_pos, is_next.
batch_tensor = [torch.LongTensor(ele) for ele in zip(*batch_data)]
dataset = BERTDataset(*batch_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = BERT(n_layers)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)
model.to(device)

print('Entering training process...')

for epoch in range(epochs):
    for one_batch in dataloader:
        input_ids, segment_ids, masked_tokens, masked_pos, is_next = [ele.to(device) for ele in one_batch]

        logits_cls, logits_lm, _ = model(input_ids, segment_ids, masked_pos)

        # Joint objective: neighbor classification plus masked-token prediction.
        # CrossEntropyLoss already returns the batch mean as a scalar.
        loss_cls = criterion(logits_cls, is_next)
        loss_lm = criterion(logits_lm.view(-1, max_vocab), masked_tokens.view(-1))
        loss = loss_cls + loss_lm

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Save the model and report the last batch loss every 50 epochs.
    if epoch % 50 == 0:
        torch.save(model.state_dict(), 'oo_no_pos_pretrained.dat')
        print(f'Epoch:{epoch}\t loss: {loss:.6f}')

7165
Entering training process...
Epoch:0	 loss: 1.861504
Epoch:50	 loss: 0.451407
Epoch:100	 loss: 0.153402
Epoch:150	 loss: 0.160886
Epoch:200	 loss: 0.059680
Epoch:250	 loss: 0.099437
Epoch:300	 loss: 0.082009
Epoch:350	 loss: 0.072126
Epoch:400	 loss: 0.141783
Epoch:450	 loss: 0.071086
Epoch:500	 loss: 0.091308
Epoch:550	 loss: 0.045794
Epoch:600	 loss: 0.111170
Epoch:650	 loss: 0.091328
Epoch:700	 loss: 0.117798
Epoch:750	 loss: 0.044200
Epoch:800	 loss: 0.112777
Epoch:850	 loss: 0.219594
Epoch:900	 loss: 0.071489
Epoch:950	 loss: 0.064486
Epoch:1000	 loss: 0.069704
Epoch:1050	 loss: 0.067912
Epoch:1100	 loss: 0.036912
Epoch:1150	 loss: 0.121193


# Neighboring Concept Prediction

In [120]:
# Ground-truth neighbor flags of the held-out pairs.
labels = [pair[2] for pair in test_samples]

num_true, num_false = labels.count(True), labels.count(False)

# Print the counts
print("Number of True:", num_true)
print("Number of False:", num_false)

# The same flags as 0/1 integers for the metric functions.
labels_mapping = {"True": 1, "False": 0}
labels_01 = [labels_mapping[str(flag)] for flag in labels]
# print(labels_01)

Number of True: 0
Number of False: 0


In [838]:
if DO_NSP_TEST:
    pretrained_model = BERT(n_layers)
    # map_location lets a checkpoint written on a GPU load on the current device.
    pretrained_model.load_state_dict(torch.load('oo_no_pos_pretrained.dat', map_location=device))
    pretrained_model.to(device)
    pretrained_model.eval()

    predictions = []       # hard 0/1 decisions
    positive_scores = []   # probability of class 1, used for the AUC

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for sample in test_samples:
            tokens_a = extent_token_list[sample[0]]
            tokens_b = extent_token_list[sample[1]]

            token_ids = [object2idx['[CLS]']] + tokens_a + [object2idx['[SEP]']] + tokens_b + [object2idx['[SEP]']]
            seg_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (1 + len(tokens_b))

            input_ids = torch.LongTensor(token_ids).unsqueeze(0).to(device)
            segment_ids = torch.LongTensor(seg_ids).unsqueeze(0).to(device)
            # The MLM output is ignored here; every gather index points at position 0.
            masked_pos = torch.zeros_like(input_ids)

            logits_cls, _, _ = pretrained_model(input_ids, segment_ids, masked_pos)
            predictions.append(int(logits_cls.argmax(dim=1).item()))
            positive_scores.append(torch.softmax(logits_cls, dim=1)[0, 1].item())

    # Calculate metrics
    accuracy = accuracy_score(labels_01, predictions)
    precision = precision_score(labels_01, predictions)
    recall = recall_score(labels_01, predictions)
    f1 = f1_score(labels_01, predictions)
    # AUC is a ranking metric, so it is computed from the class-1 probabilities
    # rather than from the thresholded predictions.
    roc_auc = roc_auc_score(labels_01, positive_scores)

    # Print metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("AUC Score:", roc_auc)
else:
    print('NSP TEST is disabled since DO_NSP_TEST is set to False.')

Accuracy: 0.861244019138756
Precision: 0.7713004484304933
Recall: 0.9608938547486033
F1 Score: 0.8557213930348258
AUC Score: 0.8737523667048457


# Fine-Tune

## Data Preparation

In [26]:
def get_true_permutes(extent_token_list, tup_len = 3):
    """Collect every 2..tup_len sized combination that occurs inside an extent.

    Each combination keeps the element order of its extent and is encoded as
    a space-separated string, right-padded with '0' entries up to tup_len.

    Returns:
        (true_permutes, dist): the set of encoded strings, and a float64
        array where dist[k] is the number of size-k combinations generated
        (counted before duplicates are merged by the set).
    """
    encoded = set()
    counts = [0] * (tup_len + 1)

    for extent in extent_token_list:
        for size in range(2, tup_len + 1):
            if len(extent) < size:
                # Extent too short for this size: contributes nothing.
                continue
            zero_tail = ['0'] * (tup_len - size)
            for combo in itertools.combinations(extent, size):
                encoded.add(' '.join([str(tok) for tok in combo] + zero_tail))
                counts[size] += 1

    return encoded, np.array(counts, dtype=np.float64)

def pad_negative_samples(object2idx, true_permutes, length_distribution, number):
    """Draw `number` random object tuples that do not match any true tuple.

    A tuple length is sampled from length_distribution (index = length),
    that many distinct objects are drawn, and the tuple is zero-padded to
    tup_len = len(length_distribution) - 1.

    A candidate is rejected when its set of objects equals the object set of
    any entry in true_permutes. The true tuples are stored in one fixed
    order while the candidates come out of random.sample in random order, so
    comparing the joined strings would accept re-orderings of true tuples as
    negatives.

    Args:
        object2idx: name -> id mapping; names containing '[' are skipped.
        true_permutes: set of space-separated id strings padded with '0'.
        length_distribution: probabilities per length; not modified.
        number: how many negative samples to return.

    Returns:
        list of (id_list, False) pairs.
    """
    lengths = np.arange(0, len(length_distribution))
    tup_len = len(length_distribution) - 1

    print(lengths)
    print(length_distribution)
    print(np.sum(length_distribution))

    # Work on a private float array so the caller's data stays untouched.
    probs = np.array(length_distribution, dtype=np.float64)
    # Absorb rounding error in the last entry so the probabilities sum to 1.
    probs[-1] = 1.0 - np.sum(probs[0:-1])
    probs /= np.sum(probs)

    print(probs)
    print(np.sum(probs))

    # Every id except the special tokens.
    object_list = [idx for obj, idx in object2idx.items() if '[' not in obj]

    # Order-independent view of the true tuples ('0' entries are padding).
    true_object_sets = {
        frozenset(tok for tok in perm.split(' ') if tok != '0')
        for perm in true_permutes
    }

    negative_samples = []
    while len(negative_samples) < number:
        length = int(np.random.choice(lengths, p=probs))

        objects = random.sample(object_list, length)
        if frozenset(str(obj) for obj in objects) in true_object_sets:
            continue

        tmp_list = objects + [0] * (tup_len - length)
        negative_samples.append((tmp_list, False))
    return negative_samples

def prepare_object_list_data(object2idx, extent_token_list, extent_token_list_new, tup_len = 3):
    """Build labelled train/test tuple sets from an old and a new extent list.

    Positive training tuples come from the old extents; positive test tuples
    are those present only in the new extents. The same number of negative
    tuples is generated and split between the two sets in proportion to
    their sizes, then both sets are shuffled.

    Returns:
        (train_samples, test_samples), each a list of (id_list, bool).
    """
    old_true_permutes, old_distribution = get_true_permutes(extent_token_list, tup_len)
    new_true_permutes, new_distribution = get_true_permutes(extent_token_list_new, tup_len)

    # Tuples, and tuple-length counts, that only appear in the new data.
    added_true_permutes = new_true_permutes - old_true_permutes
    length_delta = new_distribution - old_distribution
    added_distribution = length_delta / np.sum(length_delta)

    print(old_distribution)
    print(new_distribution)

    def decode(perm_str):
        # '3 7 0' -> [3, 7, 0]
        return [int(tok) for tok in perm_str.split(' ')]

    train_samples = [(decode(perm_str), True) for perm_str in old_true_permutes]
    test_samples = [(decode(perm_str), True) for perm_str in added_true_permutes]

    train_len, test_len = len(train_samples), len(test_samples)
    total_len = train_len + test_len

    # One negative per positive, rejected against all known true tuples.
    negative_samples = pad_negative_samples(
        object2idx, new_true_permutes, list(added_distribution), total_len)
    train_negative_samples, test_negative_samples = train_test_split(
        negative_samples, test_size=test_len / total_len, random_state=42)

    train_samples += train_negative_samples
    test_samples += test_negative_samples

    random.shuffle(train_samples)
    random.shuffle(test_samples)

    return train_samples, test_samples

# Length of the longest tokenized extent (printed for reference).
max_lenn = 0
for extent in extent_token_list:
    if len(extent) > max_lenn:
        max_lenn = len(extent)
print(max_lenn)

# Labelled tuples of up to 5 objects for fine-tuning and testing.
train_labeled_lists, test_labeled_lists = prepare_object_list_data(
    object2idx,
    extent_token_list,
    extent_token_list_new,
    tup_len=5,
)

10
[   0.    0. 4004. 2625. 1654.  905.]
[   0.    0. 5568. 3793. 2338. 1238.]
[0 1 2 3 4 5]
[0.0, 0.0, 0.4171779141104294, 0.31154974659909307, 0.18244865297412644, 0.08882368631635103]
1.0
[0.         0.         0.41717791 0.31154975 0.18244865 0.08882369]
1.0


In [27]:
# Check the share of 0 and 1 labels in both splits.
df = pd.DataFrame(test_labeled_lists, columns=['Pair', 'Label'])

# Share of each label in the test set.
ratio_zeros = df['Label'].eq(0).mean()
ratio_ones = df['Label'].eq(1).mean()

print(f"in test set ratio of 0s: {ratio_zeros:.3f}")
print(f"in test set ratio of 1s: {ratio_ones:.3f}")

df2 = pd.DataFrame(train_labeled_lists, columns=['Pair', 'Label'])

# Share of each label in the training set.
ratio_zero = df2['Label'].eq(0).mean()
ratio_one = df2['Label'].eq(1).mean()

print(f"in train set ratio of 0s: {ratio_zero:.3f}")
print(f"in train set ratio of 1s: {ratio_one:.3f}")

print(f'train set size {len(train_labeled_lists)}')
print(f'test set size {len(test_labeled_lists)}')

in test set ratio of 0s: 0.500
in test set ratio of 1s: 0.500
in train set ratio of 0s: 0.500
in train set ratio of 1s: 0.500
train set size 13360
test set size 3504


## Fine-Tune Model

##  MLP for classification task

In [28]:
# design a MLP for classification task
class MLP(nn.Module):
    """Binary classifier head on top of the pooled output of a BERT model."""

    def __init__(self, bert_model, embedding_size, hidden_size, output_size, dropout_rate = .1):
        super().__init__()

        self.bert = bert_model

        self.fc1 = nn.Linear(embedding_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs, segments, masked_poses):
        # Only the third value returned by the wrapped model is used.
        _cls_logits, _lm_logits, pooled = self.bert(inputs, segments, masked_poses)
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        # Sigmoid maps the final layer output into (0, 1).
        return self.sigmoid(self.fc2(hidden))
def prepare_data(pair_set):
    """Split (id_list, label) pairs into an input tensor and a label tensor."""
    pairs = list(pair_set)
    inputs = [lst for lst, _ in pairs]
    labels = [label for _, label in pairs]
    return torch.tensor(inputs), torch.tensor(labels)

In [32]:
# Set parameters
# input_size = 2 * d_model
hidden_size = 324
output_size = 1
learning_rate = 2e-5
num_epochs = 300
batch_size = 32

# Encoder that the classifier head is built on.
pretrained_model = BERT(n_layers)
# NOTE(review): the checkpoint load below is commented out, so the encoder starts
# from random initialization — confirm whether this is intended.
#pretrained_model.load_state_dict(torch.load('oo_no_pos_pretrained.dat'))
pretrained_model.train()
# pretrained_model.eval()
pretrained_model.to(device)

# Instantiate the model, loss function, and optimizer
# The optimizer receives MLP_model.parameters(), which includes the wrapped encoder's weights.
MLP_model = MLP(pretrained_model, d_model, hidden_size, output_size, dropout_rate=0.1)
criterion = nn.BCELoss()
optimizer = Adam(MLP_model.parameters(), lr=learning_rate)

# Move model to device
MLP_model = MLP_model.to(device)
MLP_model.train()

# Prepare the data
train_inputs, train_labels = prepare_data(train_labeled_lists)
test_inputs, test_labels = prepare_data(test_labeled_lists)

# Both splits are moved to the device as whole tensors.
train_inputs, train_labels = train_inputs.to(device), train_labels.to(device)
test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)

# Create DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [33]:
MLP_model.train()
MLP_model.bert.train()

for epoch in range(num_epochs):
    # Create tqdm progress bar
    pbar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', dynamic_ncols=True)

    for inputs, labels in pbar:
        optimizer.zero_grad()

        inputs, labels = inputs.to(device), labels.to(device)

        # Every token belongs to segment 0 and no position is masked. The zero
        # tensors are created directly on the device instead of being built
        # element by element from Python lists.
        segments = torch.zeros_like(inputs)
        masked_poses = torch.zeros(inputs.size(0), max_pred, dtype=torch.long, device=inputs.device)

        outputs = MLP_model(inputs, segments, masked_poses)
        # BCELoss needs float targets with the same [batch, 1] shape as the outputs.
        loss = criterion(outputs, labels.unsqueeze(1).float())
        loss.backward()
        optimizer.step()

        # Update tqdm with the current loss
        pbar.set_postfix(loss=loss.item())


Epoch 1/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 63.51it/s, loss=0.524]
Epoch 2/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 67.94it/s, loss=0.195]
Epoch 3/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 60.68it/s, loss=0.153]
Epoch 4/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 63.03it/s, loss=0.326]
Epoch 5/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 66.48it/s, loss=0.103]
Epoch 6/300: 100%|████████████████████████████████████████████████████████| 418/418 [00:05<00:00, 69.97it/s, loss=0.39]
Epoch 7/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 63.59it/s, loss=0.213]
Epoch 8/300: 100%|███████████████████████████████████████████████████████| 418/418 [00:06<00:00, 63.68it/s, loss=0.113]
Epoch 9/300: 100%|██████████████████████

In [46]:
# Switch to inference mode so dropout does not affect the reported training metrics.
MLP_model.eval()
MLP_model.bert.eval()

# All tokens are in segment 0 and no position is masked.
segments = torch.zeros_like(train_inputs)
masked_poses = torch.zeros(train_inputs.size(0), max_pred, dtype=torch.long, device=train_inputs.device)

with torch.no_grad():
    train_outputs = MLP_model(train_inputs, segments, masked_poses)
    predictions = (train_outputs > 0.5).float().cpu().numpy()
    train_labels_numpy = train_labels.cpu().numpy()

# Convert predictions to binary (0 or 1)
predictions_binary = (predictions > 0.5).astype(int)

# Compute metrics (AUC uses the raw sigmoid outputs, the others the 0/1 decisions)
accuracy = accuracy_score(train_labels_numpy, predictions_binary)
precision = precision_score(train_labels_numpy, predictions_binary)
recall = recall_score(train_labels_numpy, predictions_binary)
f1 = f1_score(train_labels_numpy, predictions_binary)
auc = roc_auc_score(train_labels_numpy, train_outputs.cpu().numpy())

# Print the results
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'AUC: {auc:.3f}')

Accuracy: 1.000
Precision: 0.999
Recall: 1.000
F1 Score: 1.000
AUC: 1.000


In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score

MLP_model.eval()
MLP_model.bert.eval()

# Decision threshold applied to the sigmoid outputs.
# NOTE(review): the train-set evaluation thresholds at 0.5 — confirm that 0.01
# is intended here and was not tuned on the test set.
decision_threshold = 0.01

# All tokens are in segment 0 and no position is masked.
segments = torch.zeros_like(test_inputs)
masked_poses = torch.zeros(test_inputs.size(0), max_pred, dtype=torch.long, device=test_inputs.device)

# Evaluate the model on the test set
with torch.no_grad():
    test_outputs = MLP_model(test_inputs, segments, masked_poses)
    predictions = (test_outputs > decision_threshold).float().cpu().numpy()
    test_labels_numpy = test_labels.cpu().numpy()

# Convert predictions to binary (0 or 1)
predictions_binary = (predictions > decision_threshold).astype(int)

# Compute metrics (AUC and AUPR use the raw sigmoid outputs, the others the 0/1 decisions)
accuracy = accuracy_score(test_labels_numpy, predictions_binary)
precision = precision_score(test_labels_numpy, predictions_binary)
recall = recall_score(test_labels_numpy, predictions_binary)
f1 = f1_score(test_labels_numpy, predictions_binary)
auc = roc_auc_score(test_labels_numpy, test_outputs.cpu().numpy())
aupr = average_precision_score(test_labels_numpy, test_outputs.cpu().numpy())

# Print the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'AUC: {auc:.4f}')
print(f'AUPR: {aupr:.4f}')

Accuracy: 0.7654
Precision: 0.8455
Recall: 0.6495
F1 Score: 0.7347
AUC: 0.8663
AUPR: 0.8578
