# Import packages:

In [1]:
from openbiolink.obl2021 import OBL2021Dataset
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
import torch.utils.data
import torch
import math

D:\python38\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
D:\python38\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


# Set Manual Seed:

In [2]:
torch.manual_seed(2022)
torch.cuda.manual_seed_all(2022)

# Accessing Data:

In [3]:
obl_dataset = OBL2021Dataset()

Dataset found in D:\programming repo-1\university of glasgow\msc project\MSc.Project\obl2021, omitting download...


In [4]:
# get the training, validation and test sets

#obl_train_dataset = obl_dataset.training
#obl_valid_dataset = obl_dataset.validation
#obl_test_dataset = obl_dataset.testing

In [5]:
#checking out the tensor shape
print('The shape of the training tensor is : ', obl_dataset.training.shape)
print('The shape of the validation tensor is : ', obl_dataset.validation.shape)
print('The shape of the testing tensor is : ', obl_dataset.testing.shape)

#checking out the tensor dtype
print('The data type of the training tensor is : ', obl_dataset.training.dtype)
print('The data type of the validation tensor is : ', obl_dataset.validation.dtype)
print('The data type of the testing tensor is : ', obl_dataset.testing.dtype)

torch.max(obl_dataset.training[:,0]), torch.max(obl_dataset.training[:,2]), torch.max(obl_dataset.validation[:,0]), torch.max(obl_dataset.validation[:,2])
torch.min(obl_dataset.training[:,0]), torch.min(obl_dataset.training[:,2]), torch.min(obl_dataset.validation[:,0]), torch.min(obl_dataset.validation[:,2])

The shape of the training tensor is :  torch.Size([4192002, 3])
The shape of the validation tensor is :  torch.Size([186301, 3])
The shape of the testing tensor is :  torch.Size([180964, 3])
The data type of the training tensor is :  torch.int64
The data type of the validation tensor is :  torch.int64
The data type of the testing tensor is :  torch.int64


(tensor(1), tensor(0), tensor(42), tensor(2))

# Embedding Models:

## TransE:

In [6]:
class TransE(nn.Module):
    def __init__(self, device, num_entity, num_relation, emb_dim, gamma):
        super(TransE, self).__init__()
        self.device = device
        self.emb_dim = emb_dim
        self.num_entity = num_entity
        self.num_relation = num_relation
        
        #initialize entity embeddings
        self.entity_emb = self.initialize_emb(self.num_entity, self.emb_dim)

        #initialie relation embeddings
        self.relation_emb = self.initialize_emb(self.num_relation, self.emb_dim)
        self.relation_emb.weight.data.div_(self.relation_emb.weight.data.norm(p=2, dim=1, keepdim=True))
        #create the loss function
        self.loss_fn = nn.MarginRankingLoss(margin=gamma)
        

    def initialize_emb(self, num_emb, emb_dim):
        emb_weight_range = 6 / math.sqrt(emb_dim)
        emb = nn.Embedding(num_embeddings=num_emb, embedding_dim=emb_dim, device=self.device)
        emb.weight.data.uniform_( -emb_weight_range, emb_weight_range )
        return emb
    
    def forward(self, pos_triplet, neg_triplet):
        self.entity_emb.weight.data.div_(self.entity_emb.weight.data.norm(p=2, dim=1, keepdim=True))
        pos_distance = self.cal_distance(pos_triplet)
        neg_distance = self.cal_distance(neg_triplet)
        return self.loss_fn(pos_distance, neg_distance, torch.tensor([-1], dtype=torch.int64, device=self.device))
    
    def cal_distance(self, triplet):
        head = triplet[:,0]
        relation = triplet[:,1]
        tail = triplet[:,2]
        #print(head.shape)
        return (self.entity_emb(head) + self.relation_emb(relation) - self.entity_emb(tail)).norm(p=2, dim=1)
       
        

In [7]:
#test transe_model, remove later
#transe_model = TransE(device = 'cuda', num_entity = 100, num_relation = 10, emb_dim = 100, gamma = 0.01)

### Create Dataset:

In [8]:
'''
class Create_dataset(torch.utils.data.IterableDataset):
    def __init__(self, data):
        self.data = data
        self.size = data.size()
    
    def __iter__(self):
        return iter(self.data)
    
    def size(self):
        return self.size

obl_train_dataset = TensorDataset(obl_dataset.training)
obl_valid_dataset = TensorDataset(obl_dataset.validation)
obl_test_dataset = TensorDataset(obl_dataset.testing)

dataset = Create_dataset(obl_dataset.training)

print(type(obl_dataset.training.shape[0]))
print(obl_dataset.training.size())
'''

'''
count = 0
for i in dataset:
    print(i)
    count +=1
    if count == 10:
        break
'''

'\ncount = 0\nfor i in dataset:\n    print(i)\n    count +=1\n    if count == 10:\n        break\n'

### Create DataLoader:

In [9]:
def create_dataloader(dataset, batch_size, shuffle):
    tensor_dataset = TensorDataset(dataset)
    return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle) 

### Sample Corrupted Triplet:

In [10]:
def create_corr_triplet(num_entity, sample_data):
    corr_triplet = sample_data.clone().detach()
    head_or_tail = torch.randint( 0, 2, (1,))
    if head_or_tail == 0:
        corr_triplet[:,0] = torch.randint(0, num_entity, (sample_data.shape[0],))
    else:
        corr_triplet[:,2] = torch.randint(0, num_entity, (sample_data.shape[0],))
    return corr_triplet

### Training Function:

In [11]:
def train_transe(model, data_loader, optimizer, epoch, num_entity, device):
    for i in range(1, epoch+1):
        for index, batch_data in enumerate(data_loader):  
            #print(len(batch_data))
            #print(batch_data[0][0])
            #break
            sample_data = batch_data[0]
            corr_sample_data = create_corr_triplet(num_entity=num_entity, sample_data=sample_data)
            sample_data = sample_data.to(device)
            corr_sample_data = corr_sample_data.to(device)
            loss = model(sample_data, corr_sample_data)
            loss = loss.mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(i, 'epoch is done')
        print('Average Training loss is: ', loss)
        #evaluate_model()

### Evaluation Function:

In [12]:
def evaluate_model(model, num_entity, dataset, device):
    #dataset_score_tensor = torch.zeros(len(dataset), num_entity)
    tail_mr_score = torch.tensor(0.0, dtype=torch.float64)
    tail_mrr_score = torch.tensor(0.0, dtype=torch.float64)
    for triplet in dataset:
        original_tail = triplet[0][2]
        triplet = triplet[0].reshape(1,3).to(device)
        score_tensor = torch.zeros(num_entity)
        for i in range(num_entity):
            triplet[:,2] = torch.tensor(i)
            score = model.cal_distance(triplet)
            #print(score.shape)
            score_tensor[i] = score.cpu()
        sorted_score_tensor, index_score_tensor = torch.sort(score_tensor)
        del score_tensor
        del sorted_score_tensor
        tail_rank = (index_score_tensor == original_tail).nonzero()[0,0]
        del index_score_tensor
        tail_rank += torch.tensor(1)
        tail_mrr_score += torch.tensor(1.0) / tail_rank
        tail_mr_score += tail_rank
    tail_mr_score = tail_mr_score / torch.tensor(len(datset))
    tail_mrr_score = tail_mrr_score / torch.tensor(len(dataset))
    print('Mean Rank is for tail prediction is: ', tail_mr_score)

            
        

### Set parameters and call train TransE:

In [13]:
select_train_model = input('Train a new model?(y/n) : ')

if select_train_model == 'y':
    
    transe_model = TransE(device = 'cuda', num_entity = 184732, num_relation = 28, emb_dim = 50, gamma = 0.01)
    optimizer = torch.optim.SGD(transe_model.parameters(), lr=0.01)

    data_loader = create_dataloader(obl_dataset.training, batch_size=512, shuffle=True)
    train_transe(model=transe_model, data_loader=data_loader, optimizer=optimizer, epoch=20, num_entity=184732, device='cuda')
    
    select_save_model = input('Save model ?(y/n) :')
    if select_save_model == 'y':
        torch.save(transe_model, 'transe_model.pt')
        print('Saving done!')
        
elif select_train_model == 'n':
    print('Loading a TransE model from disk...')
    transe_model = torch.load('transe_model.pt')
    print('Done!')
    
    

Train a new model?(y/n) :  n


Loading a TransE model from disk...
Done!


In [14]:
evaluate_model(model=transe_model, num_entity=184732, dataset=TensorDataset(obl_dataset.validation), device='cuda')

KeyboardInterrupt: 