<a href="https://colab.research.google.com/github/JamieGal/GeoLit_KG-BERT-T/blob/master/TransE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TransE

The following code imports the GeoLit train, dev and test data from Google Drive and runs a grid search over TransE hyperparameters using AmpliGraph.

The parameters and outputs used for the project report have been saved.

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Select TensorFlow 1.x for this runtime; this has to run before TensorFlow is imported.
# NOTE(review): presumably needed because the AmpliGraph release used here targets TF 1.x,
# and this Colab-only magic may no longer be offered on current runtimes - confirm.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
! pip install ampligraph

In [None]:
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Load the three GeoLit splits from Drive and turn each into an array of
# (subject, predicate, object) triples, the layout the TransE model consumes.

train_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/train.tsv"
valid_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/dev.tsv"
test_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/test.tsv"

triple_columns = ['subject', 'predicate', 'object']

# The files are tab-separated and carry no header row; read all of them first.
frames = [
    pd.read_csv(path, sep="\t", header=None)
    for path in (train_data, valid_data, test_data)
]

# Label the three columns of every split.
for frame in frames:
    frame.columns = triple_columns

# Report (rows, columns) per split, in train / valid / test order.
for frame in frames:
    print(frame.shape)

# Plain NumPy arrays are what gets handed to the model-selection routine.
train, valid, test = (frame.to_numpy() for frame in frames)

(643, 3)
(71, 3)
(179, 3)


In [None]:
from ampligraph.latent_features import TransE
from ampligraph.evaluation import select_best_model_ranking

# Embedding model whose hyperparameters are searched
model_class = TransE

# Hyperparameter grid: list values are enumerated by the search, scalars stay fixed.
# 3 batches_count x 5 epochs x 4 k x 2 eta = 120 candidate configurations.
param_grid = {
    "batches_count": [50, 100, 150],
    "seed": 42,
    "epochs": [2, 3, 4, 5, 7],
    "k": [20, 60, 100, 150],
    "eta": [1, 5],
    "loss": ["pairwise"],
    "loss_params": {},
    "embedding_model_params": {},
    "regularizer": ["LP"],
    "regularizer_params": {"p": [2], "lambda": [1e-3]},
    "optimizer": ["adam"],
    "optimizer_params": {"lr": 1e-3},
    "verbose": False,
}

# Set early stopping on the MRR value, as recommended.
# AmpliGraph only begins checking the criterion after `burn_in` epochs, and that
# setting defaults to 100. Every run in this grid lasts at most 7 epochs, so
# without an explicit `burn_in` the early-stopping settings would never be consulted.
# Setting it to 0 makes early stopping take effect from the first epoch.
# NOTE: re-running with early stopping active can yield metrics that differ from
# outputs produced while it was inert.
early_stopping_params = {
    'x_valid': valid,         # Validation set on which early stopping is performed
    'criteria': 'mrr',        # Metric to watch during early stopping
    'burn_in': 0,             # Epochs to train before early stopping may kick in
    'check_interval': 1,      # Evaluate the criterion every epoch
    'stop_interval': 2,       # Stop once the criterion has worsened on 2 consecutive checks
    'corrupt_side': 's,o',    # Which sides to corrupt during early stopping evaluation
}

# select_best_model_ranking runs the grid search and returns the selected model,
# its parameters, the MRR used for selection, the test ranks, the test metrics,
# and the per-configuration history.
best_model, best_params, best_mrr_train, ranks_test, mrr_test, experimental_history = \
    select_best_model_ranking(
        model_class,
        train,
        valid,
        test,
        param_grid,
        use_filter=True,
        verbose=True,
        early_stopping=True,
        early_stopping_params=early_stopping_params,
    )

120it [04:25,  2.21s/it]


## Results and optimal hyperparameters below

In [None]:
# Report the MRR that the grid search returned for the selected configuration.
# NOTE(review): per the AmpliGraph docs this value is measured on the validation
# split during model selection, despite the `_train` suffix - confirm.
print(f'MRR of the best model: {best_mrr_train}')

MRR of the best model: 0.3147463562392595


In [None]:
# Show the metrics dictionary the search returned for the selected model on the test split
print(f'Results for optimized parameters: {mrr_test}')

Results for optimized parameters: {'mrr': 0.29910361943484215, 'mr': 36.86871508379888, 'hits_1': 0.19553072625698323, 'hits_3': 0.35195530726256985, 'hits_10': 0.48324022346368717}


In [None]:
# Display the hyperparameter combination that the grid search returned as best
best_params

{'batches_count': 150,
 'epochs': 7,
 'eta': 5,
 'k': 20,
 'loss': 'pairwise',
 'optimizer': 'adam',
 'optimizer_params': {'lr': 0.001},
 'regularizer': 'LP',
 'regularizer_params': {'lambda': 0.001, 'p': 2},
 'seed': 42,
 'verbose': False}

In [None]:
# Build one table with a row per evaluated configuration:
# hyperparameters on the left, ranking metrics on the right.
history = pd.DataFrame.from_dict(experimental_history, orient='columns')

# Each history entry stores its parameters and its metrics as dicts;
# expand both into proper columns.
params = history['model_params'].apply(pd.Series)
metrics = history['results'].apply(pd.Series)

results = pd.concat([params, metrics], axis=1)
results

Unnamed: 0,batches_count,seed,epochs,k,eta,loss,regularizer,optimizer,verbose,optimizer_params,regularizer_params,mrr,mr,hits_1,hits_3,hits_10
0,50,42,2,20,1,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.182371,97.669014,0.161972,0.169014,0.190141
1,50,42,2,20,5,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.177799,86.161972,0.147887,0.161972,0.211268
2,50,42,2,60,1,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.162400,79.415493,0.098592,0.176056,0.253521
3,50,42,2,60,5,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.203892,63.661972,0.119718,0.232394,0.366197
4,50,42,2,100,1,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.148905,80.760563,0.056338,0.176056,0.330986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,150,42,7,60,5,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.271030,41.718310,0.154930,0.345070,0.464789
116,150,42,7,100,1,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.246557,50.704225,0.126761,0.316901,0.450704
117,150,42,7,100,5,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.215573,45.197183,0.070423,0.295775,0.471831
118,150,42,7,150,1,pairwise,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",0.194443,56.042254,0.105634,0.225352,0.380282


In [None]:
# Write the grid-search table to CSV in the Colab working directory,
# then trigger a browser download of that file.

from google.colab import files

results_csv = 'TransE_results.csv'
results.to_csv(results_csv)
files.download(results_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>