<a href="https://colab.research.google.com/github/JamieGal/GeoLit_KG-BERT-T/blob/master/DistMult.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DistMult

The following code loads the GeoLit train, dev and test splits from Google Drive and runs a grid search over DistMult hyperparameters using AmpliGraph.

The parameters and outputs used for the project report have been saved.

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset paths read later
# in this notebook live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Switch the Colab runtime to TensorFlow 1.x. Run this before the cell that
# imports tensorflow so that the selected version is the one that gets loaded.
# NOTE(review): presumably required because the AmpliGraph API used below is
# built on TensorFlow 1.x - confirm against the installed AmpliGraph version.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
! pip install ampligraph

In [None]:
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Import GeoLit dataset and split into subject, predicate, object as required by the DistMult model.

train_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/train.tsv"
valid_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/dev.tsv"
test_data = "/content/drive/My Drive/PROJECT/kg_bert_baseline_data/geolit_data/test.tsv"

TRIPLE_COLUMNS = ['subject', 'predicate', 'object']


def load_triples(path):
    """Read a headerless, tab-separated triple file into a labelled DataFrame.

    Every field is kept as a string: entity and relation labels are identifiers,
    so numeric-looking values must not be converted to numbers and tokens such
    as "NA" or "None" must not be turned into missing values.

    Args:
        path: Location of the .tsv file, one triple per row.

    Returns:
        DataFrame with the columns 'subject', 'predicate' and 'object'.

    Raises:
        ValueError: If the file does not have exactly three columns.
    """
    triples = pd.read_csv(path, sep="\t", header=None, dtype=str, keep_default_na=False)
    if triples.shape[1] != len(TRIPLE_COLUMNS):
        raise ValueError(
            "Expected {} tab-separated columns in {!r}, found {}".format(
                len(TRIPLE_COLUMNS), path, triples.shape[1]))
    triples.columns = TRIPLE_COLUMNS
    return triples


train_df = load_triples(train_data)
valid_df = load_triples(valid_data)
test_df = load_triples(test_data)

print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

# The grid search further down receives the triples as plain (n, 3) arrays.
train = train_df.to_numpy()
valid = valid_df.to_numpy()
test = test_df.to_numpy()

(643, 3)
(71, 3)
(179, 3)


In [None]:
from ampligraph.latent_features import DistMult
from ampligraph.evaluation import select_best_model_ranking

# Embedding model whose hyperparameters are searched.
model_class = DistMult

# Hyperparameter grid. List-valued entries are searched exhaustively
# (3 batches_count x 5 epochs x 4 k x 2 eta = 120 combinations); the scalar
# and single-element entries are the same for every run.
param_grid = {"batches_count": [50, 100, 150],
              "seed": 42,
              "epochs": [2, 3, 4, 5, 7],
              "k": [20, 60, 100, 150],
              "eta": [1, 5],
              "loss": ["multiclass_nll"],
              "initializer": ['xavier'],
              "initializer_params": {'uniform': False},
              "regularizer": ["LP"],
              "regularizer_params": {
                  "p": [2],
                  "lambda": [1e-3]},
              "optimizer": ["adam"],
              "optimizer_params": {
                  "lr": 0.001},
              "verbose": False}

# Early stopping on the validation MRR, as recommended.
# burn_in is set explicitly: when it is omitted AmpliGraph falls back to a
# default burn-in of 100 epochs, which is larger than every `epochs` value in
# the grid (max 7), so no early-stopping check would ever run.
# NOTE: outputs stored in this notebook were produced before burn_in was set
# explicitly; re-run the notebook to refresh them.
early_stopping_params = {
    'x_valid': valid,        # validation set on which early stopping is performed
    'criteria': 'mrr',       # metric watched during early stopping
    'burn_in': 0,            # no warm-up: checks start at the first epoch
    'check_interval': 1,     # run a check after every epoch
    'stop_interval': 2,      # stop once the criteria has degraded for 2 consecutive checks
    'corrupt_side': 's,o',   # sides corrupted during early-stopping evaluation (subject and object)
}

# select_best_model_ranking runs the grid search and returns, in order:
#   best_model           - the selected model
#   best_params          - its hyperparameter combination
#   best_mrr_train       - MRR of the selected model (despite the name, AmpliGraph
#                          documents this as computed on the validation set)
#   ranks_test           - ranks obtained on the test set
#   mrr_test             - dict of test-set metrics (mrr, mr, hits_1, hits_3, hits_10)
#   experimental_history - parameters and results of every combination tried
best_model, best_params, best_mrr_train, ranks_test, mrr_test, experimental_history = \
    select_best_model_ranking(
        model_class,
        train,
        valid,
        test,
        param_grid,
        use_filter=True,
        verbose=True,
        early_stopping=True,
        early_stopping_params=early_stopping_params,
    )

120it [04:02,  2.02s/it]


## Results and optimal hyperparameters below

In [None]:
# Report the MRR that the grid search returned for its selected model.
best_mrr_label = 'MRR of the best model:'
print(best_mrr_label, best_mrr_train)

MRR of the best model: 0.4160430738434622


In [None]:
# Report the test-set metrics returned by the grid search for the selected model.
test_metrics_label = 'Results for optimized parameters:'
print(test_metrics_label, mrr_test)

Results for optimized parameters: {'mrr': 0.3729271836263001, 'mr': 26.33240223463687, 'hits_1': 0.25139664804469275, 'hits_3': 0.43854748603351956, 'hits_10': 0.5782122905027933}


In [None]:
# Hyperparameter combination of the best model, i.e. the second value returned
# by select_best_model_ranking above; displayed as this cell's output.
best_params

{'batches_count': 100,
 'epochs': 7,
 'eta': 5,
 'initializer': 'xavier',
 'initializer_params': {'uniform': False},
 'k': 150,
 'loss': 'multiclass_nll',
 'optimizer': 'adam',
 'optimizer_params': {'lr': 0.001},
 'regularizer': 'LP',
 'regularizer_params': {'lambda': 0.001, 'p': 2},
 'seed': 42,
 'verbose': False}

In [None]:
# Flatten the grid-search history into one table: one row per combination tried,
# with the hyperparameters followed by the evaluation metrics.
history = pd.DataFrame.from_dict(experimental_history, orient='columns')

# Each cell of 'model_params' / 'results' holds a dict; expand every dict into
# its own set of columns.
params, results = (
    history[column].apply(pd.Series) for column in ('model_params', 'results')
)

metrics = pd.concat([params, results], axis=1)
metrics

Unnamed: 0,batches_count,seed,epochs,k,eta,loss,initializer,regularizer,optimizer,verbose,optimizer_params,regularizer_params,initializer_params,mrr,mr,hits_1,hits_3,hits_10
0,50,42,2,20,1,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.014167,137.936620,0.000000,0.000000,0.014085
1,50,42,2,20,5,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.013354,142.295775,0.000000,0.000000,0.014085
2,50,42,2,60,1,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.011057,152.570423,0.000000,0.000000,0.007042
3,50,42,2,60,5,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.009324,157.288732,0.000000,0.000000,0.000000
4,50,42,2,100,1,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.023105,131.626761,0.000000,0.007042,0.056338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,150,42,7,60,5,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.321824,33.978873,0.190141,0.408451,0.549296
116,150,42,7,100,1,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.267397,55.626761,0.197183,0.302817,0.373239
117,150,42,7,100,5,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.405845,27.176056,0.309859,0.443662,0.570423
118,150,42,7,150,1,multiclass_nll,xavier,LP,adam,False,{'lr': 0.001},"{'p': 2, 'lambda': 0.001}",{'uniform': False},0.271317,50.528169,0.183099,0.309859,0.387324


In [None]:
# Write the grid-search results table to a CSV file and offer it as a download.

from google.colab import files

results_csv = 'DistMult_results.csv'
metrics.to_csv(results_csv)
files.download(results_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>