In [1]:
import pandas as pd
import numpy as np
import json
import csv
import matplotlib.pyplot as plt
import warnings
import pickle
import os
import time

from gensim.models import KeyedVectors
from ampligraph.latent_features import ComplEx, save_model, restore_model

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, cross_val_score, cross_validate, cross_val_predict

from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier    
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import f1_score, accuracy_score, recall_score, cohen_kappa_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight

from xgboost import XGBClassifier

  from numpy.core.umath_tests import inner1d


In [11]:
# Loading triplets and labels for training and test sets used for embedding
trainData = np.load("dev/trainDev.npy")
testData = np.load("dev/valid.npy")
Y_train = trainData[:,-1].astype(float)
Y_test = testData[:,-1].astype(float)
print(Y_test)

[1. 0. 0. ... 1. 1. 1.]


In [12]:
# Load models
models = []
models_names = []

# DIRECTORIES NAME OF EMBEDDING MODELS
ampligraph_models_dn = "./dev/ampligraphModels"
RDF2VEC_models_dn = "./dev/RDF2VECModels"

# Load Ampligraph model
for fname in os.listdir(ampligraph_models_dn):
    models_names.append(fname)
    models.append(restore_model(os.path.join(ampligraph_models_dn,fname)))
    
#Load RDF2VEC models
for fname in os.listdir(RDF2VEC_models_dn):
    models_names.append(fname)
    models.append(KeyedVectors.load(os.path.join(RDF2VEC_models_dn,fname)))

2019-06-17 10:39:37,083 - gensim.utils - INFO - loading Word2VecKeyedVectors object from ./dev/RDF2VECModels/RDF2Vec_sg_WalksData-2_300_10_8_1neg_wv
2019-06-17 10:39:37,200 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2019-06-17 10:39:37,202 - gensim.utils - INFO - loaded ./dev/RDF2VECModels/RDF2Vec_sg_WalksData-2_300_10_8_1neg_wv
2019-06-17 10:39:37,202 - gensim.utils - INFO - loading Word2VecKeyedVectors object from ./dev/RDF2VECModels/RDF2Vec_sg_WalksData-2_150_10_8_10neg_wv
2019-06-17 10:39:37,271 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2019-06-17 10:39:37,272 - gensim.utils - INFO - loaded ./dev/RDF2VECModels/RDF2Vec_sg_WalksData-2_150_10_8_10neg_wv
2019-06-17 10:39:37,273 - gensim.utils - INFO - loading Word2VecKeyedVectors object from ./dev/RDF2VECModels/RDF2Vec_sg_WalksData-2_300_10_8_10neg_wv
2019-06-17 10:39:37,390 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2019-06-17 10:39:37,391 - gensim.ut

In [13]:
# Convert triplets to embedding model transR from json dict mapping
def triplets_to_vec_TransR(triplets_arr, model_ents, model_rels):
    X = []
    for triplet in triplets_arr:
        s,p,o = triplet[:-1]
        # the URIS in the dict does not have chevrons
        es = model_ents[s[1:-1]]
        ep = model_rels[p[1:-1]]
        eo = model_ents[o[1:-1]]
        embedding = np.concatenate((es, ep, eo)).flatten()
        X.append(embedding)
    X = np.array(X)
    return X

In [14]:
# Convert triplets from numpy array into embedding space of model (to apply ML models), for RDF2VEC model
def triplets_to_vec_RDF2VEC(triplets_arr, model):
    X = []
    for triplet in triplets_arr:
        s,p,o = triplet[:-1]
        es = model.get_vector(s)
        ep = model.get_vector(p)
        eo = model.get_vector(o)
        embedding = np.concatenate((es, ep, eo)).flatten()
        X.append(embedding)
    X = np.array(X)
    return X

# Same for ampligraph models
def triplets_to_vec_Ampligraph(triplets_arr, model):
    X = []
    for triplet in triplets_arr:
        s,p,o = triplet[:-1]
        eso = model.get_embeddings([s,o], type='entity')
        ep = model.get_embeddings([p], type='relation')
        embedding = np.concatenate((eso,ep)).flatten()
#         print(eso.shape)
#         print(ep.shape)
#         print(embedding.shape)
        X.append(embedding)
    X = np.array(X)
    return X

In [15]:
print(models_names)

['HolE_150_200_1', 'ComplEx_150_200_1', 'TransE_150_200_1', 'DistMult_150_200_1', 'RDF2Vec_sg_WalksData-2_300_10_8_1neg_wv', 'RDF2Vec_sg_WalksData-2_150_10_8_10neg_wv', 'RDF2Vec_sg_WalksData-2_300_10_8_10neg_wv', 'RDF2Vec_sg_WalksData-2_150_10_8_1neg_wv', 'RDF2Vec_sg_WalksData_300_200_8_wv']


In [16]:
# Get numeric data of all models for Hold Out Scheme
all_Xtrain = []
all_Xtest = []
for mod,name in zip(models, models_names):
    # RDF2VEC models
    if "RDF2Vec" in name:
        X_train = triplets_to_vec_RDF2VEC(trainData, mod).astype(float)
        X_test = triplets_to_vec_RDF2VEC(testData, mod).astype(float)
    # Ampligraph models
    else:
        X_train = triplets_to_vec_Ampligraph(trainData, mod).astype(float)
        #print(name)
        #print(X_train.shape)
        X_test = triplets_to_vec_Ampligraph(testData, mod).astype(float)
    all_Xtrain.append(X_train)
    all_Xtest.append(X_test)

In [24]:
# Hold out Scheme
def run_classifiers_HO(clfs,Xtrain,Ytrain, Xtest, Ytest, name):
    scoring = ['accuracy', "roc_auc"]
    fieldnames = ['Algorithm', 'AUC', "Accuracy", "Time"]
    
    # write header of csv result file
#     with open('./results/' + name + ".csv", 'w', newline='') as csvfile:
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
    # Classifiers loop
    for i in clfs:
        try:
            clf = clfs[i]
            print("\n\n======= {0} =======".format(i))
            start = time.time()
            clf.fit(Xtrain, Ytrain)
            end = time.time()
            timing = end - start
            y_pred = clf.predict(Xtest)
            auc = roc_auc_score(Ytest, y_pred)
            acc = accuracy_score(Ytest, y_pred)
            
            print("execution time : ", timing)
            print("AUC : ", auc)
            print("accuracy : ", acc)
            
            results = {"Algorithm" : i, "Time": timing, "Accuracy": acc, "AUC": auc}
            
            # Write result in file
#             with open('./results/' + name + ".csv", 'w', newline='') as csvfile:
#                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#                 writer.writerow(results)
            
            # Saving results to file
#             w = csv.writer(open("./results/" + name + "_" + i + ".csv", "w"))
#             for key, val in results.items():
#                 w.writerow([key, val])
            
        except Exception as e:
            print(e)

In [31]:
clfs_best = {
'RF': RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1),
'MLP': MLPClassifier(solver='adam',alpha=1e-4,hidden_layer_sizes=(300,100),random_state=1),
}

In [32]:
# Hold out scheme
for i in range(len(all_Xtrain)):
    print("\n\n\n>>>>>>>>>>>> {0} <<<<<<<<<<<<".format(models_names[i]))
    run_classifiers_HO(clfs_best, all_Xtrain[i], Y_train, all_Xtest[i], Y_test, models_names[i])




>>>>>>>>>>>> HolE_150_200_1 <<<<<<<<<<<<


execution time :  11.990300416946411
AUC :  0.8567827460677245
accuracy :  0.8594115680374457


execution time :  49.17029547691345
AUC :  0.8277663301050897
accuracy :  0.839518555667001



>>>>>>>>>>>> ComplEx_150_200_1 <<<<<<<<<<<<


execution time :  14.143194437026978
AUC :  0.8550257572635482
accuracy :  0.8555667001003009


execution time :  51.9817271232605
AUC :  0.8250504842365547
accuracy :  0.8254764292878636



>>>>>>>>>>>> TransE_150_200_1 <<<<<<<<<<<<


execution time :  6.384512901306152
AUC :  0.8482344254413077
accuracy :  0.8453694416583083


execution time :  46.03837180137634
AUC :  0.823463149941617
accuracy :  0.8263122701437646



>>>>>>>>>>>> DistMult_150_200_1 <<<<<<<<<<<<


execution time :  9.326711416244507
AUC :  0.8579387320557731
accuracy :  0.8594115680374457


execution time :  34.55751180648804
AUC :  0.8210217047874167
accuracy :  0.8321631561350719



>>>>>>>>>>>> RDF2Vec_sg_WalksData-2_300_10_8_1neg_wv 

In [33]:
# TRANSR model
# Load TransR model (07/06/19)
TransR_model_ents = "dev/transRmodel/entities_to_embeddings.json"
TransR_model_rels = "dev/transRmodel/relations_to_embeddings.json"

model_TransR_ents = json.load(open(TransR_model_ents))
model_TransR_rels = json.load(open(TransR_model_rels))

In [34]:
X_train_transR = triplets_to_vec_TransR(trainData, model_TransR_ents, model_TransR_rels)
X_test_transR = triplets_to_vec_TransR(testData, model_TransR_ents, model_TransR_rels)

In [35]:
run_classifiers_HO(clfs_best, X_train_transR, Y_train, X_test_transR, Y_test, "TransR")



execution time :  6.051275253295898
AUC :  0.8319788447008722
accuracy :  0.8276496155132063


execution time :  22.464711904525757
AUC :  0.7549814547702453
accuracy :  0.7587763289869609
