In [28]:
import pandas as pd
import numpy as np
import json
import csv
import matplotlib.pyplot as plt
import warnings
import pickle
import os
import time

from gensim.models import KeyedVectors
from ampligraph.latent_features import ComplEx, save_model, restore_model

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, cross_val_score, cross_validate, cross_val_predict

from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier    
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import f1_score, accuracy_score, recall_score, cohen_kappa_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight

import random
# Seed Python's and NumPy's global random number generators so repeated runs
# draw the same pseudo-random sequence from these two sources.
random.seed(0)
np.random.seed(0)

# CLASSIFICATION OF THE NEWLY RELEASED TEST SET, WITH THE TRAINING OF THE FULL TRAIN DATA SET
# (25k statements with duplicates)

In [29]:
# Load the training array; its last column carries the labels.
trainData = np.load("test/train.npy")
# Slice the label column and convert it to float in a single step.
Y = trainData[:, -1].astype(float)

In [30]:
# Sanity check: display (rows, columns) of the loaded training array.
trainData.shape

(23928, 4)

In [31]:
# Load all test triplets
# The converters below drop the last column of each row before the embedding
# lookup; that column is later handed to export_results as the statement
# (presumably the statement identifier -- confirm against the data files).
testData = np.load("test/test.npy")
testData.shape

(25000, 4)

In [32]:
# Load models
RDF2VEC_models_dn = "test/RDF2VECModels"
#RDF2VEC_models_dn = "./RDF2VEC/fulldata-20epochs"
#RDF2VEC_models_dn = "./RDF2VEC/fulldata-200dims"

# Load Ampligraph model
# for fname in os.listdir(ampligraph_models_dn):
#     models_names.append(fname)
#     models.append(restore_model(os.path.join(ampligraph_models_dn,fname)))

# Load RDF2VEC models: every directory entry is treated as one saved KeyedVectors file.
# NOTE: os.listdir returns entries in arbitrary order, so positions in these
# lists are not guaranteed to be stable between machines or runs.
models_names = list(os.listdir(RDF2VEC_models_dn))
models = [
    KeyedVectors.load(os.path.join(RDF2VEC_models_dn, model_file))
    for model_file in models_names
]



2019-06-17 11:55:55,483 - gensim.utils - INFO - loading Word2VecKeyedVectors object from test/RDF2VECModels/RDF2Vec_sg_fulldata-2_300_10_8_wv
2019-06-17 11:55:56,104 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2019-06-17 11:55:56,105 - gensim.utils - INFO - loaded test/RDF2VECModels/RDF2Vec_sg_fulldata-2_300_10_8_wv
2019-06-17 11:55:56,106 - gensim.utils - INFO - loading Word2VecKeyedVectors object from test/RDF2VECModels/RDF2Vec_sg_fulldata-2_150_10_8_wv
2019-06-17 11:55:56,320 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2019-06-17 11:55:56,322 - gensim.utils - INFO - loaded test/RDF2VECModels/RDF2Vec_sg_fulldata-2_150_10_8_wv


In [33]:
# Convert triplets from numpy array into embedding space of model (to apply ML models), for RDF2VEC model
def triplets_to_vec_RDF2VEC(triplets_arr, model):
    """Embed each row of ``triplets_arr`` as one flat feature vector.

    Every row must unpack to subject, predicate, object after its last
    element is dropped (the trailing element is ignored here). The three
    terms are looked up with ``model.get_vector`` and concatenated in
    subject, predicate, object order.

    Parameters:
        triplets_arr: iterable of rows ``(s, p, o, <ignored>)``.
        model: object exposing ``get_vector(key)``.

    Returns:
        ``np.ndarray`` with one concatenated embedding per input row.
    """
    def embed_row(row):
        subj, pred, obj = row[:-1]
        parts = (
            model.get_vector(subj),
            model.get_vector(pred),
            model.get_vector(obj),
        )
        return np.concatenate(parts).flatten()

    return np.array([embed_row(row) for row in triplets_arr])

# Same for ampligraph models
def triplets_to_vec_Ampligraph(triplets_arr, model):
    """Embed each row of ``triplets_arr`` using an Ampligraph-style model.

    Every row must unpack to subject, predicate, object after its last
    element is dropped. Subject and object are fetched together with
    ``model.get_embeddings(..., type='entity')`` and the predicate with
    ``type='relation'``; the flattened result is ordered subject, object,
    predicate (unlike the RDF2VEC converter, which uses subject,
    predicate, object).

    Parameters:
        triplets_arr: iterable of rows ``(s, p, o, <ignored>)``.
        model: object exposing ``get_embeddings(items, type=...)``.

    Returns:
        ``np.ndarray`` with one flattened embedding per input row.
    """
    def embed_row(row):
        subj, pred, obj = row[:-1]
        entity_vecs = model.get_embeddings([subj, obj], type='entity')
        relation_vecs = model.get_embeddings([pred], type='relation')
        return np.concatenate((entity_vecs, relation_vecs)).flatten()

    return np.array([embed_row(row) for row in triplets_arr])

In [34]:
# Show the loaded model names; models_names[i] is the name of models[i]
# because both lists were filled in lockstep.
print(models_names)

['RDF2Vec_sg_fulldata-2_300_10_8_wv', 'RDF2Vec_sg_fulldata-2_150_10_8_wv']


In [35]:
# Get numeric data of all models for Hold Out Scheme
# all_Xtrain[i] / all_Xtest[i] hold the float feature matrices produced with models[i].
all_Xtrain, all_Xtest = [], []
for name, mod in zip(models_names, models):
    # Names containing "RDF2Vec" use the RDF2VEC converter; anything else is
    # treated as an Ampligraph model.
    to_vec = triplets_to_vec_RDF2VEC if "RDF2Vec" in name else triplets_to_vec_Ampligraph
    X_train = to_vec(trainData, mod).astype(float)
    X_test = to_vec(testData, mod).astype(float)
    all_Xtrain.append(X_train)
    all_Xtest.append(X_test)

In [36]:
# Export prediction of statements in the challenge format
def export_results(y_pred, data, fn):
    """Write one truth-value line per statement to ``fn``.

    Each output line has the form::

        <statement> <http://swc2017.aksw.org/hasTruthValue> "<score>"^^<http://www.w3.org/2001/XMLSchema#double> .

    where ``<statement>`` is written verbatim (it must already be a string)
    and ``<score>`` is ``str(score)``.

    Parameters:
        y_pred: iterable of scores, paired positionally with ``data``.
        data: iterable of statement strings.
        fn: path of the file to create or overwrite.

    Pairing stops at the shorter of the two iterables (``zip`` semantics).
    The file is written as UTF-8, the encoding N-Triples requires, instead
    of the platform default.
    """
    predicate = " <http://swc2017.aksw.org/hasTruthValue> "
    datatype_suffix = "\"^^<http://www.w3.org/2001/XMLSchema#double> .\n"
    # Build the whole payload with a single join rather than repeated string
    # concatenation, and do it before opening the file so a formatting error
    # never leaves a truncated file behind.
    output = "".join(
        statement + predicate + "\"" + str(score) + datatype_suffix
        for score, statement in zip(y_pred, data)
    )
    with open(fn, "w", encoding="utf-8") as f:
        f.write(output)

In [37]:
# BEST MLP CLASSIFIER, after param search on valid set
# Two hidden layers (300 then 100 units), Adam solver, fixed random_state.
mlp_best = MLPClassifier(
    hidden_layer_sizes=(300, 100),
    solver='adam',
    alpha=0.0001,
    random_state=1,
)

In [38]:
# Select the 300-dimensional RDF2Vec features by model name rather than by
# position: the model lists are filled from os.listdir, whose order is
# arbitrary, so index 0 is not guaranteed to be this model. A missing model
# raises ValueError here instead of silently training on the wrong features.
best_idx = models_names.index("RDF2Vec_sg_fulldata-2_300_10_8_wv")
mlp_best.fit(all_Xtrain[best_idx], Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [39]:
# Look the 300-dimensional RDF2Vec model up by name: list positions come from
# os.listdir, whose order is arbitrary, so index 0 may be a different model.
best_idx = models_names.index("RDF2Vec_sg_fulldata-2_300_10_8_wv")
y_pred_TEST = mlp_best.predict_proba(all_Xtest[best_idx])
# Column 1 of predict_proba is exported as the score; the fourth column of
# testData supplies the statement written at the start of each line.
export_results(y_pred_TEST[:,1], testData[:,3], "Results_MLPbest_rdf2vec_300_10_8.nt")