In [1]:
from sklearn.ensemble import *
import sklearn.metrics
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np



In [48]:
def load_file_data(path):
    """Read a whitespace-delimited text file into a list of token lists.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One entry per line, in file order. Each entry holds the
        whitespace-separated tokens of that line (an empty list for a
        blank line).
    """
    # The context manager closes the handle even if reading raises;
    # previously the file was opened and never closed.
    with open(path, 'r') as file:
        return [line.strip("\n").split() for line in file]

def load_embedding(path, skipFirst = False):
    """Load a whitespace-delimited matrix of numbers from a text file.

    Parameters
    ----------
    path : str
        Path of the text file; every non-blank line is one row of numbers.
    skipFirst : bool, optional
        When true, the first line of the file (e.g. a header) is ignored.

    Returns
    -------
    numpy.ndarray
        ``float64`` array with one row per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a token cannot be converted to a float.
    """
    # The context manager guarantees the handle is closed; the original
    # leaked it.
    with open(path, 'r') as file:
        if skipFirst:
            # Consume the first line; the default avoids StopIteration on
            # an empty file.
            next(file, None)
        rows = (line.strip("\n").split() for line in file)
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no row and would otherwise make the matrix ragged.
        data = [tokens for tokens in rows if tokens]
    return np.array(data).astype('float64')

def convert_ids(ids, lookup_current, lookup_replace):
    """Translate ids from one numbering into another through two lookups.

    Used to replace TransE ids with our ids, because the two use different
    mappings. Each id is converted to ``str`` and looked up in
    ``lookup_current``; the value found there is then used as the key into
    ``lookup_replace``.

    Returns the translated ids as a list, in the order of ``ids``. A missing
    key in either lookup raises ``KeyError``.
    """
    return [
        lookup_replace[lookup_current[str(current_id)]]
        for current_id in ids
    ]

In [3]:
# Name of the sub-directory under ../data/ that the loading cells read from.
dataset = 'fb15k'

In [4]:
# Load the three mapping files for the selected dataset. Each is read as a
# list of whitespace-split rows (one row per line of the file).
entity_mapping = load_file_data('../data/'+dataset+'/entity_mapping.txt')
rel_mapping = load_file_data('../data/'+dataset+'/relation_mapping.txt')
# NOTE(review): presumably maps the SME/TransE row index to an entity or
# relation name, given how convert_ids consumes it -- confirm against the file.
sme_mapping = load_file_data('../data/'+dataset+'/id2entity.sme.txt')

In [37]:
# Merge entity and relation rows into a single lookup; every row must be a
# (key, value) pair. Relation rows are applied last, so they win on a
# duplicate key.
our_mapping = dict(entity_mapping)
our_mapping.update(rel_mapping)
# Turn the SME rows into a lookup as well. The name is rebound from the
# list of rows to the dict built from it.
sme_mapping = dict(sme_mapping)

In [145]:
# Read the three triple files of the dataset; each becomes a list of
# whitespace-split rows.
train = load_file_data('../data/'+dataset+'/train.txt')
test = load_file_data('../data/'+dataset+'/test.txt')
valid = load_file_data('../data/'+dataset+'/valid.txt')

# Stack all triples into one array, ignoring the train/test/valid partition.
triples = np.concatenate([train, test, valid])

# Numeric embedding matrix, one row per line of the file (no header skipped).
embeddings = load_embedding('../data/'+dataset+'/embeddings.transe.txt')

In [146]:
# Translate every embedding row index (0 .. len(embeddings)-1) into our id,
# going through the SME lookup first and our own mapping second.
new_ids = convert_ids(range(len(embeddings)), sme_mapping, our_mapping)
# Shape the ids as a single column so they can be placed next to the embeddings.
new_ids = np.array(new_ids).reshape(-1, 1)
# Flag column: 1 for the first len(entity_mapping) rows, 0 for the following
# len(rel_mapping) rows.
# NOTE(review): assumes the embedding file lists all entities before all
# relations -- confirm.
ent_flag = np.vstack([
    np.ones((len(entity_mapping), 1)),
    np.zeros((len(rel_mapping), 1)),
])

In [147]:
# Prepend two columns to the embedding matrix: our id, and a flag telling
# whether the row is an entity or a relation embedding.
# This rebinds `embeddings` from itself, so running the cell a second time
# prepends the two columns again.
embeddings = np.hstack([new_ids, ent_flag, embeddings])

In [153]:
def get_embedding(embedding_frame, idx, entity = 1):
    """Return the embedding vector stored for one id.

    Parameters
    ----------
    embedding_frame : pandas.DataFrame
        Frame whose column ``0`` holds the id, column ``1`` the
        entity/relation flag, and the remaining columns the vector.
    idx : int
        Id to look up in column ``0``.
    entity : int, optional
        Flag value required in column ``1`` (default ``1``).

    Returns
    -------
    list
        Values of the first matching row, without the two leading columns.

    Raises
    ------
    IndexError
        If no row matches both ``idx`` and ``entity``.
    """
    # Build one mask over the full frame. Filtering twice in a row applied a
    # full-length mask to an already-filtered frame, which pandas has to
    # reindex (and warns about).
    mask = (embedding_frame[0] == idx) & (embedding_frame[1] == entity)
    matches = embedding_frame.loc[mask].values
    if len(matches) == 0:
        raise IndexError(
            'no embedding with id {!r} and flag {!r}'.format(idx, entity))
    return matches[0].tolist()[2:]

In [159]:
def _vectors_by_id(embedding_table, flag):
    """Map id (column 0) to vector (columns 2+) for rows whose column 1 equals ``flag``."""
    lookup = {}
    for row in embedding_table[embedding_table[:, 1] == flag]:
        # Keep the first row seen for an id.
        lookup.setdefault(int(row[0]), row[2:])
    return lookup


def x_y_split_with_rel(data, embeddings, entities, relations):
    """Build one regression sample per (entity, relation) pair.

    The features of a sample are the entity embedding followed by the
    relation embedding. The target is the number of triples in ``data``
    that have that entity as subject together with that relation (0 when
    the pair never occurs).

    Parameters
    ----------
    data : array-like of shape (n_triples, 3)
        Triples with columns ``subject``, ``object``, ``relation``, given as
        integers or integer strings.
    embeddings : array-like of shape (n_rows, 2 + embedding_size)
        Column 0 is the id, column 1 the flag (1 = entity, 0 = relation),
        the remaining columns the embedding vector.
    entities : iterable of int
        Entity ids to enumerate.
    relations : iterable of int
        Relation ids to enumerate.

    Returns
    -------
    x : numpy.ndarray of shape (len(entities) * len(relations), 2 * embedding_size)
        Rows are ordered entity-major: all relations for the first entity,
        then all relations for the second, and so on.
    y : numpy.ndarray of shape (len(entities) * len(relations),)
        Triple count for each row of ``x``.

    Raises
    ------
    KeyError
        If an entity or relation id has no row in ``embeddings``.

    Notes
    -----
    The result has ``len(entities) * len(relations)`` rows, so memory use
    grows with the product of both sizes.
    """
    embedding_table = np.asarray(embeddings).astype('float64')
    # Two leading columns (id and flag) precede the vector itself.
    embedding_size = embedding_table.shape[1] - 2

    # `data` must not be rebound before this point: the counts come from it.
    triples = np.asarray(data).astype(np.int32).reshape(-1, 3)
    data_frame = pd.DataFrame(triples, columns=['subject', 'object', 'relation'])
    known_values = (
        data_frame.groupby(['subject', 'relation'])['object'].count().to_dict()
    )

    entity_ids = [int(e) for e in entities]
    relation_ids = [int(r) for r in relations]

    # Resolve every embedding once instead of once per (entity, relation) pair.
    entity_vectors = _vectors_by_id(embedding_table, 1)
    relation_vectors = _vectors_by_id(embedding_table, 0)
    entity_block = np.array(
        [entity_vectors[e] for e in entity_ids], dtype='float64'
    ).reshape(len(entity_ids), embedding_size)
    relation_block = np.array(
        [relation_vectors[r] for r in relation_ids], dtype='float64'
    ).reshape(len(relation_ids), embedding_size)

    # Row i * len(relations) + j pairs entity i with relation j.
    x = np.hstack([
        np.repeat(entity_block, len(relation_ids), axis=0),
        np.tile(relation_block, (len(entity_ids), 1)),
    ])
    y = np.array([
        known_values.get((e, r), 0)
        for e in entity_ids
        for r in relation_ids
    ])
    return x, y

In [156]:
def x_y_split(data, embeddings):
    """Build (features, target) for predicting the triple count per subject.

    ``embeddings`` is read as a table whose column 0 is the id and whose
    columns from 2 onwards are the features. ``data`` is read as triples
    with columns ``subject``, ``object``, ``relation``.

    Returns ``(x, y)``:

    * ``x`` -- the feature columns of every embedding row whose id occurs as
      a subject, ordered by id, with a fresh 0..n-1 index.
    * ``y`` -- the number of triples per subject, ordered by subject id and
      indexed by subject id.

    Three diagnostic shapes are printed before returning.

    NOTE(review): ``x`` and ``y`` line up row by row only if every subject
    has exactly one embedding row -- confirm for the data in use.
    """
    embedding_frame = pd.DataFrame(embeddings, dtype='float64')
    # Everything after the id and flag columns is a feature.
    feature_columns = embedding_frame.columns.values[2:]
    embedding_frame[0] = embedding_frame[0].astype('int64')

    data_frame = pd.DataFrame(data=data, columns=['subject', 'object', 'relation'], dtype=np.int32)
    triples_per_subject = data_frame.groupby('subject').count()['relation']
    y = triples_per_subject.sort_index()

    # Keep only embeddings of ids that occur as a subject, ordered by id.
    subject_ids = data_frame['subject'].unique()
    is_subject = embedding_frame[0].isin(subject_ids)
    ordered_rows = embedding_frame[is_subject].sort_values(by=0).reset_index(drop=True)
    x = ordered_rows[feature_columns]

    print(embedding_frame.shape)
    print(embedding_frame[0].unique().shape)
    print(data_frame['subject'].unique().shape)

    return x, y

In [None]:
# Column 1 of every mapping row is read as the integer id to enumerate.
entities = np.asarray(entity_mapping)[:, 1].astype(np.int32)
relations = np.asarray(rel_mapping)[:, 1].astype(np.int32)

# One sample per (entity, relation) pair.
x, y = x_y_split_with_rel(triples, embeddings, entities, relations)
# Alternative, kept for reference: one sample per subject, using only the
# first len(entity_mapping) embedding rows.
#x, y = x_y_split(triples, embeddings[:len(entity_mapping)])

  


In [None]:
# Display the shapes of x and y as the cell output.
x.shape, y.shape

In [54]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
# NOTE(review): train_test_split is imported from sklearn.cross_validation
# at the top of the notebook; that module was removed in scikit-learn 0.20
# in favour of sklearn.model_selection -- update the import before running
# on a current install.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [58]:
# (label, estimator) pairs to evaluate. The random-forest candidate is
# currently disabled:
#   ('RandomForest', RandomForestRegressor(20, max_depth=2, random_state=0)),
regressors = [
    (
        'GradientBoost',
        GradientBoostingRegressor(learning_rate=0.05, max_depth=4, random_state=0),
    ),
]

# Fit each candidate on the training split and report its mean squared
# error on the held-out split.
for name, regr in regressors:
    regr.fit(x_train, y_train)
    predictions = regr.predict(x_test)
    mse = sklearn.metrics.mean_squared_error(y_test, predictions)
    print(name, mse)

GradientBoost 5653.051215829783
