In [1]:
from sklearn.ensemble import *
from sklearn.linear_model import LinearRegression, BayesianRidge, HuberRegressor
from sklearn.kernel_ridge import KernelRidge
import sklearn.metrics
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np



In [2]:
def load_file_data(path):
    """Read a whitespace-delimited text file into a list of token lists.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list of
        whitespace-separated string tokens on that line (an empty list for a
        blank line).
    """
    # The context manager closes the handle even when reading fails; the
    # handle used to be left open for the lifetime of the kernel.
    with open(path, 'r') as file:
        # str.split() without a separator already drops the trailing newline
        # along with any other leading/trailing whitespace.
        return [line.split() for line in file]

def load_embedding(path, skipFirst = False):
    """Load a whitespace-delimited numeric text file as a float64 array.

    Args:
        path: Path of the embedding file; every data line holds the
            whitespace-separated components of one embedding.
        skipFirst: When true, the first line of the file is discarded
            instead of being parsed as an embedding.

    Returns:
        A numpy array of dtype float64 with one row per parsed line.

    Raises:
        ValueError: If a token cannot be converted to a float.
    """
    # The context manager guarantees the handle is closed, also on errors.
    with open(path, 'r') as file:
        if skipFirst:
            # Consume the first line; the default keeps an empty file from
            # raising StopIteration.
            next(file, None)
        # split() without arguments also strips the trailing newline.
        data = [line.split() for line in file]
    return np.array(data).astype('float64')

def convert_ids(ids, lookup_current, lookup_replace):
    """Translate ids through two chained lookup tables.

    Used to replace TransE IDs with our IDs because we use a different
    mapping. Each id is converted to a string and looked up in
    ``lookup_current``; the value found there is then looked up in
    ``lookup_replace``.

    Args:
        ids: Iterable of ids to translate.
        lookup_current: Mapping from the stringified id to an intermediate key.
        lookup_replace: Mapping from that intermediate key to the new id.

    Returns:
        A list with the translated id for every input id, in input order.

    Raises:
        KeyError: If an id or its intermediate key is missing from a lookup.
    """
    return [lookup_replace[lookup_current[str(current_id)]] for current_id in ids]

In [3]:
# Name of the dataset; it is spliced into the '../data/<dataset>/...' paths
# that the loading cells of this notebook read from.
dataset = 'fb15k'

In [4]:
# Read the three lookup files of the selected dataset. Each call yields one
# list of whitespace-separated string tokens per line of the file.
entity_mapping = load_file_data('../data/'+dataset+'/entity_mapping.txt')
rel_mapping = load_file_data('../data/'+dataset+'/relation_mapping.txt')
sme_mapping = load_file_data('../data/'+dataset+'/id2entity.sme.txt')

In [5]:
# Turn the two-token rows into lookups keyed by the first token of each row.
# Entities and relations share one dictionary; `sme_mapping` is rebound from
# its row list to the equivalent dictionary.
our_mapping = {key: value for key, value in entity_mapping + rel_mapping}
sme_mapping = {key: value for key, value in sme_mapping}

In [6]:
# Read the three triple splits as lists of string tokens.
train = load_file_data('../data/'+dataset+'/train.txt')
test = load_file_data('../data/'+dataset+'/test.txt')
valid = load_file_data('../data/'+dataset+'/valid.txt')

# Stack all splits into one array so that later statistics are computed over
# every known triple rather than over the training split only.
triples = np.concatenate([train, test, valid])

# TransE embedding matrix parsed as float64, one row per line of the file.
embeddings = load_embedding('../data/'+dataset+'/embeddings.transe.txt')

In [7]:
# Translate every embedding row number into our id (row number -> SME key ->
# our id) and shape the result as a single column.
new_ids = np.array(
    convert_ids(np.arange(0, len(embeddings)), sme_mapping, our_mapping)
).reshape((-1, 1))

# Column of ones with one row per entity, followed by zeros with one row per
# relation.
ent_flag = np.concatenate(
    [
        np.ones((len(entity_mapping), 1)),
        np.zeros((len(rel_mapping), 1)),
    ]
)

In [8]:
# Prepend two columns to the embedding matrix: our ids (column 0) and a flag
# (column 1) telling whether the row is an entity or a relation embedding.
# NOTE(review): this rebinds `embeddings` to the widened array, so running the
# cell a second time would prepend the two columns again -- reload the raw
# embeddings before re-running it.
embeddings = np.concatenate([new_ids, ent_flag, embeddings], axis=1)

In [9]:
def get_embedding(embedding_frame, idx, entity = 1):
    """Return the embedding vector stored for one id.

    Args:
        embedding_frame: DataFrame whose column 0 holds the id, column 1 the
            entity flag, and every column from 2 onwards one embedding
            component.
        idx: Id to look up in column 0.
        entity: Flag value the row must carry in column 1 (default 1).

    Returns:
        The embedding components (columns 2 and up) of the first matching
        row, as a plain list.

    Raises:
        IndexError: If no row matches both ``idx`` and ``entity``.
    """
    # Build one mask over the full frame. Filtering twice in a row applied a
    # full-length mask to an already filtered frame, which pandas has to
    # reindex and reports with a UserWarning.
    matches = (embedding_frame[0] == idx) & (embedding_frame[1] == entity)
    return embedding_frame[matches].values.tolist()[0][2:]

In [10]:
def x_y_split(data, embeddings):
    """Build regression inputs and targets from triples and embeddings.

    Args:
        data: Rows of three integer-convertible values, read here as
            (subject, object, relation).
        embeddings: 2-D array-like, convertible to float64, whose column 0
            holds the id; columns 2 and up are used as features.

    Returns:
        A tuple ``(x, y)``. ``x`` is a DataFrame with the feature columns of
        every embedding row whose id occurs as a subject, ordered by id and
        with a fresh 0..n-1 index. ``y`` is a Series, indexed by subject id in
        ascending order, with the number of triples of each subject.

    Side effects:
        Prints the shape of the embedding frame, the shape of its unique ids
        and the shape of the unique subjects.
    """
    embedding_frame = pd.DataFrame(embeddings, dtype='float64')
    feature_columns = embedding_frame.columns.values[2:]
    # Ids arrive as floats; cast them so they compare as integers.
    embedding_frame[0] = embedding_frame[0].astype('int64')

    triple_frame = pd.DataFrame(data=data, columns=['subject', 'object', 'relation'], dtype=np.int32)
    subject_ids = triple_frame['subject'].unique()

    # Target: number of triples per subject, ordered by subject id.
    y = triple_frame.groupby('subject').count()['relation'].sort_index()

    # Features: embedding rows of those subjects, in the same id order.
    is_subject = embedding_frame[0].isin(subject_ids)
    x = (
        embedding_frame[is_subject]
        .sort_values(by=0)
        .reset_index(drop=True)[feature_columns]
    )

    print(embedding_frame.shape)
    print(embedding_frame[0].unique().shape)
    print(subject_ids.shape)

    return x, y

In [11]:
# TOO SLOW! DON'T USE
# def x_y_split_with_rel(triples, embeddings, entities, relations):
#     data = []
#     embedding_size = embeddings.shape[1] - 1
    
#     embedding_frame = pd.DataFrame(embeddings, dtype='float64')
    
#     data_frame = pd.DataFrame(data=triples, columns=['subject', 'object', 'relation'], dtype=np.int32)
#     counts = data_frame.groupby(['subject', 'relation']).count()

#     known_values = {}
#     for k, v in counts.iterrows():
#         known_values[k] = v['object']

#     for e in entities:
#         for r in relations:
#             if (e, r) in known_values:
#                 val = known_values[(e, r)]
#                 data.append(get_embedding(embedding_frame, e, 1) + get_embedding(embedding_frame, r, 0) + [val])

#     return np.array(data)[:, :2*embedding_size], np.array(data)[:, 2*embedding_size:]

In [12]:
# Integer ids taken from the second token of every mapping row.
entities = np.array(entity_mapping)[:, 1].astype('int32')
relations = np.array(rel_mapping)[:, 1].astype('int32')

# Build the regression data from the first len(entity_mapping) embedding rows.
# NOTE(review): this assumes the entity embeddings come first in `embeddings`
# and the relation embeddings after them -- confirm against the file layout.
x, y = x_y_split(triples, embeddings[:len(entity_mapping)])

(14951, 52)
(14951,)
(14866,)


In [13]:
# Sanity check: features and targets should have the same number of rows.
x.shape, y.shape

((14866, 50), (14866,))

In [14]:
# Display the feature frame together with the target series.
# NOTE(review): this shows both objects in full; x.head() / y.head() would keep
# the cell output short.
x, y

(             2         3         4         5         6         7         8   \
 0      0.147358  0.010975  0.064803 -0.194929  0.004661  0.018221 -0.019556   
 1     -0.241134  0.118068  0.108729 -0.221990  0.162418 -0.103098 -0.092104   
 2     -0.107155  0.281086  0.149617  0.026729 -0.095332  0.103788  0.218630   
 3     -0.015285  0.137056  0.060449 -0.071384 -0.052484  0.195922  0.123820   
 4      0.123972  0.044149  0.039753 -0.064268  0.181411 -0.010794  0.087110   
 5     -0.063593 -0.235404 -0.034450 -0.039105 -0.119980  0.105660  0.096738   
 6      0.222129 -0.080329 -0.179983 -0.069389 -0.186495  0.014506 -0.052712   
 7      0.205835  0.020896 -0.191479  0.010333 -0.023919 -0.066030 -0.054699   
 8     -0.096553  0.019324  0.020844  0.070537  0.197758 -0.077200 -0.100238   
 9      0.097955  0.302505 -0.094704  0.079591 -0.060834 -0.088358 -0.085544   
 10     0.104754  0.266375 -0.056500  0.099703 -0.096502 -0.091296 -0.084148   
 11    -0.014008  0.188339 -0.102312  0.

In [15]:
# Hold out 30% of the rows for evaluation; random_state=0 makes the split
# reproducible.
# NOTE(review): train_test_split is imported from sklearn.cross_validation in
# the first cell; that module was removed in scikit-learn 0.20 in favour of
# sklearn.model_selection -- confirm which version this notebook targets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [19]:
# Candidate models, each paired with the name printed next to its score.
# Entries that are commented out are not evaluated.
regressors = [
   # ('RandomForest', RandomForestRegressor(20, max_depth=2, random_state=0)),
    ('GradientBoost', GradientBoostingRegressor(learning_rate=0.05, max_depth=4, random_state=0)),
    ('LinearRegression', LinearRegression()),
    ('BayesianRidge', BayesianRidge()),
    ('HuberRegressor', HuberRegressor()),
    #('KernelRidge', KernelRidge(random_state=0))
]

# Fit every model on the training split and report its mean squared error on
# the held-out split (lower is better).
for name, regr in regressors:
    regr.fit(x_train, y_train)
    print(name, sklearn.metrics.mean_squared_error(y_test, regr.predict(x_test)))

GradientBoost 6663.470960131599
LinearRegression 6901.345236716377
BayesianRidge 6919.223523152836
HuberRegressor 7500.31012325247
