In [25]:
from sklearn.ensemble import *
import sklearn.metrics
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np



In [2]:
def load_file_data(path):
    """Read a whitespace-delimited text file into a NumPy array of string tokens.

    Every non-blank line is split on whitespace and becomes one row of the
    result. Tokens are not converted; they stay as strings.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array built from the per-line token lists (2-D when every line has
        the same number of tokens).
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open.
    with open(path, 'r') as file:
        # Blank lines (e.g. a trailing empty line) yield no tokens and are
        # skipped, otherwise they would add an empty row and make the array ragged.
        rows = [tokens for tokens in (line.split() for line in file) if tokens]
    return np.array(rows)

def load_embedding(path, skipFirst = True):
    """Read a whitespace-delimited numeric text file into a float64 array.

    Every non-blank line is split on whitespace and becomes one row; the
    collected rows are then converted to ``float64``.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    skipFirst : bool, optional
        When true (the default) the first line of the file is discarded
        before any rows are collected.
        NOTE(review): presumably this is a header line - confirm against the
        embedding file format.

    Returns
    -------
    numpy.ndarray
        ``float64`` array with one row per retained line.

    Raises
    ------
    ValueError
        If a token cannot be parsed as a float (raised by the final cast).
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open.
    with open(path, 'r') as file:
        if skipFirst:
            # The default argument keeps an empty file from raising StopIteration.
            next(file, None)
        # Blank lines yield no tokens and are skipped so the rows stay
        # rectangular and the float conversion below does not fail on them.
        rows = [tokens for tokens in (line.split() for line in file) if tokens]
    return np.array(rows).astype('float64')

In [3]:
# Name of the sub-directory under data/ that holds every input file.
dataset = 'fb15k'

# Embedding table for the chosen dataset.
embeddings = load_embedding('data/'+dataset+'/embedding.txt')

# The three splits, each loaded as an array of string tokens.
train = load_file_data('data/'+dataset+'/train.txt')
valid = load_file_data('data/'+dataset+'/valid.txt')
test = load_file_data('data/'+dataset+'/test.txt')

In [4]:
def x_y_split(data, embeddings):
    """Derive a feature table and a per-subject count target.

    Parameters
    ----------
    data : array-like
        Three-column rows, labelled 'subject', 'object' and 'relation' in
        that order and cast to int32.
    embeddings : array-like
        Numeric table whose column 0 is treated as an integer id and whose
        remaining columns are used as features.

    Returns
    -------
    x : pandas.DataFrame
        Feature columns of the embedding rows whose id occurs as a subject,
        ordered by ascending id and re-indexed from 0.
    y : pandas.Series
        Number of non-null 'relation' entries per subject, indexed by
        subject in ascending order.

    Notes
    -----
    ``x`` and ``y`` are matched by position only.
    NOTE(review): this assumes every subject has exactly one embedding row -
    confirm against the data.
    """
    emb = pd.DataFrame(embeddings, dtype='float64')
    # Everything after the id column is a feature.
    feature_cols = emb.columns.values[1:]
    emb[0] = emb[0].astype('int64')

    triples = pd.DataFrame(data=data, columns=['subject', 'object', 'relation'], dtype=np.int32)

    # Target: how many rows each subject appears in.
    per_subject = triples.groupby('subject').count()['relation']
    y = per_subject.sort_index()

    # Features: keep only ids seen as a subject, in ascending id order.
    subject_ids = triples['subject'].unique()
    selected = emb[emb[0].isin(subject_ids)]
    ordered = selected.sort_values(by=0).reset_index(drop=True)
    x = ordered[feature_cols]

    return x, y

In [33]:
# Pool all three splits, then build the feature table and count target from them.
x, y = x_y_split(
    np.concatenate((train, test, valid)),
    embeddings,
)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [34]:
# Ensemble regressors to compare, each paired with the label printed next to its score.
regressors = [
    ('RandomForest', RandomForestRegressor(n_estimators=20, max_depth=2, random_state=0)),
    ('AdaBoost', AdaBoostRegressor(random_state=0)),
    ('GradientBoost', GradientBoostingRegressor(random_state=0)),
]

# Fit each model on the training rows and report its mean squared error on the held-out rows.
for name, regr in regressors:
    regr.fit(x_train, y_train)
    predictions = regr.predict(x_test)
    print(name, sklearn.metrics.mean_squared_error(y_test, predictions))

RandomForest 7449.279456740203
AdaBoost 9966.722316849271
GradientBoost 7150.9816642556425
