In [1]:
from elasticsearch import Elasticsearch
import json
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from util.es import ES
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy, evaluate, get_type_path

In [2]:
class PointWiseLTRModel(object):
    """Point-wise learning-to-rank wrapper around a scikit-learn style regressor.

    The wrapped regressor is fitted on (feature vector, relevance label)
    pairs; candidates are then ranked by their predicted relevance.
    """

    def __init__(self, regressor):
        """
        Arguments:
            regressor: An instance of scikit-learn regressor.
        """
        self.regressor = regressor
        # Populated by _train(). Initialised here so that calling rank() on an
        # untrained model trips its assertion instead of raising AttributeError.
        self.model = None

    def _train(self, X, y):
        """Trains an LTR model.
        
        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.
        
        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids, aligned with `ft`.
        Returns:
            List of tuples, each consisting of document ID and predicted
            relevance label, ordered by decreasing predicted relevance.
            An empty list is returned when there are no documents to rank.
        Raises:
            AssertionError: If the model has not been trained yet.
            ValueError: If `ft` and `doc_ids` differ in length.
        """
        assert self.model is not None
        if len(ft) != len(doc_ids):
            # A mismatch would otherwise silently drop documents or fail with
            # an IndexError far from the actual cause.
            raise ValueError(
                "ft and doc_ids must have the same length "
                "(got %d and %d)" % (len(ft), len(doc_ids))
            )
        if len(doc_ids) == 0:
            # Nothing to score; avoid handing an empty batch to the regressor.
            return []

        rel_labels = self.model.predict(ft)
        # argsort is ascending, so reverse it to get the highest score first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

In [3]:
# Load the training queries and keep only those in the 'resource' category.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open('./data/train_set_fixed.json', 'r', encoding='utf-8') as f:
    train_set = [q for q in json.load(f) if q['category'] == 'resource']

In [4]:
def extract_features():
    """Placeholder feature extractor; not currently in use.

    Returns:
        A single-element list containing 0.
    """
    placeholder = [0]
    return placeholder

In [5]:
# Baseline scores for the 'validation' split, one result per ES helper
# ('EC' and 'TC').
# NOTE(review): prepare_ltr_training_data() and get_rankings() build their own
# local res_ec / res_tc, so these module-level results are not read by any
# code visible in this notebook -- confirm whether this cell is still needed.
res_ec = ES('EC').generate_baseline_scores('validation')
res_tc = ES('TC').generate_baseline_scores('validation')

In [6]:
def prepare_ltr_training_data(queries, dataset='train'):
    """Builds point-wise LTR training instances from the baseline scores.

    For every query, the candidate types are the union of the types scored by
    the 'EC' baseline, the types scored by the 'TC' baseline and the query's
    own ground-truth types. Each candidate yields one instance whose features
    are its EC and TC scores (0 when a baseline did not score it) and whose
    label is 1 if the candidate is a ground-truth type of the query, else 0.

    Arguments:
        queries: Iterable of query dicts with the keys 'id' and 'type'.
        dataset: Name of the split passed to `generate_baseline_scores`.
            Defaults to 'train'.
    Returns:
        Tuple (X, y): list of [ec_score, tc_score] feature vectors and the
        list of matching 0/1 labels.
    """
    X, y = [], []

    res_ec = ES('EC').generate_baseline_scores(dataset)
    res_tc = ES('TC').generate_baseline_scores(dataset)

    for query in queries:
        types_ec = dict(res_ec.get(query['id'], []))
        types_tc = dict(res_tc.get(query['id'], []))
        gold_types = set(query['type'])

        # De-duplicate while keeping first-seen order. Iterating a plain set
        # of strings would make the row order depend on hash randomisation
        # and thus differ between kernel runs.
        candidates = dict.fromkeys([*types_ec, *types_tc, *query['type']])

        for ID in candidates:
            # ec, tc
            X.append([types_ec.get(ID, 0), types_tc.get(ID, 0)])
            y.append(1 if ID in gold_types else 0)

    return X, y

In [7]:
X_train, y_train = prepare_ltr_training_data(train_set)

In [8]:
# Instantiate an scikit-learn regression model, `clf`.
clf = RandomForestRegressor(max_depth=3, n_estimators=100)

# Instantiate PointWiseLTRModel.
ltr = PointWiseLTRModel(clf)
ltr._train(X_train, y_train)

In [9]:
def get_rankings(ltr, queries, dataset='train'):
    """Ranks the candidate types of every query with a trained LTR model.

    The candidates of a query are the types scored for it by the 'EC' or the
    'TC' baseline; each candidate is described by its [ec_score, tc_score]
    pair (0 when a baseline did not score it). Queries without any candidate
    are left out of the result.

    Arguments:
        ltr: Object exposing `rank(features, doc_ids)`.
        queries: Iterable of query dicts with the key 'id'.
        dataset: Name of the split passed to `generate_baseline_scores`.
    Returns:
        Dict mapping query id to whatever `ltr.rank` returns for that query.
    """
    test_rankings = {}

    res_ec = ES('EC').generate_baseline_scores(dataset)
    res_tc = ES('TC').generate_baseline_scores(dataset)

    for query in queries:
        types_ec = dict(res_ec.get(query['id'], []))
        types_tc = dict(res_tc.get(query['id'], []))

        # De-duplicate while keeping first-seen order. Going through a set
        # would make the candidate order -- and so the order of candidates
        # with tied predictions -- vary with hash randomisation between runs.
        types = list(dict.fromkeys([*types_ec, *types_tc]))
        if not types:
            continue

        features = [[types_ec.get(ID, 0), types_tc.get(ID, 0)] for ID in types]
        test_rankings[query['id']] = ltr.rank(features, types)
    return test_rankings

In [10]:
def get_ground_truth(dataset, type_hierarchy):
    """Collects the gold types of the 'resource' queries in `dataset`.

    Arguments:
        dataset: Iterable of query dicts with the keys 'id', 'category' and
            'type'.
        type_hierarchy: Container supporting `in`; gold types that are not in
            it are discarded.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]}.
        Queries of another category, or with no gold type left after
        filtering, are omitted.
    """
    gold = {}
    for entry in dataset:
        query_id, category = entry['id'], entry['category']
        if category == 'resource':
            known_types = list(filter(lambda t: t in type_hierarchy, entry['type']))
            if known_types:
                gold[query_id] = {'category': 'resource', 'type': known_types}
    return gold

In [11]:
def format_outputs(dataset, type_hierarchy, ltr_model=None, dataset_name='train'):
    """Builds the system output for the queries in `dataset`.

    Only 'resource' queries with at least one ground-truth type present in
    `type_hierarchy` get an entry, which is the same selection that
    `get_ground_truth` applies.

    Arguments:
        dataset: Iterable of query dicts with the keys 'id', 'category' and
            'type'.
        type_hierarchy: Container supporting `in`; also passed on to
            `get_type_path`.
        ltr_model: Model handed to `get_rankings`. Defaults to the
            notebook-level `ltr`.
        dataset_name: Split name forwarded to `get_rankings`, so the baseline
            scores come from the split being evaluated. Defaults to 'train'.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]}.
        'type' is the result of `get_type_path` for the first ranked type
        that is in `type_hierarchy`, or [] when the query has no such type.
    """
    ranker = ltr if ltr_model is None else ltr_model
    results = get_rankings(ranker, dataset, dataset=dataset_name)

    system_output = {}
    for query in dataset:
        ID = query['id']
        if query['category'] != 'resource':
            continue

        # Test membership itself rather than the truthiness of the type names.
        if not any(t in type_hierarchy for t in query['type']):
            continue

        ranked_known = [t for t, _ in results.get(ID, []) if t in type_hierarchy]
        # Only the first ranked known type is expanded.
        # NOTE(review): presumably get_type_path returns that type's path in
        # the hierarchy -- confirm against smart_dataset's evaluate module.
        type_path = get_type_path(ranked_known[0], type_hierarchy) if ranked_known else []

        system_output[ID] = {
            'category': 'resource',
            'type': type_path
        }

    return system_output

In [12]:
# Load the DBpedia type hierarchy together with its maximum depth; both values
# are passed to evaluate() in the evaluation cell.
type_hierarchy, max_depth = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [13]:
# Set which dataset to use for evaluation
# NOTE(review): this is the same train_set the model was fitted on, so the
# scores printed by this cell are training-set scores, not a held-out estimate.
# NOTE(review): format_outputs() calls get_rankings() with its default
# dataset='train'; pointing eval_dataset at another split would still rank
# with the baseline scores generated for 'train'.
eval_dataset = train_set

# Evaluate
# Gold types of the 'resource' queries, restricted to types in the hierarchy.
ground_truth = get_ground_truth(eval_dataset, type_hierarchy)
# System predictions for the same queries, produced with the notebook-level ltr.
system_output = format_outputs(eval_dataset, type_hierarchy)
# Left as the last expression of the cell so that any return value is displayed.
evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 7662 questions)
  Accuracy: 1.000
Type ranking (based on 7662 questions)
  NDCG@5:  0.245
  NDCG@10: 0.208
