In [1]:
from elasticsearch import Elasticsearch
import json
import itertools
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from util.es import ES
from util.parse_dbpedia import get_type_weights
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy, evaluate, get_type_path

In [2]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model built on top of a regressor.

    The wrapped regressor is fitted on per-candidate feature vectors and its
    predictions are used as relevance scores to order the candidates.
    """

    def __init__(self, regressor):
        """
        Arguments:
            regressor: An instance of scikit-learn regressor.
        """
        self.regressor = regressor
        # Populated by _train(). Initialised here so that calling rank() on an
        # untrained model trips the assertion instead of raising AttributeError.
        self.model = None

    def _train(self, X, y):
        """Trains an LTR model.
        
        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.
        
        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids.
        Returns:
            List of tuples, each consisting of document ID and predicted relevance label,
            ordered by decreasing predicted label. Empty when `ft` is empty.
        """
        assert self.model is not None
        # Nothing to score: return early rather than handing an empty batch
        # to the regressor.
        if len(ft) == 0:
            return []

        rel_labels = self.model.predict(ft)
        # argsort is ascending, so reverse it to put the highest score first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

In [3]:
def get_baseline(dataset='train'):
    """Collects the baseline scores of every index/similarity combination.

    One ES helper is created per combination, in the order
    EC/BM25, EC/LM, TC/BM25, TC/LM.

    Arguments:
        dataset: Value forwarded to `generate_baseline_scores`.
    Returns:
        List with one dict per combination. Each dict maps a key of the
        helper's result to a dict built from that key's pairs.
    """
    per_combination = []
    for model, similarity in itertools.product(['EC', 'TC'], ['BM25', 'LM']):
        raw_scores = ES(model, similarity).generate_baseline_scores(dataset)
        # Turn each list of pairs into a dict for direct lookup.
        per_combination.append({
            key: {item: score for item, score in pairs}
            for key, pairs in raw_scores.items()
        })
    return per_combination

In [4]:
# Baseline scores per dataset name. Only 'train' is loaded here; get_rankings()
# adds any other dataset name the first time it is asked for it.
BASELINE = {'train':get_baseline()}

In [5]:
# Per-type values from util.parse_dbpedia.get_type_weights; extract_features()
# reads ENTITIES[t] as a single feature.
ENTITIES = get_type_weights()

In [6]:
# Type hierarchy and its maximum depth; both are passed to evaluate() in the
# evaluation cell, and the hierarchy is also used to filter types.
type_hierarchy, max_depth = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [7]:
# Training queries, keeping only those whose category is 'resource'.
with open('./data/train_set_fixed.json', 'r') as f:
    train_set = [query for query in json.load(f) if query['category'] == 'resource']

In [8]:
# Validation queries, keeping only those whose category is 'resource'.
with open('./data/validation_set_fixed.json', 'r') as f:
    validation_set = [query for query in json.load(f) if query['category'] == 'resource']

In [9]:
# Test queries, keeping only those whose category is 'resource'.
with open('./data/test_set_fixed.json', 'r') as f:
    test_set = [query for query in json.load(f) if query['category'] == 'resource']

In [10]:
# Load the type family features. extract_features() looks an entry up by
# t[4:] and reads its 'depth', 'siblings' and 'children' fields.
with open('./data/type_hierarchy_features.json', 'r') as f:
    FAMILY = json.load(f)

In [11]:
# Load the type length features. extract_features() reads
# T_LENGTH[t[4:]]['X']["doc_length_body"].
# NOTE(review): in the saved run this cell raised FileNotFoundError and the
# cells after it carry no execution count, so this file has to exist before
# the notebook can run from top to bottom.
with open('./data/type_length_features.json', 'r') as f:
    T_LENGTH = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: './data/type_length_features.json'

In [None]:
# Load the type label IDF features. extract_features() appends every value of
# T_LABEL[t[4:]]['X'] and pads with four zeros when the type is missing.
# Presumably each entry holds exactly four values - TODO confirm, otherwise
# feature vectors would differ in length.
with open('./data/type_label_idf_features.json', 'r') as f:
    T_LABEL = json.load(f)


In [None]:
# Load the query/type IDF features. extract_features() appends every value of
# Q_T_IDF[t[4:]] and pads with four zeros when the type is missing.
# Presumably each entry holds exactly four values - TODO confirm.
with open('./data/type_query_idf_features.json', 'r') as f:
    Q_T_IDF = json.load(f)

In [None]:
def extract_features(qid, t, dataset = 'train'):
    """Builds the feature vector used by the LTR model for one query/type pair.

    Arguments:
        qid: Query id used to look up the baseline scores.
        t: Type identifier; the feature tables are keyed by `t[4:]`.
        dataset: Key of BASELINE whose scores are used.
    Returns:
        List of features in this order: one baseline score per entry of
        BASELINE[dataset] (EC_BM, EC_LM, TC_BM, TC_LM), the ENTITIES value,
        three type family features, the type length, the label IDF values
        and the query/type IDF values.
    """
    # Key shared by all feature tables: the type without its first four characters.
    name = t[4:]

    # Baseline scores; 0 when the query or the type has no score.
    features = [scores.get(qid, {}).get(t, 0) for scores in BASELINE[dataset]]

    # ENTITIES value of the full type identifier.
    features.append(ENTITIES[t])

    # Type family: depth, number of siblings, number of children.
    if name in FAMILY:
        family = FAMILY[name]
        features.extend([family['depth'], len(family['siblings']), len(family['children'])])
    else:
        features.extend([0, 0, 0])

    # Type length.
    features.append(T_LENGTH[name]['X']["doc_length_body"] if name in T_LENGTH else 0)

    # Label IDF values, padded with four zeros for unknown types.
    if name in T_LABEL:
        features.extend(T_LABEL[name]['X'].values())
    else:
        features.extend([0] * 4)

    # Query/type IDF values, padded with four zeros for unknown types.
    if name in Q_T_IDF:
        features.extend(Q_T_IDF[name].values())
    else:
        features.extend([0] * 4)

    return features

In [None]:
def prepare_ltr_training_data(queries, dataset='train'):
    """Builds the feature matrix and label vector for a list of queries.

    For every query the candidate types are the query's own types plus every
    type that any baseline in BASELINE[dataset] scored for that query. Each
    candidate becomes one instance, labelled 1 when it is one of the query's
    own types and 0 otherwise.

    Arguments:
        queries: List of query dicts with 'id' and 'type' keys.
        dataset: Key of BASELINE to take candidates and baseline features
            from. It is loaded with get_baseline() on first use. Defaults to
            'train', which is what this function always used before the
            parameter existed.
    Returns:
        Tuple (X, y) of numpy arrays: features and labels, one row per
        query/candidate pair.
    """
    if dataset not in BASELINE:
        BASELINE[dataset] = get_baseline(dataset)

    X, y = [], []
    for query in queries:
        qid = query['id']
        gold = set(query['type'])
        # Iterating a score dict yields its keys, i.e. the scored types.
        retrieved = itertools.chain.from_iterable(
            es.get(qid, {}) for es in BASELINE[dataset])
        # Sorted so the row order does not depend on set iteration order.
        for t in sorted(gold.union(retrieved)):
            X.append(extract_features(qid, t, dataset))
            y.append(1 if t in gold else 0)
    return np.array(X), np.array(y)

In [None]:
# Build the training instances, then show how many there are and how many of
# them are positive (labels are 0/1, so the sum counts the positives).
X_train, y_train = prepare_ltr_training_data(train_set)
len(y_train), sum(y_train)

In [None]:
# Build the validation instances and show their total and positive counts.
# NOTE(review): this call does not name a dataset, so candidates and baseline
# features come from BASELINE['train'] - confirm that is intended for the
# validation queries.
X_validation, y_validation = prepare_ltr_training_data(validation_set)
len(y_validation), sum(y_validation)

In [None]:
# Build the test instances and show their total and positive counts.
# NOTE(review): this call does not name a dataset, so candidates and baseline
# features come from BASELINE['train'] - confirm that is intended for the
# test queries.
X_test, y_test = prepare_ltr_training_data(test_set)
len(y_test), sum(y_test)

In [None]:
# Instantiate a scikit-learn regression model, `clf`.
# random_state is fixed so that re-running the notebook fits the same forest
# and therefore produces the same rankings and evaluation scores.
clf = RandomForestRegressor(max_depth=3, n_estimators=100, random_state=42)

# Instantiate PointWiseLTRModel and fit it on the training instances.
ltr = PointWiseLTRModel(clf)
ltr._train(X_train, y_train)

In [None]:
def get_rankings(ltr, queries, dataset='train'):
    """Ranks the candidate types of every query with a trained LTR model.

    Arguments:
        ltr: Object with a `rank(features, doc_ids)` method.
        queries: List of query dicts with 'id' and 'type' keys.
        dataset: Key of BASELINE to take candidates and baseline features
            from. It is loaded with get_baseline() on first use.
    Returns:
        Dict mapping each query id to the result of `ltr.rank` for that
        query's candidates, or to an empty list when it has no candidates.
    """
    if dataset not in BASELINE:
        BASELINE[dataset] = get_baseline(dataset)

    test_rankings = {}
    for query in queries:
        qid = query['id']
        # Iterating a score dict yields its keys, i.e. the scored types.
        retrieved = itertools.chain.from_iterable(
            es.get(qid, {}) for es in BASELINE[dataset])
        # NOTE(review): the query's own types are added to the candidate pool,
        # so they can always be ranked - confirm this is intended when the
        # rankings are used for evaluation.
        # Sorted so the candidate order does not depend on set iteration order.
        types = sorted(set(query['type']).union(retrieved))
        if not types:
            # No candidates: skip the model rather than scoring an empty batch.
            test_rankings[qid] = []
            continue

        features = [extract_features(qid, t, dataset) for t in types]
        test_rankings[qid] = ltr.rank(features, types)

    return test_rankings

In [None]:
def get_ground_truth(dataset, type_hierarchy):
    """Collects the ground-truth types of the 'resource' queries.

    Arguments:
        dataset: List of query dicts with 'id', 'category' and 'type' keys.
        type_hierarchy: Container of known types, used for membership tests.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]},
        where the list holds the query's types found in `type_hierarchy`.
        Queries of another category, or with no such type, are left out.
    """
    ground_truth = {}
    for entry in dataset:
        qid, category = entry['id'], entry['category']
        if category == 'resource':
            known_types = [t for t in entry['type'] if t in type_hierarchy]
            # Only keep queries with at least one type known to the hierarchy.
            if known_types:
                ground_truth[qid] = {'category': 'resource', 'type': known_types}

    return ground_truth

In [None]:
def format_outputs(dataset, type_hierarchy, split='train', model=None):
    """Ranks the queries and formats the rankings as system output.

    Arguments:
        dataset: List of query dicts with 'id', 'category' and 'type' keys.
        type_hierarchy: Container of known types, used for membership tests.
        split: Dataset name forwarded to get_rankings(), selecting which
            baseline scores are used. Defaults to 'train', which is what this
            function always used before the parameter existed.
        model: LTR model forwarded to get_rankings(). Defaults to the
            notebook-level `ltr`.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]},
        where the list holds the ranked types found in `type_hierarchy`, in
        ranking order. Queries of another category, or without any
        ground-truth type in `type_hierarchy`, are left out, matching
        get_ground_truth().
    """
    if model is None:
        model = ltr
    results = get_rankings(model, dataset, split)
    system_output = {}

    for query in dataset:
        ID = query['id']
        if query['category'] != 'resource':
            continue

        # Same filter as get_ground_truth(): skip queries with no known gold type.
        if not any(t in type_hierarchy for t in query['type']):
            continue

        # A query without a ranking gets an empty type list rather than None.
        system_output[ID] = {
            'category': 'resource',
            'type': [t for t, _ in results.get(ID, []) if t in type_hierarchy]
        }

    return system_output

In [None]:
# Load features as a dictionary.
def load_features(filename):
    """Reads a JSON feature file.

    Arguments:
        filename: Path of the JSON file.
    Returns:
        The decoded JSON content, or None when the file cannot be opened or
        read (OSError) or its content cannot be decoded (ValueError, which
        covers json.JSONDecodeError).
    """
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    # Only the expected I/O and parsing failures map to None; anything else
    # (KeyboardInterrupt, a wrong argument type, ...) propagates.
    except (OSError, ValueError):
        return None

In [None]:
# Set which dataset to use for evaluation
# Set which dataset to use for evaluation
# NOTE(review): format_outputs() is called below without naming a dataset, so
# the rankings are built from BASELINE['train'] - confirm before pointing
# eval_dataset at the validation or test queries.
eval_dataset = train_set

# Evaluate: compare the ranked types against the ground-truth types
ground_truth = get_ground_truth(eval_dataset, type_hierarchy)
system_output = format_outputs(eval_dataset, type_hierarchy)
evaluate(system_output, ground_truth, type_hierarchy, max_depth)

In [None]:
# Set which dataset to use for evaluation
#eval_dataset = train_set

# Evaluate
#ground_truth = get_ground_truth(eval_dataset, type_hierarchy)
#system_output = format_outputs(eval_dataset, type_hierarchy)
#evaluate(system_output, ground_truth, type_hierarchy, max_depth)