In [1]:
from elasticsearch import Elasticsearch
import json, os, pickle
import itertools
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from util.es import ES
from util.io import load_dict_from_json
from util.parse_dbpedia import get_type_weights
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy, evaluate, get_type_path

In [2]:
class PointWiseLTRModel(object):
    """Point-wise learning-to-rank wrapper around a scikit-learn style regressor.

    The wrapped regressor is fitted on (feature vector, relevance label) pairs
    and is then used to score and sort the candidate documents of a query.
    """

    def __init__(self, regressor):
        """
        Arguments:
            regressor: An instance of scikit-learn regressor.
        """
        self.regressor = regressor
        # Set by _train(). Initialised here so that rank() on an untrained
        # model trips its assertion instead of raising AttributeError.
        self.model = None

    def _train(self, X, y):
        """Trains an LTR model.
        
        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.regressor is not None
        # Whatever fit() returns is kept as the model used by rank().
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.
        
        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids, aligned with `ft`.
        Returns:
            List of tuples, each consisting of document ID and predicted
            relevance label, sorted by descending predicted label. An empty
            list is returned when there are no documents to rank.
        """
        assert self.model is not None
        # Nothing to score: skip predict(), since scikit-learn estimators
        # generally reject an empty feature matrix.
        if len(doc_ids) == 0:
            return []

        rel_labels = self.model.predict(ft)
        # argsort is ascending; reverse it to get highest score first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

In [3]:
def get_baseline(dataset='train'):
    """Collects the baseline score lookups produced by ES for one dataset split.

    For each similarity ('BM25' first, then 'LM') the returned list holds five
    'EC' lookups (one per cutoff n = 5, 10, 20, 50, 100) followed by a single
    'TC' lookup, i.e. twelve lookups in total.

    Arguments:
        dataset: Split name forwarded to ES.generate_baseline_scores.
    Returns:
        List of dicts. Each dict maps an outer key to an inner dict built from
        the (key, value) pairs that ES returned for that outer key.
    """
    def _nest(scores):
        # {outer: iterable of (k, v) pairs}  ->  {outer: {k: v}}
        return {outer: {inner: value for inner, value in pairs}
                for outer, pairs in scores.items()}

    cutoffs = (5, 10, 20, 50, 100)
    lookups = []
    for similarity in ('BM25', 'LM'):
        for cutoff in cutoffs:
            ec_scores = ES('EC', similarity).generate_baseline_scores(dataset, cutoff)
            lookups.append(_nest(ec_scores))
        tc_scores = ES('TC', similarity).generate_baseline_scores(dataset)
        lookups.append(_nest(tc_scores))
    return lookups

In [4]:
# Baseline score lookups keyed by dataset split. Only 'train' is loaded here;
# get_rankings adds other splits to this dict the first time they are requested.
BASELINE = {'train':get_baseline()}

In [5]:
# Per-type weights; extract_features reads ENTITIES.get(t, 0) as one feature.
ENTITIES = get_type_weights()

In [6]:
# Load the type-family features. For each type label found here,
# extract_features reads 'depth', 'siblings' and 'children'.
FAMILY = load_dict_from_json('type_hierarchy_features.json')

In [7]:
# Type-length features; extract_features reads ['X']['doc_length_body'] per type label.
# NOTE(review): unlike the sibling load_dict_from_json calls, this file name has
# no '.json' suffix - confirm the loader resolves it as intended.
T_LENGTH = load_dict_from_json('type_length_features')

In [8]:
# Type-label IDF features; extract_features appends every value stored under ['X'].
# NOTE(review): unlike the sibling load_dict_from_json calls, this file name has
# no '.json' suffix - confirm the loader resolves it as intended.
T_LABEL = load_dict_from_json('type_label_idf_features')

In [9]:
# Query-type IDF features; extract_features appends every value stored per type label.
Q_T_IDF = load_dict_from_json('query_type_idf_features.json')

In [10]:
# Query-type (JTERMS / SIMAGGR) inputs used by get_Q_T_features. This cell must
# run before that function is defined, because qt_features and q_ids are bound
# as its default arguments.
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# a trusted 'data/Q_T_features'.

with open(os.path.join('data','Q_T_features'),'rb') as f:
    qt_features = pickle.load(f)
    
q_ids = load_dict_from_json('q_id_list.json')
q_t_avg = load_dict_from_json('avg_q_t.json')

In [11]:
# Load the DBpedia type hierarchy and its maximum depth; both are passed to
# evaluate() in the evaluation cell.
type_hierarchy, max_depth = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [12]:
def get_Q_T_features(qid,t,lft=qt_features,lqs=q_ids,s='train'):
    '''
    Function for extracting SIMAGGR and JTERMS features for
    query-type pairs.
    Will use average values if type not in the aggregated
    type document.

    Arguments:
        qid: Query id, looked up in lqs[s] to get the query's index.
        t: Type identifier; its first four characters are dropped before the
           lookup in lft (presumably a 'dbo:' prefix - confirm).
        lft: Mapping type label -> split -> {'JTERMS': ..., 'SIMAGGR': ...}.
        lqs: Mapping split -> {query id: index}.
        s: Split name; 'validation' is translated to the key 'val'.
    Returns:
        [found_flag, JTERMS, SIMAGGR], where found_flag is 1 when the type
        label exists in lft and 0 when the q_t_avg fallback values are used.
    '''
    s = 'val' if s == 'validation' else s
    # Use the lqs argument (not the q_ids global) so that a caller-supplied
    # query-id mapping is actually honoured.
    q_idx = lqs[s][qid]
    t_label = t[4:]
    if t_label in lft:
        per_split = lft[t_label][s]
        return [1, per_split['JTERMS'][q_idx], per_split['SIMAGGR'][0][q_idx][0][1]]
    # NOTE(review): the fallback rescales the average SIMAGGR with (x + 1) / 2,
    # while the value above is used as stored - confirm both share one scale.
    return [0, q_t_avg['JTERMS'], (q_t_avg['SIMAGGR'] + 1) / 2]

In [14]:
def extract_features(qid, t, dataset = 'train'):
    """Builds the LTR feature vector for one (query, type) pair.

    Feature order:
        1. one score per lookup in BASELINE[dataset], in list order
           (0 when the query or the type is missing from a lookup);
        2. the ENTITIES weight of the type (0 when missing);
        3. depth, number of siblings and number of children from FAMILY
           (three zeros when the label is missing);
        4. 'doc_length_body' from T_LENGTH (0 when the label is missing);
        5. every value stored under T_LABEL[label]['X']
           (four zeros when the label is missing);
        6. every value stored under Q_T_IDF[label]
           (four zeros when the label is missing);
        7. the three values returned by get_Q_T_features.

    Here `label` is `t` without its first four characters.
    """
    label = t[4:]

    # Baseline retrieval scores, then the type weight.
    vector = [scores.get(qid, {}).get(t, 0) for scores in BASELINE[dataset]]
    vector.append(ENTITIES.get(t, 0))

    # Type family (hierarchy) features.
    if label in FAMILY:
        family = FAMILY[label]
        vector += [family['depth'], len(family['siblings']), len(family['children'])]
    else:
        vector += [0, 0, 0]

    # Type length.
    vector.append(T_LENGTH[label]['X']["doc_length_body"] if label in T_LENGTH else 0)

    # IDF features of the type label.
    vector.extend(T_LABEL[label]['X'].values() if label in T_LABEL else [0] * 4)

    # Query-type IDF features.
    vector.extend(Q_T_IDF[label].values() if label in Q_T_IDF else [0] * 4)

    # Query-type JTERMS / SIMAGGR features.
    vector.extend(get_Q_T_features(qid, t, s=dataset))

    return vector

In [15]:
def prepare_ltr_training_data(queries):
    """Builds point-wise training instances from the training queries.

    For every query the candidate types are the union of its own types and all
    types present for that query in any BASELINE['train'] lookup. Each
    candidate yields one feature vector and a label of 1 when the candidate is
    among the query's own types, otherwise 0.

    Arguments:
        queries: Iterable of query dicts with 'id' and 'type' entries.
    Returns:
        Tuple (X, y) of feature vectors and their 0/1 labels.
    """
    X, y = [], []

    for query in queries:
        gold_types = query['type']
        retrieved = [es.get(query['id'], {}).keys() for es in BASELINE['train']]
        candidates = set(itertools.chain(gold_types, *retrieved))
        for t in candidates:
            X.append(extract_features(query['id'], t))
            y.append(int(t in query['type']))

    return X, y

In [16]:
# Load each split and keep only the queries whose category is 'resource'.
# NOTE: `name` and `dataset` stay defined at notebook level after this loop.
queries = {}
for name in ['train', 'validation', 'test']:
    dataset = load_dict_from_json(f'{name}_set_fixed.json')
    queries[name] = [q for q in dataset if q['category'] == 'resource']

In [17]:
# Build the point-wise training features and labels from the training queries.
X_train, y_train = prepare_ltr_training_data(queries['train'])
# Displayed as (number of training instances, number of positive labels).
len(y_train), sum(y_train)

(783171, 20459)

In [18]:
# Fixed seed so that re-running this cell on the same X_train / y_train fits
# the same forest; without it every run trains a different model.
SEED = 42

# Instantiate a scikit-learn regression model, `clf`.
clf = RandomForestRegressor(max_depth=4, n_estimators=100, random_state=SEED)

# Instantiate PointWiseLTRModel and fit it on the training instances.
ltr = PointWiseLTRModel(clf)
ltr._train(X_train, y_train)

In [19]:
def get_rankings(ltr, queries, dataset='train', include_gold_types=True):
    """Ranks candidate types for every query with a trained LTR model.

    Candidates for a query are all types present for it in any lookup of
    BASELINE[dataset]; the lookups for `dataset` are loaded via get_baseline
    and cached in BASELINE on first use.

    Arguments:
        ltr: Trained model exposing rank(features, doc_ids).
        queries: Iterable of query dicts with an 'id' entry (and a 'type'
            entry when include_gold_types is True).
        dataset: Split name used for the baseline lookups and the features.
        include_gold_types: When True (default, the original behaviour) the
            query's own 'type' entries are added to the candidates.
            NOTE(review): those entries are what get_ground_truth treats as
            the gold labels, so leaving this on while evaluating leaks labels
            into the candidate set; pass False for an unbiased evaluation or
            for queries without a 'type' entry.
    Returns:
        Dict mapping query id to the list of (type, score) tuples returned by
        ltr.rank, or to an empty list when the query has no candidates.
    """
    if dataset not in BASELINE:
        BASELINE[dataset] = get_baseline(dataset)

    rankings = {}
    for query in queries:
        qid = query['id']

        candidates = set()
        if include_gold_types:
            candidates.update(query['type'])
        for scores in BASELINE[dataset]:
            candidates.update(scores.get(qid, {}))

        # Sort so that the candidate order (and thus the order of tied scores)
        # does not depend on set iteration order, which varies between runs.
        types = sorted(candidates)
        if not types:
            # No candidates: avoid calling the model on an empty feature matrix.
            rankings[qid] = []
            continue

        features = [extract_features(qid, t, dataset) for t in types]
        rankings[qid] = ltr.rank(features, types)

    return rankings

In [20]:
def get_ground_truth(dataset, type_hierarchy):
    """Extracts the gold types of 'resource' queries in evaluation format.

    Arguments:
        dataset: Iterable of query dicts with 'id', 'category' and 'type'.
        type_hierarchy: Container of known types; only membership is tested.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]},
        where the type list keeps only types found in type_hierarchy. Queries
        of another category, or with no known type left, are omitted.
    """
    gold = {}
    for query in dataset:
        ID = query['id']
        if query['category'] == 'resource':
            known_types = [t for t in query['type'] if t in type_hierarchy]
            if known_types:
                gold[ID] = {'category': 'resource', 'type': known_types}
    return gold

In [21]:
def format_outputs(results, queries, type_hierarchy):
    """Converts ranked results into the structure passed to the evaluator.

    Arguments:
        results: Dict mapping query id to a ranked list of (type, score) tuples.
        queries: Iterable of query dicts with 'id', 'category' and 'type'.
        type_hierarchy: Container of known types; only membership is tested.
    Returns:
        Dict mapping query id to {'category': 'resource', 'type': [...]}. The
        type list holds the ranked types found in type_hierarchy, in ranking
        order, and is empty when the query has no entry in results. Queries of
        another category, or whose own types include none that is both in
        type_hierarchy and truthy, are omitted.
    """
    formatted = {}

    for query in queries:
        ID = query['id']
        if query['category'] != 'resource':
            continue

        # Skip queries that have no usable gold type in the hierarchy.
        known_gold = [t for t in query['type'] if t in type_hierarchy]
        if not any(known_gold):
            continue

        if ID in results:
            ranked_types = [t for t, _score in results[ID] if t in type_hierarchy]
        else:
            ranked_types = []

        formatted[ID] = {'category': 'resource', 'type': ranked_types}

    return formatted

In [22]:
# Set which query set to use for evaluation
name = 'validation'

# Evaluate: rank the candidate types, keep the types known to the hierarchy,
# and score them against the gold types of the same queries.
# NOTE(review): get_rankings adds each query's own 'type' entries to the
# candidate set, so the metrics reported below may be optimistic.
results = get_rankings(ltr, queries[name], name)
system_output = format_outputs(results, queries[name], type_hierarchy)
ground_truth = get_ground_truth(queries[name], type_hierarchy)
evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 932 questions)
  Accuracy: 1.000
Type ranking (based on 932 questions)
  NDCG@5:  0.420
  NDCG@10: 0.425
