In [1]:
from elasticsearch import Elasticsearch
import json, os, pickle
import itertools
import numpy as np
from collections import OrderedDict
from sklearn.ensemble import RandomForestRegressor

from util.es import ES
from util.io import load_dict_from_json
from util.parse_dbpedia import get_type_weights
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy, evaluate, get_type_path

from QPC import QPC_model

In [2]:
class PointWiseLTRModel(object):
    """Point-wise learning-to-rank wrapper around a scikit-learn style regressor."""

    def __init__(self, regressor):
        """
        Arguments:
            regressor: An instance of scikit-learn regressor.
        """
        self.regressor = regressor
        # Set by _train(). Initialised here so that rank() can detect an
        # untrained model instead of failing with AttributeError.
        self.model = None

    def _train(self, X, y):
        """Trains an LTR model.
        
        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.regressor is not None
        # Keep whatever fit() returns as the fitted model used by rank().
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.
        
        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids.
        Returns:
            List of tuples, each consisting of document ID and predicted relevance label,
            ordered by decreasing predicted label.
        Raises:
            AssertionError: If the model has not been trained yet.
        """
        # getattr: instances restored with pickle do not run __init__ and may
        # therefore lack the attribute altogether.
        model = getattr(self, 'model', None)
        assert model is not None
        rel_labels = model.predict(ft)
        # argsort is ascending; reverse it to get the highest score first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

In [3]:
def get_baseline(dataset='train', n_list=[5, 10, 20, 50, 100]):
    """Collects the baseline score tables for one dataset split.

    For each similarity ('BM25', then 'LM') the returned list holds one
    table per value in n_list from the 'EC' index, followed by a single
    table from the 'TC' index.

    Arguments:
        dataset: Name of the split passed on to generate_baseline_scores.
        n_list: Values of n passed to the 'EC' baseline, one table each.
    Returns:
        List of dicts; each maps a key to a dict built from that key's
        (k, v) pairs.
    """
    def _as_nested_dict(raw_scores):
        # Turn every value (an iterable of pairs) into a plain dict.
        return {key: {k: v for k, v in pairs} for key, pairs in raw_scores.items()}

    tables = []
    for similarity in ['BM25', 'LM']:
        for n in n_list:
            ec_scores = ES('EC', similarity).generate_baseline_scores(dataset, n)
            tables.append(_as_nested_dict(ec_scores))
        tc_scores = ES('TC', similarity).generate_baseline_scores(dataset)
        tables.append(_as_nested_dict(tc_scores))
    return tables

In [4]:
# Per-type weights; extract_features looks a type up with ENTITIES.get(t, 0).
ENTITIES = get_type_weights()

In [5]:
# Load the precomputed feature files used by extract_features / get_Q_T_features.
# FAMILY[t] provides 'depth', 'siblings' and 'children' for a type.
FAMILY = load_dict_from_json('type_hierarchy_features.json')
# T_LENGTH[t] is appended as a single feature value.
T_LENGTH = load_dict_from_json('type_length_features.json')
# T_LABEL[t]['X'] and Q_T_IDF[t]['X'] are dicts whose values are appended as
# features (four zeros are used when the type is missing).
T_LABEL = load_dict_from_json('type_label_idf_features.json')
Q_T_IDF = load_dict_from_json('type_query_idf_features.json')
# q_ids[split][qid] is the index of a query inside the qt_features arrays.
q_ids = load_dict_from_json('q_id_list.json')
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# qt_features is indexed as [type label][split]['JTERMS' | 'SIMAGGR'][query index].
with open(os.path.join('data','Q_T_features'),'rb') as f:
    qt_features = pickle.load(f)
# lacking_list is only used as a default argument of get_Q_T_features, which
# does not read it in the code shown here.
with open(os.path.join('data','lacking_J_term_types'),'rb') as f:
    lacking_list = pickle.load(f)

In [6]:
# Load the DBpedia type hierarchy and its maximum depth; both are later passed
# to evaluate(), and type_hierarchy is also used to filter types by membership.
type_hierarchy, max_depth = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [7]:
def get_Q_T_features(qid,t,lft=qt_features,lqs=q_ids,s='train',lacking=lacking_list):
    '''
    Function for extracting SIMAGGR and JTERMS features for
    query-type pairs.

    Arguments:
        qid: Query id, looked up in lqs[split].
        t: Type identifier; its first four characters are dropped before
           the lookup in lft (presumably a 'dbo:' prefix -- TODO confirm).
        lft: Feature table indexed as
             [type label][split]['JTERMS' | 'SIMAGGR'][query index].
        lqs: Mapping split -> query id -> query index.
        s: Split name; 'validation' is translated to 'val'.
        lacking: Accepted for backward compatibility; not read here.
    Returns:
        [JTERMS value, (first SIMAGGR component + 1) / 2].

    NOTE(review): no fallback is applied in this function for types that
    are missing from the aggregated type document; any average values
    must already be present in lft -- confirm against the code that
    builds it.
    '''
    split = 'val' if s == 'validation' else s
    # Use the lqs argument (previously the global q_ids was read directly,
    # silently ignoring a caller-supplied mapping).
    q_idx = lqs[split][qid]
    t_label = t[4:]
    per_type = lft[t_label][split]

    jterms = per_type['JTERMS'][q_idx]
    # Shift and halve the score; presumably maps [-1, 1] onto [0, 1] -- TODO confirm.
    simaggr = (per_type['SIMAGGR'][q_idx][0] + 1) / 2
    return [jterms, simaggr]

In [8]:
def extract_features(qid, t, dataset = 'train'):
    """
    Builds the feature vector of one query-type pair for the advanced model.

    Feature order: one score per baseline table in BASELINE[dataset],
    the ENTITIES weight, three hierarchy features (depth, number of
    siblings, number of children), the type length, the type-label IDF
    values, the query-type IDF values and finally the two values returned
    by get_Q_T_features.

    Arguments:
        qid: Query id.
        t: Type identifier.
        dataset: Split name used to select the baseline tables and the
            query-type features.
    Returns:
        List of feature values.
    """
    # Baseline scores; 0 when the query or the type is absent from a table.
    features = [scores.get(qid, {}).get(t, 0) for scores in BASELINE[dataset]]

    features.append(ENTITIES.get(t, 0))

    # Hierarchy features; zeros (and a notice) for unknown types.
    if t in FAMILY:
        node = FAMILY[t]
        features.extend([node['depth'], len(node['siblings']), len(node['children'])])
    else:
        print('type: {} not in hierarchy list'.format(t))
        features.extend([0, 0, 0])

    features.append(T_LENGTH[t] if t in T_LENGTH else 0)

    # Type-label IDF features first, then query-type IDF features; four
    # zeros stand in for a type that is missing from a table.
    for idf_table in (T_LABEL, Q_T_IDF):
        if t in idf_table:
            features.extend(idf_table[t]['X'].values())
        else:
            features.extend([0] * 4)

    features.extend(get_Q_T_features(qid, t, s=dataset))
    return features

In [9]:
def prepare_ltr_training_data(queries):
    """Builds the point-wise training set from the training queries.

    For every query the candidate types are its ground-truth types plus
    every type present for that query in any BASELINE['train'] table.

    Arguments:
        queries: Iterable of query dicts with 'id' and 'type' entries.
    Returns:
        Tuple (X, y): feature vectors and 0/1 labels, where 1 marks a
        ground-truth type of the query.
    """
    X, y = [], []

    for query in queries:
        gold_types = query['type']
        retrieved = itertools.chain.from_iterable(
            list(scores.get(query['id'], {}).keys()) for scores in BASELINE['train']
        )
        candidates = set([*gold_types, *retrieved])
        for t in candidates:
            X.append(extract_features(query['id'], t))
            y.append(1 if t in gold_types else 0)

    return X, y

In [76]:
# Load the pretrained LTR model, or train a new one when it cannot be loaded.
# The training fallback needs `queries` and BASELINE, which are defined in
# later cells of this notebook; run those first on a fresh kernel.
try:
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(os.path.join('saved_models','ltr_unlim_2'),'rb') as f:
        ltr = pickle.load(f)
    print('Loaded pretrained model')
except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError) as err:
    # Only load failures trigger retraining (missing/unreadable file, corrupt
    # pickle, class no longer importable). Anything else, including
    # KeyboardInterrupt, propagates instead of being silently swallowed.
    print('Unable to load model. Training...')
    print('Reason: {!r}'.format(err))
    X_train, y_train = prepare_ltr_training_data(queries['train'])
    # Instantiate an scikit-learn regression model, `clf`.
    clf = RandomForestRegressor(max_depth = 2, n_estimators=1000)

    # Instantiate PointWiseLTRModel.
    ltr = PointWiseLTRModel(clf)
    ltr._train(X_train, y_train)

Loaded pretrained model


In [77]:
def get_rankings(baseline, ltr, queries, dataset='train'):
    """Ranks candidate types for every query with the LTR model.

    The candidate types of a query are all types that appear for it in any
    table of baseline[dataset]. Queries that are missing from the first
    table, or that have no candidates, get an empty ranking.

    Arguments:
        baseline: Dict mapping split name to a list of baseline score
            tables; the entry for `dataset` is generated with get_baseline
            and stored in this dict when absent.
        ltr: Trained model exposing rank(features, doc_ids).
        queries: Iterable of query dicts with an 'id' entry.
        dataset: Split name.
    Returns:
        Dict mapping query id to the list returned by ltr.rank (or []).

    Note: extract_features reads the module-level BASELINE, so `baseline`
    should be that same object for the features to match the candidates.
    """
    if dataset not in baseline:
        baseline[dataset] = get_baseline(dataset)
    # Use the argument consistently (the candidates were previously read from
    # the global BASELINE, ignoring the tables checked/populated above).
    tables = baseline[dataset]

    test_rankings = {}
    for query in queries:
        qid = query['id']
        if qid not in tables[0]:
            test_rankings[qid] = []
            continue

        types = list({t for table in tables for t in table.get(qid, {})})
        if not types:
            test_rankings[qid] = []
            continue

        features = [extract_features(qid, t, dataset) for t in types]
        test_rankings[qid] = ltr.rank(features, types)

    return test_rankings

In [78]:
def baseline_rankings(baseline, model_nr, queries, dataset='test', n_list=[20]):
    """Returns the rankings of a single baseline table for the given queries.

    Arguments:
        baseline: Dict mapping split name to a list of baseline score
            tables; the entry for `dataset` is generated with get_baseline
            and stored in this dict when absent.
        model_nr: Index of the table inside baseline[dataset].
        queries: Iterable of query dicts with an 'id' entry.
        dataset: Split name.
        n_list: Passed to get_baseline when the tables must be generated.
    Returns:
        Dict mapping query id to a list of (type, score) tuples, or [] for
        queries missing from the selected table.
    """
    if dataset not in baseline:
        baseline[dataset] = get_baseline(dataset, n_list)

    test_rankings = {}
    for query in queries:
        qid = query['id']
        table = baseline[dataset][model_nr]
        test_rankings[qid] = list(table[qid].items()) if qid in table else []
    return test_rankings

In [79]:
def get_ground_truth(dataset, type_hierarchy):
    """Builds the ground-truth table in the format passed to evaluate().

    Arguments:
        dataset: Iterable of query dicts with 'id', 'category' and 'type'.
        type_hierarchy: Container of accepted types (membership test only).
    Returns:
        Dict mapping query id to {'category': ..., 'type': [...]}, where
        'type' keeps only the accepted types. Queries left without any
        accepted type are omitted.
    """
    ground_truth = {}
    for query in dataset:
        qid, category = query['id'], query['category']
        accepted = [t for t in query['type'] if t in type_hierarchy]
        if accepted:
            ground_truth[qid] = {'category': category, 'type': accepted}
    return ground_truth

In [80]:
def format_outputs(results, queries, type_hierarchy):
    """Converts rankings into the system-output format passed to evaluate().

    Arguments:
        results: Dict mapping query id to a list of (type, score) pairs.
        queries: Iterable of query dicts with an 'id' entry.
        type_hierarchy: Container of accepted types (membership test only).
    Returns:
        Dict mapping every query id to
        {'category': 'resource', 'type': [...]}, where 'type' holds the
        ranked types that are in type_hierarchy, in ranking order. Queries
        without an entry in results get an empty type list.
    """
    system_output = {}
    for query in queries:
        qid = query['id']
        ranked = results[qid] if qid in results else []
        kept_types = [t for t, _ in ranked if t in type_hierarchy]
        system_output[qid] = {'category': 'resource', 'type': kept_types}
    return system_output

In [81]:
def format_other(queries, pred, classes):
    '''
    Formats the system output of non-resource queries.

    Arguments:
        queries: Iterable of query dicts with an 'id' entry.
        pred: Mapping query id -> predicted class index (must not be 0).
        classes: Indexable mapping class index -> class name.
    Returns:
        Dict mapping query id to {'category': ..., 'type': [name]}.
        Class indices 1-3 get the category 'literal'; any other index
        uses the class name as both category and type.
    '''
    system_output = {}
    for q in queries:
        qid = q['id']
        label = pred[qid]
        assert label != 0  # resource predictions must not reach this function

        out_type = classes[label]
        # Literal classes share one category; otherwise (boolean) the class
        # name doubles as the category.
        category = 'literal' if label in (1, 2, 3) else out_type

        system_output[qid] = {'category': category, 'type': [out_type]}

    return system_output

In [82]:
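# Generate the baselines (slow). Uncomment and run the two lines below on a fresh kernel: BASELINE and BASELINE_20 are required by extract_features and by the evaluation cells.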
#BASELINE_20 = {'test':get_baseline(dataset='test',n_list=[20])}
#BASELINE = {'train':get_baseline(dataset='train'), 'validation':get_baseline(dataset='validation'),'test':get_baseline(dataset='test')}

In [83]:
# Load the train / validation / test query sets into `queries`, keyed by split.
# Every query is kept; filtering on q['category'] == 'resource' is not applied.
queries = {}
for name in ['train', 'validation', 'test']:
    dataset = load_dict_from_json(f'{name}_set_fixed.json')
    queries[name] = list(dataset)

In [84]:
def prepare_result_string(ev, l):
    '''
    Builds the printable result summary of one model.

    Arguments:
        ev: Mapping with the entries 'Accuracy', 'NDCG5' and 'NDCG10'.
        l: Label of the model the results belong to.
    Returns:
        Multi-line string that starts and ends with a newline.
    '''
    report_lines = [
        '',
        'Results for',
        '{}'.format(l),
        '=====================',
        'Category prediction',
        '   Accuracy:  {}'.format(ev['Accuracy']),
        'Type ranking',
        '   NDCG@5:    {}'.format(ev['NDCG5']),
        '   NDCG@10:   {}'.format(ev['NDCG10']),
        '',
    ]
    return '\n'.join(report_lines)

In [85]:
# Cell for running the test-set evaluation of the four baselines and the
# advanced model. Requires BASELINE, BASELINE_20, ltr, queries, type_hierarchy
# and max_depth from earlier cells.
labels = ['Baseline EC k=20 BM25', 'Baseline EC k=20 LM', 'Baseline TC BM25', 'Baseline TC LM', 'Advanced Model']
name = 'test'
text_file = ''

#Initializing pretrained query category classifier, denoted Step 1 in the report
categorizer = QPC_model()
categorizer.model()

#predicting category + potential 'literal' type
prediction = categorizer.predict(queries[name])
classes = categorizer.classes

# Class 0 is handled as a resource query (type ranking); every other class is
# formatted directly from the classifier's prediction.
res_queries = []
other_queries = []
for query in queries[name]:
    qID = query['id']
    if prediction[qID] == 0:
        res_queries.append(query)
    else:
        other_queries.append(query)

#handle non-resource queries separately
other_output = format_other(other_queries, prediction, classes)
other_ground_truth = get_ground_truth(other_queries, ['boolean','date','number','string'])

for m, label in enumerate(labels):

    if m<4: #for the baseline models
        results = baseline_rankings(BASELINE_20, m, res_queries, dataset=name, n_list=[20])
    else: #for advanced model
        # Rank only the predicted-resource queries, as the baselines do; the
        # rankings of other queries were never read by format_outputs.
        results = get_rankings(BASELINE, ltr, res_queries, dataset=name)
    system_output = format_outputs(results, res_queries, type_hierarchy)
    ground_truth = get_ground_truth(res_queries, type_hierarchy)
    # Merge in the non-resource queries so that all queries are evaluated.
    system_output.update(other_output)
    ground_truth.update(other_ground_truth)
    ev_ret = evaluate(system_output, ground_truth, type_hierarchy, max_depth)

    text_file += prepare_result_string(ev_ret, label)


with open('test_results_full_model.txt','w') as f:
    f.write(text_file)




Evaluation results:
-------------------
Category prediction (based on 1625 questions)
  Accuracy: 0.994
Type ranking (based on 1349 questions)
  NDCG@5:  0.458
  NDCG@10: 0.479


Evaluation results:
-------------------
Category prediction (based on 1625 questions)
  Accuracy: 0.994
Type ranking (based on 1349 questions)
  NDCG@5:  0.456
  NDCG@10: 0.468


Evaluation results:
-------------------
Category prediction (based on 1625 questions)
  Accuracy: 0.994
Type ranking (based on 1349 questions)
  NDCG@5:  0.437
  NDCG@10: 0.456


Evaluation results:
-------------------
Category prediction (based on 1625 questions)
  Accuracy: 0.994
Type ranking (based on 1349 questions)
  NDCG@5:  0.468
  NDCG@10: 0.473


Evaluation results:
-------------------
Category prediction (based on 1625 questions)
  Accuracy: 0.994
Type ranking (based on 1349 questions)
  NDCG@5:  0.630
  NDCG@10: 0.626


In [86]:
# Set which query set to use for evaluation
name = 'validation'

# Evaluate
# No category classifier is applied here: format_outputs labels every query as
# 'resource', and get_ground_truth keeps only the queries that have at least
# one type in type_hierarchy.
results = get_rankings(BASELINE,ltr, queries[name], name)
system_output = format_outputs(results, queries[name], type_hierarchy)
ground_truth = get_ground_truth(queries[name], type_hierarchy)
ev_ret = evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 932 questions)
  Accuracy: 1.000
Type ranking (based on 932 questions)
  NDCG@5:  0.457
  NDCG@10: 0.460
