# Assignment 3 - Part 3

In [1]:
import urllib
import requests
import json
import numpy as np
import math
import os
from sklearn.ensemble import RandomForestRegressor

In [2]:
API = "http://gustav1.ux.uis.no:5002"

BASIC_INDEX_NAME = "clueweb12b"
ANCHORS_INDEX_NAME = "clueweb12b_anchors"
QRELS_FILE = "data/qrels.csv"
def get_index_name(field):
    return ANCHORS_INDEX_NAME if field == "anchors" else BASIC_INDEX_NAME

## Utility functions

Loading queries

In [3]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

Loading qrels

In [4]:
def load_qrels(qrels_file):
    qrels = {}
    with open(qrels_file, "r") as fin:
        i = 0
        for line in fin.readlines():
            i += 1
            if i == 1:  # skip header line
                continue
            qid, doc_id, rel = line.strip().split(",", 2)
            if qid not in qrels:
                qrels[qid] = {}
            qrels[qid][doc_id] = int(rel)
    return qrels
ground_truth = load_qrels(QRELS_FILE)

Loading features file

In [5]:
def load_features(features_file):
    X, y, qids, doc_ids = [], [], [], []
    with open(features_file, "r") as f:
        i, s_qid = 0, None
        for line in f:
            items = line.strip().split()
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            features = np.array([float(i.split(":")[1]) for i in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

In [6]:
# Method to calculate NDCG, used in Evaluate
def ndcg(gains, ideal_gains, n):
    actual = gains[0]
    ideal = ideal_gains[0]
    for i in range(1, min(n, len(gains))):
        actual += gains[i] / math.log2(i + 1)
    for i in range(1, min(n, len(ideal_gains))):
        ideal += ideal_gains[i] / math.log2(i + 1)
    return actual / ideal

def evaluate(ranks, gtruth, field):
    ndcg10 = 0
    ndcg20 = 0
    for qid, ranking in ranks.items():
        gt = gtruth[qid]
        ideal_gains = sorted([relevance for _, relevance in gt.items()], reverse=True) 
        gains = []
        for doc_id in ranking:
            gains.append(gt.get(doc_id, 0)) if gt.get(doc_id, 0) >= 0 else gains.append(0)
        
        #Calculate ndcg
        ndcg10 += ndcg(gains, ideal_gains, 10)
        ndcg20 += ndcg(gains, ideal_gains, 20)
    print("Evaluating {}".format(field))
    print("NDCG@10")
    print(ndcg10 / len(ranks))
    print("NDCG@20")
    print(ndcg20 / len(ranks))
    print()
    

In [7]:
def minmax_norm(features, fid):
    """Normalizes a given feature."""
    # this is to be done for each query separately
    min_x = 10000 # sufficiently large number
    max_x = -10000 # # sufficiently small number
    for docid in features:
        if not features[docid].get(fid):
            continue
        x = features[docid][fid]
        if x < min_x:
            min_x = x
        if x > max_x:
            max_x = x
    for docid in features:
        if not features[docid].get(fid):
            continue
        x = features[docid][fid]
        try:
            features[docid][fid] = (x - min_x) / (max_x - min_x)
        except:
            features[docid][fid] = 0

In [8]:
def get_pagerank(doc_id):
    tot_size = os.path.getsize("data/pagerank.txt")
    limits = [0, tot_size]
    old_did = None
    with open("data/pagerank.txt", "rb") as f:
        while True:
            pos = sum(limits)//2
            f.seek(pos)
            while f.read(1) not in (b"\n", b''):
                if f.tell()<3:
                    f.seek(0)
                    break
                else:
                    f.seek(-2, 1)
            did, pr = f.readline().split()
            did = did.decode()
            if did == old_did:
                return -1
            else:
                old_did = did
            if did == doc_id:
                return float(pr)
            if did>doc_id:
                limits[1] = pos
            else:
                limits[0] = pos

## API functions

Issuing a search query againt the API

In [9]:
CACHE_DIR = "cache"
CACHE_DIR_SEARCH = CACHE_DIR + "/search"
CACHE_DIR_TERMVECTORS = CACHE_DIR + "/termvectors"

def search(indexname, query, field, size=10):
    cache_file = CACHE_DIR_SEARCH + "/" + indexname + "_" + query + "_" + field + "_" + str(size)
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            response = json.load(infile)
            return json.loads(response)
    else:
        with open(cache_file, "w") as outfile:
            response = requests.get(url).text
            json.dump(response, outfile)
            return json.loads(response)
        

Get term vector

In [10]:
def term_vectors(indexname, doc_id, term_statistics=False):
    """
    param term_statistics: Boolean; True iff term_statistics are required.
    """
    cache_file = CACHE_DIR_TERMVECTORS + "/" + indexname + "_" + doc_id + "_" + str(term_statistics)
    url = "/".join([API, indexname, doc_id, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": term_statistics})
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            response = json.load(infile)
            return json.loads(response)
    else:
        with open(cache_file, "w") as outfile:
            response = requests.get(url).text
            json.dump(response, outfile)
            return json.loads(response)

Analyze query (return a list of index terms)

In [11]:
def analyze_query(indexname, query):
    query_terms = []
    url = "/".join([API, indexname, "_analyze"]) + "?" \
          + urllib.parse.urlencode({"text": query})
    response = requests.get(url).text
    tokens = json.loads(response).get("tokens", [])
    for t in sorted(tokens, key=lambda x: x["position"]):
        query_terms.append(t["token"])

    return query_terms

## Pointwise LTR class

In [12]:
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        :param classifier: an instance of scikit-learn regressor
        """
        self.regressor = regressor

    def _train(self, X, y):
        """
        Trains and LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and rank documents for a given query
        :param ft: a list of features for query-doc pairs
        :param ft: a list of document ids
        :return:
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

In [13]:
FIELDS = ["title", "content", "anchors"]
LAMBDA = 0.1
class CollectionLM(object):
    def __init__(self, qterms):
        self._probs = {}
        # computing P(t|C_i) for each field and for each query term
        for field in FIELDS:
            self._probs[field] = {}
            for t in qterms:
                self._probs[field][t] = self.__get_prob(field, t)
        
    def __get_prob(self, field, term):
        # use a boolean query to find a document that contains the term
        index_name = get_index_name(field)
        hits = search(index_name, term, field, size=1).get("hits", {}).get("hits", {})
        doc_id = hits[0]["_id"] if len(hits) > 0 else None
        if doc_id is not None:
            # ask for global term statistics when requesting the term vector of that doc (`term_statistics=True`)
            
            tv = term_vectors(index_name, doc_id, term_statistics=True)["term_vectors"][field]
            ttf = tv["terms"].get(term, {}).get("ttf", 0)  # total term count in the collection (in that field)
            sum_ttf = tv["field_statistics"]["sum_ttf"]
            return ttf / sum_ttf
        
        return 0  # this only happens if none of the documents contain that term

    def prob(self, field, term):
        return self._probs.get(field, {}).get(term, 0)

In [14]:
def score_lm(clm, qterms, doc_id, field):
    score = 0  # log P(q|d)
    
    # Getting term frequency statistics for the given document field from Elasticsearch
    # Note that global term statistics are not needed
    index_name = get_index_name(field)
    tv = term_vectors(index_name, doc_id).get("term_vectors", {})
    # compute field length $|d|$
    len_d = 0  # document field length initialization
    if field in tv:  # that document field may be NOT empty
        len_d = sum([s["term_freq"] for t, s in tv[field]["terms"].items()])
        
    # scoring the query
    for t in qterms:
        Pt_theta_d = 0  # P(t|\theta_d)
        if field in tv:
            Pt_d = tv[field]["terms"].get(t, {}).get("term_freq", 0) / len_d  # $P(t|d)$
        else:  # that document field is empty
            Pt_d = 0
        Pt_C = clm.prob(field, t)  # $P(t|C)$
        Pt_theta_d = (1 - LAMBDA) * Pt_d + LAMBDA * Pt_C  # $P(t|\theta_{d})$ with J-M smoothing
        score += math.log(Pt_theta_d) if Pt_theta_d > 0 else 0  # Pt_theta_d is 0 if t doesn't occur in any doc for that field, even with smoothing
    
    return score

## Feature computation

The total number of candidate documents to retrieve from Elasticsearch. Should not be set higher than 200 (otherwise things get unreasonably slow).

In [15]:
NUM_DOCS = 400

Total number of features (features will be indexed 1..NUM_FEAT)

In [25]:
NUM_FEAT = 8

#### Computing the feature vectors for a given query

  - We retrieve top `NUM_DOCS` documents for each field (title, content, anchors).
  - We ignore those docs that don't have a score in the content field. This also serves as a simple and pragmatic way of filtering out docs that are not in ClueWeb Category B.
  
This function is used both both when training and when applying the model. When training, the target relevance labels will need to be assigned to each document. That is done in `get_training_data()`.

In [26]:
def get_features(qid, query):
    feats = {}
    print("Getting features for query #{} '{}'".format(qid, query))                

    # Analyze query (will be needed for some features)
    qterms = analyze_query("clueweb12b", query)
    
    # Feature 1: BM25 content score
    res1 = search("clueweb12b", query, "content", size=NUM_DOCS)
    # Initializing feature vector with values for Feature 1
    print("\tElasticsearch content field ...")
    for doc in res1.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        feats[doc_id] = {1: doc.get("_score")}
        
    # Feature 2: BM25 title score
    print("\tElasticsearch title field ...")
    res2 = search("clueweb12b", query, "title", size=NUM_DOCS)
    for doc in res2.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][2] = doc.get("_score")

    # Feature 3: BM25 anchors score
    # NOTE: we retrieve more candidate documents here
    print("\tElasticsearch anchors field ...")
    res3 = search("clueweb12b_anchors", query, "anchors", size=NUM_DOCS*10)
    for doc in res3.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][3] = doc.get("_score")
                
    # TODO: computation of additional features comes here 
    print("\tLM content field ...")
    clm = CollectionLM(qterms)
    for doc in res1.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][4] = score_lm(clm, qterms, doc_id, "content")
    clm = CollectionLM(qterms)
    print("\tLM title field ...")
    for doc in res2.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][5] = score_lm(clm, qterms, doc_id, "title")
    clm = CollectionLM(qterms)
    print("\tLM anchor field ...")
    for doc in res3.get('hits', {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][6] = score_lm(clm, qterms, doc_id, "anchors")
    
    print("\tCalculating query lengths ...")
    for doc_id in feats:
        feats[doc_id][7] = len(query.split(" "))
            

    # This had a negative impact on ndcg@20 
    print("\tRetrieving total hits ...")
    for doc_id in feats:
        feats[doc_id][8] = res1.get('hits', {}).get('total', {})
    
    # TODO: we can apply feature normalization here
    print("\tNormalizing features ...")
    for fid in range(1, 7):
        minmax_norm(feats, fid)
    print("Done!")
    return feats

## Main

### Training model

Queries and qrels for training

In [27]:
QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
FEATURES_FILE = "data/features.txt"

In [28]:
queries = load_queries(QUERY_FILE)
qrels = load_qrels(QRELS_FILE)

Create the complete training data set (feature vectors and corresponding labels) and write it to a file

In [29]:
features = {}
def get_training_data(queries, qrels, output_file):
    with open(output_file, "w") as fout:
        for qid, query in sorted(queries.items()):
            # get feature vectors
            feats = get_features(qid, query)
            
            print(len(feats))
            features[qid] = feats
            # assign target labels and write to file
            for doc_id, feat in feats.items():
                rel = 1 if doc_id in qrels.get(qid, []) else 0
                # we use -1 as value for the missing features
                for fid in range(1, NUM_FEAT + 1):
                    if fid not in feat:
                        feat[fid] = -1
                # write to file
                feat_str = ['{}:{}'.format(k,v) for k,v in sorted(feat.items())]
                fout.write(" ".join([str(rel), qid, doc_id] + feat_str) + "\n")

In [30]:
get_training_data(queries, qrels, FEATURES_FILE)



Getting features for query #201 'raspberry pi'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #202 'uss carl vinson'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #203 'reviews of les miserables'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #204 'rules of golf'
	Elasticsearch content field ...


	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #228 'hawaiian volcano observatories'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #229 'beef stroganoff recipe'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query lengths ...
	Retrieving total hits ...
	Normalizing features ...
Done!
400
Getting features for query #230 'world's biggest dog'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculating query le

Load training data from file

In [31]:
X, y, qids, doc_ids = load_features(FEATURES_FILE)
qids_unique = list(set(qids))
print(len(set(qids)))
print(len(y))

50
20000


5 FOLD VALIDATITON

In [32]:
FOLDS = 5
OUTPUT_FILE = "data/ltr_qd_q.txt"
fout = open(OUTPUT_FILE, "w")
# write header
fout.write("QueryId,DocumentId\n")
    
for f in range(FOLDS):
    print("Fold #{}".format(f + 1))
    
    train_qids, test_qids = [], []  # holds the IDs of train and test queries
    train_ids, test_ids = [], []  # holds the instance IDs (indices in X )

    for i in range(len(qids_unique)):
        qid = qids_unique[i]
        if i % FOLDS == f:  # test query
            test_qids.append(qid)
        else:  # train query
            train_qids.append(qid)

    train_X, train_y = [], []  # training feature values and target labels
    test_X = []  # for testing we only have feature values
    
    for i in range(len(X)):
        if qids[i] in train_qids:
            train_X.append(X[i])
            train_y.append(y[i])
        else:
            test_X.append(X[i])
    print(np.unique(map(len, train_X)))
    print(np.unique(map(len, train_y)))
    print(np.asarray(train_X).dtype)
    # Create and train LTR model
    print("\tTraining model ...")
    clf = RandomForestRegressor(max_depth=3, random_state=0)
    ltr = PointWiseLTRModel(clf)
    ltr._train(train_X, train_y)
    
    # Apply LTR model on the remaining fold (test queries)
    print("\tApplying model ...")
    
    for qid in set(test_qids):
        print("\t\tRanking docs for queryID {}".format(qid))
        # Collect the features and docids for that (test) query `qid`
        test_ft, test_docids = [], []
        for i in range(len(X)):
            if qids[i] == qid:
                test_ft.append(X[i])
                test_docids.append(doc_ids[i])
        
        # Get ranking
        r = ltr.rank(test_ft, test_docids)    
        # Write the results to file
        for doc, score in r:
            fout.write(qid + "," + doc + "\n")
        
fout.close()

rankings = {}
with open(OUTPUT_FILE, "r") as fin:
    docs = []
    for line in fin.readlines():
        if line.startswith('QueryId'):
            continue
        qid, doc_id = line.strip().split(",")
        if qid not in rankings: 
            rankings[qid] = []
        rankings[qid].append(doc_id)

evaluate(rankings, ground_truth, "test")

Fold #1
[<map object at 0x000000000A2F1F98>]
[<map object at 0x000000000A2F7780>]
float64
	Training model ...
	Applying model ...
		Ranking docs for queryID 229
		Ranking docs for queryID 235
		Ranking docs for queryID 246
		Ranking docs for queryID 249
		Ranking docs for queryID 243
		Ranking docs for queryID 234
		Ranking docs for queryID 215
		Ranking docs for queryID 231
		Ranking docs for queryID 244
		Ranking docs for queryID 217
Fold #2
[<map object at 0x000000000B5C0BA8>]
[<map object at 0x000000000B5C0E10>]
float64
	Training model ...
	Applying model ...
		Ranking docs for queryID 216
		Ranking docs for queryID 203
		Ranking docs for queryID 212
		Ranking docs for queryID 247
		Ranking docs for queryID 224
		Ranking docs for queryID 205
		Ranking docs for queryID 225
		Ranking docs for queryID 230
		Ranking docs for queryID 232
		Ranking docs for queryID 236
Fold #3
[<map object at 0x000000000BB003C8>]
[<map object at 0x000000000BB001D0>]
float64
	Training model ...
	Applying 

### Applying model on unseen queries

In [36]:
QUERY2_FILE = "data/queries2.txt"
FEATURES2_FILE = "data/features2.txt"
OUTPUT_FILE = "data/ltr2.txt"
TOP_DOCS = 20  # this many top docs to write to output file

In [37]:
queries2 = load_queries(QUERY2_FILE)
print(ltr)

<__main__.PointWiseLTRModel object at 0x0000000009F60DA0>


Apply model and write results to output file

In [38]:
output_format = "tbrec"

with open(OUTPUT_FILE, "w") as fout:
    fout.write("QueryId,DocumentId\n")
    for qid, query in sorted(queries2.items()):
        # Get feature vectors
        feats = get_features(qid, query)
        
        # Convert into the format required by the `PointWiseLTRModel` class
        # and deal with missing feature values
        doc_fts = []
        doc_ids = []
        
        for doc_id, feat in feats.items():
            for fid in range(1, NUM_FEAT + 1):
                if fid not in feat:
                    feat[fid] = -1
            doc_fts.append(np.array([float(val) for fid, val in sorted(feat.items())]))
            doc_ids.append(doc_id)
        
        # Get ranking
        r = ltr.rank(doc_fts, doc_ids)    
        # Write the results to file
        rank = 1
        for doc_id, score in r:
            if rank <= TOP_DOCS:
                if output_format == "trec":
                    fout.write(("\t".join(["{}"] * 6) + "\n").format(qid, "Q0", doc_id, str(rank),
                                                                 str(score), "A3_3_Baseline"))
                else: 
                    fout.write(qid + "," + doc_id + "\n")                            
            rank += 1

Getting features for query #251 'identifying spider bites'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #252 'history of orcas island'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #253 'tooth abscess'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normaliz

	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #276 'how has african american music influence history'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #277 'bewitched cast'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #278 'mister rogers'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch an

	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!
Getting features for query #300 'how to find the mean'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchor field ...
	Calculationg title & content lengths ...
	Calculating query lengths ...
	Retrieving PageRank
	Normalizing features ...
Done!


In [None]:

ltr_ranks = {}
with open(OUTPUT_FILE, "r") as fin:
    docs = []
    for line in fin.readlines():
        if line.startswith('QueryId'):
            continue
        qid, doc_id = line.strip().split(",")
        if qid not in ltr: 
            ltr[qid] = []
        ltr[qid].append(doc_id)

evaluate(ltr, ground_truth, "test")