In [1]:
import os
import numpy as np
from collections import defaultdict
import xml.etree.ElementTree as ET
from collections import Counter

# Locations of the input corpus files and of the generated submission.
DATA_DIR = '../data'


def _data_path(filename):
    """Return the path of `filename` inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


TRAIN_ANS_FILE = _data_path('ans_train.csv')      # answers for the training queries
QUERY_TRAIN_FILE = _data_path('query-train.xml')  # training queries
QUERY_TEST_FILE = _data_path('query-test.xml')    # test queries
FILE_LIST = _data_path('file-list')               # one document name per line
INV_FILE = _data_path('inverted-file')            # postings lists
VOCAB_FILE = _data_path('vocab.all')              # one vocabulary entry per line
OUTPUT_FILE = 'output.csv'                        # written to the working directory

# Document name <-> document id maps.
# A document's id is the zero-based line number of its name in FILE_LIST.
with open(FILE_LIST, 'r') as f:
    docname2id = {row.strip(): doc_idx for doc_idx, row in enumerate(f)}
id2docname = {doc_idx: name for name, doc_idx in docname2id.items()}

# Vocabulary entry -> id map. The first line of VOCAB_FILE is skipped and the
# remaining lines are numbered starting from 1.
# NOTE(review): the file is opened with the platform default encoding;
# presumably it is UTF-8 text -- confirm and pass `encoding=` explicitly if so.
with open(VOCAB_FILE, 'r') as f:
    next(f, None)  # discard the first line
    word2id = {token.strip(): token_id for token_id, token in enumerate(f, 1)}

# Parse the inverted file. Each entry is a header line "id_1 id_2 doc_count"
# followed by `doc_count` lines of "doc_id freq".
#   gram_freq[(id_1, id_2)] -> {doc_id: freq}
#   gram2id[(id_1, id_2)]   -> running index of the entry in the file
#   word_freq[id_1]         -> {doc_id: freq}, only for entries with id_2 == -1
word_freq, gram_freq, gram2id = {}, {}, {}
with open(INV_FILE, 'r') as f:
    # iter(f.readline, '') keeps calling readline() until end of file.
    for gram_idx, header in enumerate(iter(f.readline, '')):
        header = header.strip()
        if not header:
            # A blank header line ends parsing.
            break
        id_1, id_2, doc_count = map(int, header.split(' '))
        postings = defaultdict(int)
        for _ in range(doc_count):
            doc_id, freq = map(int, f.readline().strip().split(' '))
            postings[doc_id] = freq
        gram = (id_1, id_2)
        if id_2 == -1:
            # Shares the same postings object as the gram_freq entry below.
            word_freq[id_1] = postings
        gram_freq[gram] = postings
        gram2id[gram] = gram_idx

In [2]:
# Document length = sum of the frequencies of every `word_freq` entry that
# occurs in the document; `avdl` is the mean length over documents that have
# at least one such entry.
doc2len = defaultdict(int)
for postings in word_freq.values():
    for doc, count in postings.items():
        doc2len[doc] += count
avdl = sum(doc2len.values()) / len(doc2len)

In [5]:
# Parse the training queries. `root` and `query_num` are read as module-level
# state by generate_queries_weight().
tree = ET.parse(QUERY_TRAIN_FILE)
root = tree.getroot()

# Dimensions of the weight matrices.
doc_num = len(docname2id)
word_num = len(word2id) + 1  # vocabulary ids start at 1, so column 0 stays unused
query_num = len(root)

In [14]:
def generate_candidates_weight(k_1=1.5, b=0.75):
    """Build the dense BM25 document-by-term weight matrix.

    Reads the module-level `word_freq`, `doc2len`, `avdl`, `doc_num` and
    `word_num`.

    Args:
        k_1: term-frequency saturation parameter.
        b: strength of the document-length normalisation.

    Returns:
        A `(doc_num, word_num)` float array whose entry `[doc, term]` is
        `TF * IDF` for every posting in `word_freq` and 0 elsewhere.
    """
    weight = np.zeros((doc_num, word_num))
    for term_id, postings in word_freq.items():
        doc_freq = len(postings)
        # Negative when the term occurs in more than half of the documents.
        IDF = np.log((doc_num - doc_freq + 0.5) / (doc_freq + 0.5))
        for doc, raw_tf in postings.items():
            length_norm = 1 - b + b * doc2len[doc] / avdl
            TF = (k_1 + 1) * raw_tf / (raw_tf + k_1 * length_norm)
            weight[doc, term_id] = TF * IDF
    return weight

def generate_queries_weight(ka=100):
    """Build the dense query-by-term weight matrix.

    Reads the module-level `root`, `query_num`, `word_num` and `word2id`.
    For each query the text of its fifth child element is stripped of
    surrounding newlines, spaces and '。', split on '、' and re-joined, and
    every resulting character is treated as one query term.

    Args:
        ka: query-side term-frequency saturation parameter; a term occurring
            `freq` times gets weight `(ka + 1) * freq / (freq + ka)`.

    Returns:
        A `(query_num, word_num)` float array of query term weights.
        Characters that are not in `word2id` are skipped: they cannot match
        any indexed document, so they contribute nothing to the ranking
        (previously they raised KeyError).
    """
    queries = np.zeros((query_num, word_num))
    for query_id, child in enumerate(root):
        # NOTE(review): child[4] is presumably the query's concept list --
        # confirm against the query XML schema.
        concepts = child[4].text.strip('\n。 ')
        characters = ''.join(concepts.split('、'))
        for word, freq in Counter(characters).items():
            word_id = word2id.get(word)
            if word_id is None:
                # Out-of-vocabulary character: no column to write to.
                continue
            queries[query_id, word_id] = (ka + 1) * freq / (freq + ka)
    return queries

In [12]:
def MAP(top100):
    """Mean average precision of ranked results against TRAIN_ANS_FILE.

    The answer file is read with its first line skipped; each following line
    is "query_id,<space-separated relevant names>". Lines are paired with the
    rows of `top100` in order.

    Args:
        top100: iterable of ranked document-id sequences, one per query.
            Ids are mapped through the module-level `id2docname`; the last
            path component, lower-cased, is compared with the answers.

    Returns:
        The mean over queries of the average precision. A query with no
        relevant document in its ranking scores 0.0, and an empty input
        returns 0.0 (both previously raised ZeroDivisionError).
    """
    AP = []
    with open(TRAIN_ANS_FILE, 'r') as f:
        f.readline()  # skip the header line
        for line, ranked_ids in zip(f, top100):
            _, answer = line.strip().split(',')
            relevant = set(answer.split())
            retrieved = [id2docname[i].split('/')[-1].lower() for i in ranked_ids]
            hit = 0
            P = []
            for position, name in enumerate(retrieved, 1):
                if name in relevant:
                    hit += 1
                    P.append(hit / position)
            # NOTE(review): precision is averaged over the retrieved hits, not
            # over all relevant documents as in the textbook definition --
            # confirm which denominator is intended.
            AP.append(sum(P) / len(P) if P else 0.0)
    return sum(AP) / len(AP) if AP else 0.0

In [19]:
# Grid search over the document-side k_1 and the query-side ka, printing the
# training-set score of the top-100 ranking for each pair.
for k_1 in [1, 1.2, 1.4, 1.6, 1.8, 2.0]:
    # The document weights depend only on k_1 and b, so build the matrix once
    # per k_1 instead of once per (k_1, ka) pair.
    candidates = generate_candidates_weight(k_1, b=0.75)
    for ka in [0, 5, 10, 50, 100, 300, 500, 1000]:
        queries = generate_queries_weight(ka)
        # Score every (query, document) pair and keep the 100 best per query.
        ret = np.matmul(queries, np.transpose(candidates)).argsort(axis=1)[:, ::-1][:, :100]
        print('k_1: %.2f, ka: %.2f, MPA: %.8f' % (k_1, ka, MAP(ret)))

k_1: 1.00, ka: 0.00, MPA: 0.74668724
k_1: 1.00, ka: 5.00, MPA: 0.76675177
k_1: 1.00, ka: 10.00, MPA: 0.76820874
k_1: 1.00, ka: 50.00, MPA: 0.76123452
k_1: 1.00, ka: 100.00, MPA: 0.76131936
k_1: 1.00, ka: 300.00, MPA: 0.74796550
k_1: 1.00, ka: 500.00, MPA: 0.74808396
k_1: 1.00, ka: 1000.00, MPA: 0.74827093
k_1: 1.20, ka: 0.00, MPA: 0.74890794
k_1: 1.20, ka: 5.00, MPA: 0.77350216
k_1: 1.20, ka: 10.00, MPA: 0.76841307
k_1: 1.20, ka: 50.00, MPA: 0.74993208
k_1: 1.20, ka: 100.00, MPA: 0.74674577
k_1: 1.20, ka: 300.00, MPA: 0.74795278
k_1: 1.20, ka: 500.00, MPA: 0.74672377
k_1: 1.20, ka: 1000.00, MPA: 0.74683806
k_1: 1.40, ka: 0.00, MPA: 0.74795993
k_1: 1.40, ka: 5.00, MPA: 0.77550306
k_1: 1.40, ka: 10.00, MPA: 0.77280180
k_1: 1.40, ka: 50.00, MPA: 0.75247331
k_1: 1.40, ka: 100.00, MPA: 0.75039241
k_1: 1.40, ka: 300.00, MPA: 0.74818632
k_1: 1.40, ka: 500.00, MPA: 0.74390922
k_1: 1.40, ka: 1000.00, MPA: 0.74185115
k_1: 1.60, ka: 0.00, MPA: 0.74507317
k_1: 1.60, ka: 5.00, MPA: 0.77809484
k_1: 

In [20]:
# Testing Part
# Rebinds the module-level `root` / `query_num` to the test queries so that
# generate_queries_weight() builds the test query matrix.
tree = ET.parse(QUERY_TEST_FILE)
root = tree.getroot()
query_num = len(root)

candidates = generate_candidates_weight(k_1=1.6, b=0.75)
queries = generate_queries_weight(ka=5.0)
scores = np.matmul(queries, candidates.T)
ret = scores.argsort(axis=1)[:, ::-1][:, :100]  # 100 best documents per query

with open(OUTPUT_FILE, 'w+') as f:
    print('query_id,retrieved_docs', file=f)
    # NOTE(review): query ids are numbered from 11 here; presumably this
    # matches the numbering of the test queries -- confirm against the XML.
    for idx, result in enumerate(ret, 11):
        doc_names = [id2docname[i].split('/')[-1].lower() for i in result]
        print(str(idx).zfill(3), ' '.join(doc_names), sep=',', file=f)