In [1]:
import json, csv
import math
import os
import numpy as np
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool

In [2]:
# JSON inputs: the inverted index (term -> {'idf', 'docs'}) and the URL -> content map.
# All paths are relative to the notebook's working directory.
inverted_filename = 'inverted_file.json'
url2content_name = 'url2content.json'

# CSV inputs: document-name -> URL table, relevance judgements, and the test queries.
doc2url_name = 'news_data_1/NC_1.csv'
training_name = 'news_data_1/TD.csv'
query_name = 'news_data_1/QS_1.csv'

# Destination of the ranked-result CSV written when `test` is True.
outcsv_name = 'out.csv'

In [3]:
# Worker processes used for scoring, and how many top documents are kept per query.
num_threads = 4
MAXCAND = 300

# Okapi BM25 parameters: k1/b act on the document side, k3 on the query side.
Okapi_k1, Okapi_b, Okapi_k3 = 2.0, 0.75, 500.
# Lower bound applied to every IDF value.
IDF_epsilon = 1e-4

# Initial per-length n-gram weights (lengths 1..24, all equal to 1).
Ngram_weights = dict.fromkeys(range(1, 25), 1.)
# Longest n-gram length the query tokenizer will try.
MAXGRAM = max(Ngram_weights)
print(MAXGRAM)

24


In [4]:
# Mode switch: False runs the training-set NDCG evaluation cells (`if not test:`);
# True runs the query-loading and CSV-writing cells (`if test:`) instead.
test = False

In [5]:
# Earlier experiment, kept for reference (the trailing 0.49 is presumably its score -- unverified):
# Ngram_weights = {l:(1. - 0.04*(l-2)) for l in range(1, 25)} 0.49
# Active setting: only unigrams, bigrams and trigrams carry weight; every
# other length from 4 to 24 is disabled with weight 0.
Ngram_weights = dict.fromkeys(range(1, 25), 0.)
Ngram_weights.update({1: 1., 2: 0.88, 3: 0.77})

In [6]:
class Document:
    """A document stored as a sparse term vector.

    ``tid`` and ``freq`` are parallel lists: ``freq[j]`` is the weight of the
    term whose vocabulary index is ``tid[j]``.  The weights are whatever was
    passed to :meth:`update` until :meth:`normalize` replaces them in place.
    """

    def __init__(self, doc_id):
        self.doc_id = ''         # external document name; assigned by the loader
        self.url = ''            # key into the URL -> content mapping
        self.tid = []            # vocabulary indices of the terms in this document
        self.freq = []           # weights parallel to ``tid``
        self.id = doc_id         # value passed to the constructor
        self.length = 0          # document length used for length normalization
        self.normalized = False  # set once ``normalize`` has run

    def normalize(self, avgdl, IDF):
        """Replace every stored weight ``f`` by the Okapi BM25 document weight.

        The new weight is
        ``(k1 + 1) * f / (k1 * (1 - b + b * length / avgdl) + f) * IDF[term]``
        with ``k1``/``b`` read from the module-level ``Okapi_k1``/``Okapi_b``.

        Args:
            avgdl: average document length over the collection.
            IDF: mapping from vocabulary index to its IDF value.

        The method is idempotent: a second call is a no-op, so re-running the
        normalization step cannot apply the transformation twice.
        """
        if self.normalized:
            return
        for j, t in enumerate(self.tid):
            tf = self.freq[j]
            TF = (Okapi_k1+1.)*tf
            dlen_norm = Okapi_k1*(1. - Okapi_b + Okapi_b * (self.length/avgdl)) + tf
            self.freq[j] = TF/dlen_norm * IDF[t]
            assert self.freq[j] > 0
        self.normalized = True

    def update(self, term_id, tf, is_uniq):
        """Add ``tf`` to the weight of ``term_id``.

        Args:
            term_id: vocabulary index of the term.
            tf: weight to add.
            is_uniq: when True the caller guarantees the term has not been
                added before, so it is appended without searching; when False
                an existing entry is looked up and incremented in place.
        """
        if not is_uniq:
            for j, t in enumerate(self.tid):
                if t == term_id:
                    # Accumulate on the instance list (the bare name ``freq``
                    # does not exist and raised NameError).
                    self.freq[j] += tf
                    return
        self.tid.append(term_id)
        self.freq.append(tf)

    def getFileSize(self, contents):
        """Set ``length`` to ``len(contents[self.url])`` and return it."""
        self.length = len(contents[self.url])
        return self.length

    def update_title(self, vocab):
        """Placeholder; currently does nothing."""
        pass

    @staticmethod
    def _process():
        """Placeholder; currently does nothing.

        Declared static so it is callable both on the class and on an
        instance (without ``self`` an instance call raised TypeError).
        """
        pass

In [7]:
class Query:
    """A query represented as a dense weight vector over the vocabulary.

    Construction tokenizes ``text`` into character n-grams, accumulates their
    weights into ``vec`` and then applies the query-side saturation.
    """

    def __init__(self, qid, text, voc):
        """
        Args:
            qid: query identifier; its last two characters must be digits.
            text: raw query string.
            voc: mapping from n-gram string to vocabulary index.
        """
        self.qid = qid
        self.vec = np.zeros(len(voc), dtype=np.float32)
        self.dim = len(voc)
        self.id = int(qid[-2:])
        self.length = 0
        self.text = text

        self._process(text, voc, 1.)
        self.normalize()

    def match(self, doc):
        """Return the dot product of this query with ``doc``'s sparse vector.

        ``doc`` must expose parallel ``tid`` (vocabulary indices) and ``freq``
        (weights) sequences.
        """
        out = 0.
        for t, w in zip(doc.tid, doc.freq):
            out += self.vec[t] * w
        return out

    def normalize(self):
        """Apply ``(k3 + 1) * q / (k3 + q)`` to every component of ``vec``.

        ``k3`` is the module-level ``Okapi_k3``.  The computation is done on
        the whole array at once, in float64, and written back into the
        float32 vector, instead of looping over every vocabulary entry in
        Python.
        """
        qtf = self.vec.astype(np.float64)
        self.vec[:] = (Okapi_k3+1.)*qtf / (Okapi_k3+qtf)

    def _process(self, text, voc, weight):
        """Accumulate weighted n-gram counts of ``text`` into ``vec``.

        Every substring ``text[start:start+n]`` with ``1 <= n <= MAXGRAM`` that
        lies fully inside the text and is present in ``voc`` adds
        ``weight * Ngram_weights[n]`` to its vocabulary slot.
        """
        size = len(text)
        for start in range(size):
            for ngram in range(1, MAXGRAM+1):
                end = start + ngram
                if end > size:
                    # A slice running past the end would silently yield the
                    # shorter suffix again and count it under a larger n.
                    break
                gram_weight = Ngram_weights[ngram]
                if gram_weight <= 0:
                    # Disabled length: skip it but still try longer n-grams,
                    # matching the per-length filter used to build the vocab.
                    continue
                index = voc.get(text[start:end])
                if index is not None:
                    self.vec[index] += weight * gram_weight

In [8]:
# load documents
# URL -> content mapping; the context manager closes the file instead of
# leaving the handle open for the garbage collector.
with open(url2content_name, 'r') as f:
    urlcontents = json.load(f)

tfdocs = []     # Document objects in CSV row order
doc_ids = {}    # external document name -> index into tfdocs
with open(doc2url_name, 'r') as f:
    for i, line in enumerate(f):
        if i == 0:
            print(line)     # header row
            continue
        line = line.strip()
        if not line:
            continue        # tolerate blank lines (e.g. a trailing newline)
        # Split on the first comma only so a URL that itself contains commas
        # is kept whole instead of being truncated.
        doc_name, url = line.split(',', 1)  # doc_id, url
        index = len(tfdocs)
        doc = Document(index)
        doc.doc_id = doc_name
        doc.url = url
        doc.length = len(urlcontents[url])
        doc_ids[doc_name] = index
        tfdocs.append(doc)
DOC_SZ = len(tfdocs)

News_Index,News_URL



In [None]:
# Load the inverted index; the context manager closes the file once parsing
# is done instead of leaking the handle.
with open(inverted_filename, 'r') as f:
    inverted = json.load(f)

In [None]:
# load vocab
vocab = {}  # n-gram string -> vocabulary index
IDF = {}    # vocabulary index -> IDF value, floored at IDF_epsilon
for word, voc in inverted.items():
    # Lengths without a positive weight are dropped.  .get() also covers
    # lengths absent from Ngram_weights (e.g. terms longer than the configured
    # maximum), which previously raised KeyError.
    if Ngram_weights.get(len(word), 0.) <= 0:
        continue
    i = len(vocab)
    vocab[word] = i
    idf = voc['idf']
    # NOTE(review): assumes voc['idf'] equals DOC_SZ / document-frequency, so
    # N is the number of documents containing the term -- confirm against the
    # code that produced the inverted file.
    N = DOC_SZ / idf
    assert N <= DOC_SZ
    # IDF = log((DOC_SZ - N + 0.5) / (N + 0.5)), floored at IDF_epsilon.
    IDF[i] = max(IDF_epsilon, math.log((DOC_SZ - N + 0.5)/(N + 0.5)))

VOC_SZ = len(vocab)
print(VOC_SZ)

190376


In [None]:
#load raw tf
# Scatter every posting of the inverted index into its document's sparse
# vector, scaling the term frequency by the weight of the term's length.
for word, entry in tqdm(inverted.items(), total=len(inverted)):
    term_id = vocab.get(word)
    if term_id is None:
        # Term was filtered out when the vocabulary was built.
        continue
    gram_weight = Ngram_weights[len(word)]
    for posting in entry['docs']:
        for docname, tf in posting.items():
            # Each (term, document) pair is added as a new entry (is_uniq=True).
            tfdocs[doc_ids[docname]].update(term_id, tf*gram_weight, True)

HBox(children=(IntProgress(value=0, max=217118), HTML(value='')))




In [None]:
# normalize docs
# Average document length over the collection (float accumulation from 0.).
avgdl = sum((d.length for d in tfdocs), 0.) / DOC_SZ

# Convert the stored weights of every document in place.
for d in tfdocs:
    d.normalize(avgdl, IDF)
assert all(d.normalized for d in tfdocs)
print(avgdl)

738.53705


In [None]:
if not test:
    num_train = 10      # number of distinct training queries to evaluate
    train_scores = {}   # query text -> {document name: relevance}
    train_qlist = []    # Query objects for the first num_train distinct queries
    with open(training_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)     # header row
                continue
            line = line.strip()
            if not line:
                continue        # tolerate blank lines
            # Split from the right: the last two columns are the document name
            # and the relevance, so a comma inside the query text no longer
            # breaks the three-way unpacking.
            q, d, score = line.rsplit(',', 2)   # query text, doc name, relevance
            score = int(score)

            if q not in train_scores:
                train_scores[q] = {d: score}
                # Only the first num_train queries are evaluated; building a
                # Query allocates a vocabulary-sized vector, so skip the rest
                # instead of constructing them and discarding them afterwards.
                if len(train_qlist) < num_train:
                    train_qlist.append(Query("train_{:03d}".format(len(train_qlist)), q, vocab))
            else:
                train_scores[q][d] = score
    print(len(train_qlist))

Query,News_Index,Relevance

10


In [None]:
if not test:
    def _dcg(relevances):
        """Discounted cumulative gain: rel_1 + sum over p >= 2 of rel_p / log2(p)."""
        total = 0
        for i, r in enumerate(relevances):
            if i == 0:
                total += r
            else:
                total += r/math.log(i+1, 2)
        return total

    mean = 0

    for q in train_qlist:
        # Defined inside the loop, right before the pool is created; it reads
        # the module-level `q` when called in a worker.
        def subtask(d):
            return (d.doc_id, q.match(d))

        with Pool(num_threads) as p:
            chunksize = 500
            scores = list(p.imap(subtask, tfdocs, chunksize=chunksize))

        # Keep the MAXCAND highest-scoring documents.
        scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]
        assert scores[0][1] > scores[-1][1]

        # Relevance of each retrieved document; unjudged documents count as 0.
        judged = train_scores[q.text]
        myrank = [judged.get(d, 0) for d, s in scores]
        # NOTE(review): the ideal ordering is built from the retrieved
        # documents only, so relevant documents that were not retrieved do not
        # lower the score -- confirm this matches the intended metric.
        perfrank = sorted(myrank, reverse=True)

        my_dcg = _dcg(myrank)
        perf_dcg = _dcg(perfrank)

        # No relevant document retrieved: report 0 instead of dividing by zero.
        cur = my_dcg/perf_dcg if perf_dcg > 0 else 0.
        print(cur)
        mean += cur

    # Guard against an empty query list.
    print("[NDCG]", mean / len(train_qlist) if train_qlist else float('nan'))

In [None]:
# load queries
if test:
    qlist = []  # Query objects in file order

    with open(query_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)     # header row
                continue
            line = line.strip()
            if not line:
                continue        # tolerate blank lines
            # Split on the first comma only: everything after the query id is
            # query text, so commas inside the text are no longer dropped.
            qid, text = line.split(',', 1)  # q_id, text
            qlist.append(Query(qid, text, vocab))

In [None]:
if test:
    # The output path comes from `outcsv_name` in the configuration cell; it is
    # no longer re-assigned here, so changing the configuration takes effect.
    with open(outcsv_name, 'w') as writer:
        # Header row: Query_Index,Rank_001,...,Rank_<MAXCAND>
        header = ["Query_Index"] + ["Rank_{:03d}".format(i+1) for i in range(MAXCAND)]
        writer.write(",".join(header))

        for j, q in enumerate(qlist):
            print("Query {}".format(j+1))

            # Defined inside the loop, right before the pool is created; it
            # reads the module-level `q` when called in a worker.
            def subtask(d):
                return (d.doc_id, q.match(d))

            with Pool(num_threads) as p:
                chunksize = 500
                scores = list(tqdm(p.imap(subtask, tfdocs, chunksize=chunksize), total=DOC_SZ))

            # Keep the MAXCAND highest-scoring documents.
            scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]
            assert scores[0][1] > scores[-1][1]

            # Each row starts with a newline (the header has none), so the
            # file ends without a trailing newline, as before.
            writer.write('\n' + ",".join([q.qid] + [doc_id for doc_id, s in scores]))