In [1]:
import json, csv
import math
import os
import numpy as np
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool

In [2]:
inverted_filename = 'inverted_file.json'
url2content_name = 'url2content.json'
url2title_name = 'url2titles.json'
standlexi_name = 'extern/sent_lexicon2.json'

doc2url_name = 'news_data_1/NC_1.csv'
training_name = 'news_data_1/TD.csv'
query_name = 'news_data_1/QS_1.csv'

outcsv_name = 'out.csv'

In [3]:
num_threads = 4
MAXCAND = 300

Okapi_k1 = 2.
Okapi_b = 0.75
Okapi_k3 = 500.


IDF_epsilon = 1e-4

In [4]:
test = False

In [5]:
Ngram_weights = {}#{l:0. for l in range(1, 25)}
Ngram_weights[1] = 1.
Ngram_weights[2] = 1.04 #1.04: 5886
Ngram_weights[3] = 0.4
MAXGRAM = max(Ngram_weights.keys())
print(MAXGRAM)

3


In [6]:
class Document:
    def __init__(self, doc_id):
        self.doc_id = ''
        self.url = ''
        self.tfreq = {}
        self.id = doc_id
        self.length = 0
        self.normalized = False
    def normalize(self, avgdl, IDF):
        for t, f in self.tfreq.items():
            TF = (Okapi_k1+1.)*f
            dlen_norm = Okapi_k1*(1. - Okapi_b + Okapi_b * (self.length/avgdl)) + f
            self.tfreq[t] = TF/dlen_norm * IDF[t]
            assert(self.tfreq[t] > 0)
        self.normalized = True
    def update(self, term_id, tf):
        if term_id in self.tfreq:
            self.tfreq[term_id] += tf
        else:
            self.tfreq[term_id] = tf
    def getFileSize(self, contents):
        self.length = len(contents[self.url])
        return self.length
    def update_title(self, vocab):
        r = requests.get(self.url)
        soup = BeautifulSoup(r.content)
        if soup.title == None:
            print('[warning]', self.doc_id, 'has no title!')
        else:
            title = str(soup.title).replace('<title>', '').replace('</title>', '')
            self._process(title, vocab, 1.)
    def _process(self, text, voc, weight):
        size = len(text)
        for start in range(size):
            for ngram in range(1, MAXGRAM+1):
                if Ngram_weights[ngram] <= 0 or ngram > size:
                    break
                    
                end = start + ngram
                word = text[start:end]
                
                if word in voc:
                    index = voc[word]
                    self.update(index, weight * Ngram_weights[ngram])

In [7]:
def utf8len(s):
    return len(s.encode('utf-8'))

In [8]:
# load documents
urlcontents = json.load(open(url2content_name, 'r'))
tfdocs = {}
with open(doc2url_name, 'r') as f:
    for i, line in enumerate(f):
        if i == 0:
            print(line)
        else:
            fields = line.strip().split(',') # doc_id, url
            doc = Document(i-1)
            doc.doc_id = fields[0]
            doc.url = fields[1]
            doc.length = utf8len(urlcontents[fields[1]])
            tfdocs[fields[0]] = doc
DOC_SZ = len(tfdocs) 

News_Index,News_URL



In [9]:
inverted = json.load(open(inverted_filename, 'r'))

In [10]:
# load vocab
vocab = {}
IDF = {}
for (word, voc) in inverted.items():
    ngram = len(word)
    if ngram not in Ngram_weights:
        continue
    i = len(vocab)
    vocab[word] = i
    idf = voc['idf']
    N = DOC_SZ / idf
    assert N <= DOC_SZ
    IDF[i] = max(IDF_epsilon, math.log((DOC_SZ - N + 0.5)/(N + 0.5)))
#     IDF[i] = max(IDF_epsilon, math.log(idf))
    
VOC_SZ = len(vocab)
print(VOC_SZ)

190376


In [11]:
#load raw tf
for i, (word, voc) in tqdm(enumerate(inverted.items()), total=len(inverted)):
    ngram = len(word)
    if word not in vocab:
        continue
    term_id = vocab[word]
    for pair in voc['docs']:
        for docname, tf in pair.items():           
            tfdocs[docname].update(term_id, tf*Ngram_weights[ngram]);

HBox(children=(IntProgress(value=0, max=217118), HTML(value='')))




In [12]:
urltitles = json.load(open(url2title_name, 'r'))
for d in tqdm(tfdocs.values()):
    title = urltitles[d.url].strip()
    if title == '':
        #print('[warning]', d.doc_id, 'has no title !')
        pass
    else:
        d._process(title, vocab, 1.)
        d.length += utf8len(title)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [13]:
# normalize docs
avgdl = 0.
for d in tfdocs.values():
    avgdl += d.length
avgdl /= DOC_SZ

for d in tfdocs.values():
    d.normalize(avgdl, IDF)
for d in tfdocs.values():
    assert d.normalized
print(avgdl)

2177.24542


In [14]:
R_a = 0.75
R_b = 0.15
R_c = 0.1

In [15]:
class Query:
    def __init__(self, qid, text, voc):
        self.qid = qid
        self.vec = {}
        self.dim = len(voc)
        self.id = int(qid[-2:])
        self.length = 0
        self.text = text
        
        self._process(text, voc, 1.)
        self.normalize()
    def match(self, doc_freq):
        out = 0.
        for t, f in doc_freq.items():
            if t in self.vec:
                out += self.vec[t] * f
        return out
    def match_2(self, doc_freq):
        out = 0.
        for t, f in self.vec.items():
            if t in doc_freq:
                out += f * doc_freq[t]
        return out
    def normalize(self):
        for t, f in self.vec.items():
            self.vec[t] = (Okapi_k3+1.)*self.vec[t] / (Okapi_k3+self.vec[t])
        
    def _process(self, text, voc, weight):
        size = len(text)
        for start in range(size):
            for ngram in range(1, MAXGRAM+1):
                if Ngram_weights[ngram] <= 0 or ngram > size:
                    break
                    
                end = start + ngram
                word = text[start:end]
                
                if word in voc:
                    index = voc[word]
                    prev = self.vec[index] if index in self.vec else 0
                    self.vec[index] = prev + weight * Ngram_weights[ngram]
   
    def feedback(self, rel, irrel, tfdocs):
        pos = {}
        neg = {}
        dim = self.dim

        for i in rel:        
            doc = tfdocs[i]
            for t, f in doc.tfreq.items():
                if t not in pos:
                    pos[t] = 0
                pos[t] += f / len(rel)

        for i in irrel:
            doc = tfdocs[i]
            for t, f in doc.tfreq.items():
                if t not in neg:
                    neg[t] = 0
                neg[t] += f / len(irrel)

        for t in range(dim):
            a = self.vec[t] if t in self.vec else 0
            b = pos[t] if t in pos else 0
            c = neg[t] if t in neg else 0

            tmp = R_a * a + R_b * b - R_c * c
            if tmp > 1e-4:
                self.vec[t] = tmp

In [16]:
# test = False

In [23]:
standlexicon = json.load(open(standlexi_name, 'r'))

# print(standlexicon)

def adjust(q):
    #negative
    found = -1
    for w in standlexicon['negative']:
        index = vocab[w]
        if index in q.vec:
            q.vec[index] *= 1.2
            found = index
            break
    if found != -1:
        for w in standlexicon['negative']:
            index = vocab[w]
            if index != found:
                q.vec[index] = 0.33
        return
    
    #positive
    found = -1
    for w in standlexicon['positive']:
        index = vocab[w]
        if index in q.vec:
            q.vec[index] *= 1.2
            found = index
            break
    if found != -1:
        for w in standlexicon['positive']:
            index = vocab[w]
            if index != found:
                q.vec[index] = 0.33
        return

In [24]:
if not test:
    num_train = 20
    train_scores = {}
    train_qlist = []
    with open(training_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)
            else:
                fields = line.strip().split(',') # q_id, text
                q, d, score = fields

                score = int(score)

                if q not in train_scores:
                    train_scores[q] = {d:score}                
                    train_qlist.append(Query("train_{:03d}".format(len(train_qlist)), q, vocab))
                else:
                    train_scores[q][d] = score 
    # print("done")
    train_qlist = train_qlist[:num_train]
    print(len(train_qlist))

Query,News_Index,Relevance

20


In [25]:
if not test:
    mean = 0

    for q in train_qlist:        
        scores = []            
        for d in tfdocs.values():
            scores.append((d.doc_id, q.match_2(d.tfreq)))
            
########### feedback
        scores = sorted(scores, key=lambda x: -x[1])        
        
        adjust(q)        
        q.feedback([d for d, s in scores[:MAXCAND]], [d for d, s in scores[-MAXCAND:]], tfdocs)
        
        
        scores = []            
        for d in tfdocs.values():
            scores.append((d.doc_id, q.match(d.tfreq)))  
########### end            

        scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]

        myrank = [train_scores[q.text][d] if d in train_scores[q.text] else 0 for d, s in scores]
        perfrank = sorted(myrank)[::-1]

        my_dcg = 0
        for i,r in enumerate(myrank):
            if i == 0:
                my_dcg += r
            else:
                my_dcg += r/math.log(i+1, 2)

        perf_dcg = 0
        for i,r in enumerate(perfrank):
            if i == 0:
                perf_dcg += r
            else:
                perf_dcg += r/math.log(i+1, 2)

        cur = my_dcg/perf_dcg
        print(cur)
        mean += cur

    print("[NDCG]", mean / len(train_qlist))

0.575504230740311
0.6036903795433055
0.6733874672136638
0.5199861006972385
0.5957414942795558
0.6698924800999123
0.6211656671104095
0.6881379994190577
0.7004741618140462
0.8003826045757819
0.5125136037479151
0.8143218599558374
0.5713745567540826
0.8070123619642946
0.7847615450816575
0.5158764552272019
0.3141127984112516
0.9100019823285697
0.6890156995883306
0.40706759846147117
[NDCG] 0.6387210523506949


In [20]:
# test = True

In [21]:
# load queries
if test:
    qlist = []

    with open(query_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)
            else:
                fields = line.strip().split(',') # q_id, text
                qlist.append(Query(fields[0], fields[1], vocab))

In [22]:
if test:
    with open(outcsv_name, 'w') as writer:
        writer.write("Query_Index")
        for i in range(MAXCAND):
            writer.write(",Rank_{:03d}".format(i+1))
    #     writer.write("\n")


        for j,q in enumerate(qlist):
            print("Query {}".format(j+1))
            
            scores = []
            for d in tfdocs.values():
                scores.append((d.doc_id, q.match_2(d.tfreq)))
            
########### feedback
            scores = sorted(scores, key=lambda x: -x[1])
    
            adjust(q)            
            q.feedback([d for d, s in scores[:MAXCAND]], [d for d, s in scores[-MAXCAND:]], tfdocs)

            scores = []            
            for d in tfdocs.values():
                scores.append((d.doc_id, q.match(d.tfreq)))  
########### end    

            scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]
            assert scores[0][1] > scores[-1][1]

            writer.write('\n'+q.qid)
            for doc_id, s in scores:
                writer.write(','+doc_id)