In [1]:
import json, csv
import math
import os
import numpy as np
import mafan
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool

In [2]:
# !pip install --user mafan

In [3]:
inverted_filename = 'inverted_file.json'
url2content_name = 'url2content.json'
url2title_name = 'url2titles.json'
standlexi_name = 'extern/lexicon_mixed.json'

doc2url_name = 'news_data_1/NC_1.csv'
training_name = 'news_data_1/TD.csv'
query_name = 'news_data_1/QS_1.csv'

outcsv_name = 'out.csv'

In [4]:
num_threads = 4
MAXCAND = 300

Okapi_k1 = 2.
Okapi_b = 0.75
Okapi_k3 = 500.


IDF_epsilon = 1e-4

In [5]:
test = False

In [6]:
Ngram_weights = {}#{l:0. for l in range(1, 25)}
Ngram_weights[1] = 1.
Ngram_weights[2] = 1.04 #1.04: 5886
Ngram_weights[3] = 0.4
# Ngram_weights[4] = 0.4
# Ngram_weights[5] = 1.
MAXGRAM = max(Ngram_weights.keys())
print(MAXGRAM)

3


In [7]:
class Document:
    def __init__(self, doc_id):
        self.doc_id = ''
        self.url = ''
        self.tfreq = {}
        self.id = doc_id
        self.length = 0
        self.normalized = False
    def normalize(self, avgdl, IDF):
        for t, f in self.tfreq.items():
            TF = (Okapi_k1+1.)*f
            dlen_norm = Okapi_k1*(1. - Okapi_b + Okapi_b * (self.length/avgdl)) + f
            self.tfreq[t] = TF/dlen_norm * IDF[t]
            assert(self.tfreq[t] > 0)
        self.normalized = True
    def update(self, term_id, tf):
        if term_id in self.tfreq:
            self.tfreq[term_id] += tf
        else:
            self.tfreq[term_id] = tf
    def getFileSize(self, contents):
        self.length = len(contents[self.url])
        return self.length
    def update_title(self, vocab):
        r = requests.get(self.url)
        soup = BeautifulSoup(r.content)
        if soup.title == None:
            print('[warning]', self.doc_id, 'has no title!')
        else:
            title = str(soup.title).replace('<title>', '').replace('</title>', '')
            self._process(title, vocab, 1.)
    def _process(self, text, voc, weight):
        size = len(text)
        for start in range(size):
            for ngram in range(1, MAXGRAM+1):
                if Ngram_weights[ngram] <= 0 or ngram > size:
                    break
                    
                end = start + ngram
                word = text[start:end]
                
                if word in voc:
                    index = voc[word]
                    self.update(index, weight * Ngram_weights[ngram])

In [8]:
def utf8len(s):
    return len(s.encode('utf-8'))
def getngram(w):
    # check if it's chinese
    is_eng = mafan.text.contains_latin(word)
    return 99 if is_eng else len(word)

In [9]:
# load documents
urlcontents = json.load(open(url2content_name, 'r'))
tfdocs = {}
with open(doc2url_name, 'r') as f:
    for i, line in enumerate(f):
        if i == 0:
            print(line)
        else:
            fields = line.strip().split(',') # doc_id, url
            doc = Document(i-1)
            doc.doc_id = fields[0]
            doc.url = fields[1]
            doc.length = utf8len(urlcontents[fields[1]])
            tfdocs[fields[0]] = doc
DOC_SZ = len(tfdocs) 

News_Index,News_URL



In [10]:
inverted = json.load(open(inverted_filename, 'r'))

In [11]:
# load vocab
make_vocab = True
vocab_name = 'vocab.json'

if make_vocab:
    vocab = {}
    IDF = {}
    for (word, voc) in inverted.items():
        ngram = getngram(word)
        if ngram not in Ngram_weights:
            continue
        i = len(vocab)
        vocab[word] = i
#         idf = voc['idf']
#         N = DOC_SZ / idf
#         assert N <= DOC_SZ
#         IDF[i] = max(IDF_epsilon, math.log((DOC_SZ - N + 0.5)/(N + 0.5)))
    json.dump(vocab, open(vocab_name, 'w'))
else:
    vocab = json.load(open(vocab_name, 'r'))
    IDF = {}

VOC_SZ = len(vocab)
print(VOC_SZ)

186378


In [12]:
#load raw tf
for i, (word, voc) in tqdm(enumerate(inverted.items()), total=len(inverted)):
    ngram = getngram(word)
    if word not in vocab:
        continue
        
    idf = voc['idf']
    N = DOC_SZ / idf
    assert N <= DOC_SZ
    IDF[vocab[word]] = max(IDF_epsilon, math.log((DOC_SZ - N + 0.5)/(N + 0.5)))
    
    term_id = vocab[word]
    for pair in voc['docs']:
        for docname, tf in pair.items():           
            tfdocs[docname].update(term_id, tf*Ngram_weights[ngram]);

HBox(children=(IntProgress(value=0, max=217118), HTML(value='')))




In [13]:
urltitles = json.load(open(url2title_name, 'r'))
for d in tqdm(tfdocs.values()):
    title = urltitles[d.url].strip()
    if title == '':
        #print('[warning]', d.doc_id, 'has no title !')
        pass
    else:
        d._process(title, vocab, 1.)
        d.length += utf8len(title)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [14]:
# normalize docs
avgdl = 0.
for d in tfdocs.values():
    avgdl += d.length
avgdl /= DOC_SZ

for d in tqdm(tfdocs.values()):
    d.normalize(avgdl, IDF)
for d in tfdocs.values():
    assert d.normalized
print(avgdl)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


2177.24542


In [15]:
R_a = 0.75
R_b = 0.15
R_c = 0.1

In [16]:
class Query:
    def __init__(self, qid, text, voc):
        self.qid = qid
        self.vec = {}
        self.dim = len(voc)
        self.id = int(qid[-2:])
        self.length = 0
        self.text = text
        
        self._process(text, voc, 1.)
        self.normalize()
    def match(self, doc_freq):
        out = 0.
        for t, f in doc_freq.items():
            if t in self.vec:
                out += self.vec[t] * f
        return out
    def match_2(self, doc_freq):
        out = 0.
        for t, f in self.vec.items():
            if t in doc_freq:
                out += f * doc_freq[t]
        return out
    def normalize(self):
        for t, f in self.vec.items():
            self.vec[t] = (Okapi_k3+1.)*self.vec[t] / (Okapi_k3+self.vec[t])
        
    def _process(self, text, voc, weight):
        size = len(text)
        for start in range(size):
            for ngram in range(1, MAXGRAM+1):
                if Ngram_weights[ngram] <= 0 or ngram > size:
                    break
                    
                end = start + ngram
                word = text[start:end]
                
                if word in voc:
                    index = voc[word]
                    prev = self.vec[index] if index in self.vec else 0
                    self.vec[index] = prev + weight * Ngram_weights[ngram]
   
    def feedback(self, rel, irrel, tfdocs):
        pos = {}
        neg = {}
        dim = self.dim

        for i in rel:        
            doc = tfdocs[i]
            for t, f in doc.tfreq.items():
                if t not in pos:
                    pos[t] = 0
                pos[t] += f / len(rel)

        for i in irrel:
            doc = tfdocs[i]
            for t, f in doc.tfreq.items():
                if t not in neg:
                    neg[t] = 0
                neg[t] += f / len(irrel)

        for t in range(dim):
            a = self.vec[t] if t in self.vec else 0
            b = pos[t] if t in pos else 0
            c = neg[t] if t in neg else 0

            tmp = R_a * a + R_b * b - R_c * c
            if tmp > 1e-4:
                self.vec[t] = tmp

In [17]:
# test = False

In [47]:
standlexicon = json.load(open(standlexi_name, 'r'))
standlexicon['negative'] = [w for w in standlexicon['negative'] if w in vocab]
standlexicon['positive'] = [w for w in standlexicon['positive'] if w in vocab]
# print(standlexicon)

def adjust(q):
    qneg = ['反對',  '拒絕',  '不應',  '錯誤',  '不可', '不贊同', '不贊成', '不合理',
            '不對',  '不支持', '不同意', '不應該', '不正確','不可以','不合法' ]
    qpos = ['支持', '同意', '應該', '正確', '可以', '贊同', '贊成', '合理', '合法', '不反對', '對的', '應', '可', '有']
    
    stand = None
    key = None
    for s in qneg:
        if s in q.text:
            print(q.text, '[neg]')
            stand = 'neg'
            key = s
            break
    if stand == None:
        for s in qpos:
            if s in q.text:
                print(q.text, '[pos]')
                stand = 'pos'
                key = s
                break
    
    if stand == 'neg':
        if key in vocab:
            index = vocab[key]
            q.vec[index] *= 1.2
        for w in standlexicon['negative']:
            if w != key:
                index = vocab[w]            
                q.vec[index] = 0.33
    elif stand == 'pos':
        if key in vocab:
            index = vocab[key]
            q.vec[index] *= 1.2
        for w in standlexicon['positive']:
            if w != key:
                index = vocab[w]            
                q.vec[index] = 0.33
    else:
        print(q.text, '[none]')

In [48]:
if not test:
    num_train = 20
    train_scores = {}
    train_qlist = []
    with open(training_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)
            else:
                fields = line.strip().split(',') # q_id, text
                q, d, score = fields

                score = int(score)

                if q not in train_scores:
                    train_scores[q] = {d:score}                
                    train_qlist.append(Query("train_{:03d}".format(len(train_qlist)), q, vocab))
                else:
                    train_scores[q][d] = score 
    # print("done")
    train_qlist = train_qlist[:num_train]
    print(len(train_qlist))

Query,News_Index,Relevance

20


In [None]:
if not test:
    mean = 0

    for q in train_qlist:
        scores = []            
        for d in tfdocs.values():
            scores.append((d.doc_id, q.match_2(d.tfreq)))
            
########### feedback
        scores = sorted(scores, key=lambda x: -x[1])        
        
        adjust(q)        
        q.feedback([d for d, s in scores[:MAXCAND]], [d for d, s in scores[-MAXCAND:]], tfdocs)
        
        
        scores = []            
        for d in tfdocs.values():
            scores.append((d.doc_id, q.match(d.tfreq)))  
########### end            

        scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]

        myrank = [train_scores[q.text][d] if d in train_scores[q.text] else 0 for d, s in scores]
        perfrank = sorted(myrank)[::-1]

        my_dcg = 0
        for i,r in enumerate(myrank):
            if i == 0:
                my_dcg += r
            else:
                my_dcg += r/math.log(i+1, 2)

        perf_dcg = 0
        for i,r in enumerate(perfrank):
            if i == 0:
                perf_dcg += r
            else:
                perf_dcg += r/math.log(i+1, 2)

        cur = my_dcg/perf_dcg
        print(cur)
        mean += cur

    print("[NDCG]", mean / len(train_qlist))

支持陳前總統保外就醫 [pos]
0.5757201239264498
年金改革應取消或應調降軍公教月退之優存利率十八趴 [pos]
0.5844330101457207
同意動物實驗 [pos]
0.6701215770637953
油價應該凍漲或緩漲 [pos]
0.5399031567289836
反對旺旺中時併購中嘉 [neg]
0.5990541903993526
另立專法保障同婚是正確的 [pos]
0.6385719759009506
反對無圍牆校園 [neg]
0.6210006158369632
國際賽事會場內應該可以持中華民國國旗 [pos]
0.7040066921611015
贊同課綱微調 [pos]
0.6973155009736804
贊成流浪動物零撲殺 [pos]
0.7895936440196538
核四應該啟用 [pos]
0.507705945617895
贊成文林苑都更案可依法拆除王家 [pos]
0.8120092022009424
十二年國教高中職「免學費補助」適用對象增加是不對的 [neg]
0.5586355366193766
堅決反對政府舉債發展前瞻建設計畫 [neg]
0.8072300572594174
遠雄大巨蛋工程應停工或拆除 [pos]
0.7845433747537782
支持正名「臺灣」參與國際運動賽事 [pos]
0.5170271545043451
拒絕公投通過門檻下修 [neg]
0.3176032028368193
應該提高酒駕罰責以有效遏制酒駕 [pos]


In [43]:
test = True

In [41]:
# load queries
if test:
    qlist = []

    with open(query_name, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line)
            else:
                fields = line.strip().split(',') # q_id, text
                qlist.append(Query(fields[0], fields[1], vocab))

Query_Index,Query



In [42]:
if test:
    with open(outcsv_name, 'w') as writer:
        writer.write("Query_Index")
        for i in range(MAXCAND):
            writer.write(",Rank_{:03d}".format(i+1))
    #     writer.write("\n")


        for j,q in enumerate(qlist):
            print("Query {}".format(j+1))
            
            scores = []
            for d in tfdocs.values():
                scores.append((d.doc_id, q.match_2(d.tfreq)))
            
########### feedback
            scores = sorted(scores, key=lambda x: -x[1])
    
            adjust(q)            
            q.feedback([d for d, s in scores[:MAXCAND]], [d for d, s in scores[-MAXCAND:]], tfdocs)

            scores = []            
            for d in tfdocs.values():
                scores.append((d.doc_id, q.match(d.tfreq)))  
########### end    

            scores = sorted(scores, key=lambda x: -x[1])[:MAXCAND]
            assert scores[0][1] > scores[-1][1]

            writer.write('\n'+q.qid)
            for doc_id, s in scores:
                writer.write(','+doc_id)

Query 1
通姦在刑法上應該除罪化 [pos]
Query 2
應該取消機車強制二段式左轉(待轉) [pos]
Query 3
支持博弈特區在台灣合法化 [pos]
Query 4
中華航空空服員罷工是合理的 [pos]
Query 5
性交易應該合法化 [pos]
Query 6
ECFA早收清單可（有）達到其預期成效 [pos]
Query 7
應該減免證所稅 [pos]
Query 8
贊成中油在觀塘興建第三天然氣接收站 [pos]
Query 9
支持中國學生納入健保 [pos]
Query 10
支持臺灣中小學（含高職、專科）服儀規定（含髮、襪、鞋）給予學生自主 [pos]
Query 11
不支持使用加密貨幣 [neg]
Query 12
不支持學雜費調漲 [neg]
Query 13
同意政府舉債發展前瞻建設計畫 [pos]
Query 14
支持電競列入體育競技 [pos]
Query 15
反對台鐵東移徵收案 [neg]
Query 16
支持陳前總統保外就醫 [pos]
Query 17
年金改革應取消或應調降軍公教月退之優存利率十八趴 [pos]
Query 18
同意動物實驗 [pos]
Query 19
油價應該凍漲或緩漲 [pos]
Query 20
反對旺旺中時併購中嘉 [neg]
