In [1]:
'''
https://github.com/01joy/news-search-engine/tree/master
https://github.com/isnowfy/snownlp/blob/master/snownlp/sim/bm25.py
'''

import jieba
import jieba.posseg as pseg
import math
import operator
import sqlite3
import pandas as pd
from tqdm import tqdm
import json
import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
jieba.load_userdict('./Doc_retrieval/userdict.txt')
f = open('./Doc_retrieval/stopwords.txt', encoding = 'utf-8')
words = f.read()
stop_words = set(words.split('\n'))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/46/bn7t4hx56ws0wqtm45j0m6_r0000gn/T/jieba.cache
Loading model cost 0.460 seconds.
Prefix dict has been built successfully.


In [3]:
wiki = pd.read_csv('./Wiki/wiki_clean_numtext_doc.csv', keep_default_na=False, na_values=[' '])
# wiki = pd.read_csv('./Wiki/wiki_clean_doc.csv', keep_default_na=False, na_values=[' '], encoding='utf-8')

In [4]:
def clean_list(seg_list):
    cleaned_dict = {}
    for i in seg_list:
        i = i.strip().lower()
        if i != '' and (i not in stop_words):
            if i in cleaned_dict:
                cleaned_dict[i] = cleaned_dict[i] + 1
            else:
                cleaned_dict[i] = 1
    return cleaned_dict

def fetch_from_db(db, term):
    c = db.cursor()
    c.execute('SELECT * FROM postings WHERE term=?', (term,))
    return(c.fetchone())

In [5]:
conn = sqlite3.connect('./Doc_retrieval/ir.db') #V1~V3
cont = sqlite3.connect('./Doc_retrieval/ir_title.db') #V1~V3
# conn = sqlite3.connect('./Doc_retrieval/ir_withENG.db') #V4
# cont = sqlite3.connect('./Doc_retrieval/ir_title_lower.db') #V4
# K1 = 1.2
# # K1 = 1.5
# B = 0.75
# N = len(wiki)
# AVG_L = 32
# # AVG_L = 130

In [6]:
'''
V2
K1 = 1.2
B = 0.75
N = len(wiki)
AVG_L = 32
======================
V3
K1 = 1.2
B = 0.75
N = len(wiki)
AVG_L = 130
======================
V4
K1 = 1.2
B = 0.75
N = len(wiki)
AVG_L = 130
'''
K1 = 1.2
B = 0.75
N = len(wiki)
AVG_L = 130
def BM25F(sentence):
    words = pseg.cut(sentence)
    seg_list = [w.word for w in words if (w.flag.startswith('n') or w.flag.startswith('v'))] # V1~V3
#     seg_list = [w.word for w in words if (w.flag.startswith('n') or w.flag.startswith('v') or w.flag.startswith('eng'))] # V4
    cleaned_dict = clean_list(seg_list)
    BM25_scores = {}
    for term in cleaned_dict.keys():
        r = fetch_from_db(conn, term.lower())
        t = fetch_from_db(cont, term.lower())
        if (r is None) and (t is None):
            continue
        if t is None:
            title = []
        else:
            titles = t[2].split('\n')
            df_t = t[1]
            idf_t = math.log2((N - df_t + 0.5) / (df_t + 0.5)) #idf
            f = idf_t/(df_t + K1)
        try:
            docs = r[2].split('\n')
            df = r[1]
            idf = math.log2((N - df + 0.5) / (df + 0.5)) #idf
            for doc in docs:
                docid, tf, ld = doc.split('\t')
                docid = int(docid)
                tf = int(tf)
                ld = int(ld)
                if str(docid) in titles:
                    s = (K1 * tf * idf) / (tf + K1 * (1 - B + B * ld / AVG_L))
                    s = (s * (K1 + f)) / (f * B)
                else:
                    s = (K1 * tf * idf) / (tf + K1 * (1 - B + B * ld / AVG_L))
                if docid in BM25_scores:
                    BM25_scores[docid] = BM25_scores[docid] + s
                else:
                    BM25_scores[docid] = s
        except:pass
        try:
            for title in titles:
                title = int(title)
#                 f = idf_t/(df_t + K1)
#                 f = idf_t/(df_t)
                if title not in BM25_scores:
                    BM25_scores[title] = f
        except:pass
    BM25_scores = sorted(BM25_scores.items(), key = operator.itemgetter(1))
    BM25_scores.reverse()
    if len(BM25_scores) == 0:
        return []
#         return 0, []
    else:
        return BM25_scores
#         return 1, BM25_scores

In [7]:
# '''
# V1
# K1 = 1.5
# B = 0.75
# N = len(wiki)
# AVG_L = 130
# '''
# K1 = 1.5
# B = 0.75
# N = len(wiki)
# AVG_L = 130

# def BM25F(sentence):
#     words = pseg.cut(sentence)
#     seg_list = [w.word for w in words if (w.flag.startswith('n') or w.flag.startswith('v'))]
#     cleaned_dict = clean_list(seg_list)
#     BM25_scores = {}
#     for term in cleaned_dict.keys():
#         r = fetch_from_db(conn, term)
#         t = fetch_from_db(cont, term)
#         if r and t is None:
#             continue
#         try:
#             titles = t[2].split('\n')
#             df_t = t[1]
#             idf_t = math.log2((N - df_t + 0.5) / (df_t + 0.5)) #idf
#             for title in titles:
#                 title = int(title)
#                 f = idf_t/(df_t)
#                 if title in BM25_scores:
#                     BM25_scores[title] = BM25_scores[title] + f
#                 else:
#                     BM25_scores[title] = f
#         except:pass
#         try:
#             df = r[1]
#             idf = math.log2((N - df + 0.5) / (df + 0.5)) #idf
#             docs = r[2].split('\n')
#             for doc in docs:
#                 docid, tf, ld = doc.split('\t')
#                 docid = int(docid)
#                 tf = int(tf)
#                 ld = int(ld)
#                 s = (K1 * tf * idf) / (tf + K1 * (1 - B + B * ld / AVG_L))
#                 if docid in BM25_scores:
#                     BM25_scores[docid] = BM25_scores[docid] + s
#                 else:
#                     BM25_scores[docid] = s
#         except:pass
#     BM25_scores = sorted(BM25_scores.items(), key = operator.itemgetter(1))
#     BM25_scores.reverse()
#     if len(BM25_scores) == 0:
#         return []
# #         return 0, []
#     else:
#         return BM25_scores
# #         return 1, BM25_scores

## Tarin

In [7]:
# train = pd.read_json(path_or_buf='./訓練資料集/public_train_0316.jsonl', lines=True)
train = pd.read_json(path_or_buf='./訓練資料集/public_train_0522.jsonl', lines=True)

In [8]:
train.head(3)

Unnamed: 0,id,label,claim,evidence
0,14301,NOT ENOUGH INFO,崔維斯·米勒是退役美國職業棒球大聯盟投手，目前于飛機擔任運動指導員。,"[[13414, None, None, None]]"
1,2469,supports,氣象衛星是一種人造衛星，可以呈現地球天氣的變化和大氣特徵。,"[[[2053, 2226, 氣象衛星, 0]]]"
2,7137,supports,在臺灣臺南市東區虎尾寮重劃區裏面的臺南市東區復興國民小學，旁邊有復興國民中學。,"[[[6236, 6059, 臺南市東區復興國民小學, 0]]]"


In [9]:
wiki.head(3)

Unnamed: 0,Article,Text
0,數學,"[[0, '數學，是研究數量、結構以及空間等概念及其變化的一門學科，屬於形式科學的一種。']..."
1,文學,"[[0, '文學（literature），在狹義上，是一種語言藝術，亦即使用語言文字爲手段，..."
2,哲學,"[[0, '哲學（philosophy）是研究普遍的、基本問題的學科，包括存在、知識、價值、..."


In [10]:
claim = list(train['claim'])

In [11]:
docs = []
for sen in tqdm(claim):
    docs.append([i[0] for i in BM25F(sen)[0:5]])
#     docs.append([wiki['Article'][i[0]] for i in BM25F(sen)[0:5]])

100%|███████████████████████████████████████| 7678/7678 [19:01<00:00,  6.73it/s]


In [17]:
# with open('./Doc_retrieval/train_BM25F_K12_L32.pickle', 'wb') as f:
#     pickle.dump(docs, f)

In [24]:
# with open('./Data/train_high5_article_epoch50.pickle', 'rb') as f:
#     data = pickle.load(f)

In [25]:
# uni = []
# for i in range(len(data)):
# #     uni += [list(set(docs[i]).union(data[i]))]
#     tem = []
#     for d in list(set(docs[i]).union(data[i])):
#         if eval(wiki['Text'][d]) != []:
#             tem.append(d)
#     uni.append(tem)
# docs = uni

In [19]:
df_j = train[['id', 'claim', 'label']]

In [20]:
'''
0:supports
1:refutes
2:NOT ENOUGH INFO
'''
df_j.loc[df_j['label'] == 'supports', 'label' ] = 0
df_j.loc[df_j['label'] == 'refutes', 'label' ] = 1
df_j.loc[df_j['label'] == 'NOT ENOUGH INFO', 'label' ] = 2

In [21]:
evidence = []
for i in docs:
    evs = []
    for ev in i:
        evs.append({ "Article":wiki['Article'][ev], "Text":eval(wiki['Text'][ev])})
    evidence.append(dict(enumerate(evs)))

In [22]:
claim_data = df_j[['id', 'claim', 'label']].to_dict('records')

In [23]:
for i in range(len(claim_data)):
    claim_data[i].update({'evidence':evidence[i]})

In [24]:
with open("./Data/claim_train2_BM25F_v4.json", "w", encoding='utf-8') as outfile:
#     json.dump(claim_data, outfile, indent=4, ensure_ascii=False)
    json.dump(claim_data, outfile, ensure_ascii=False)

## Test

In [7]:
# test = pd.read_json(path_or_buf='./predict/public_test_data.jsonl', lines=True)
test = pd.read_json(path_or_buf='./private/private_test_data.jsonl', lines=True)

In [8]:
test.head(3)

Unnamed: 0,id,claim
0,21498,雞形目的鳥腿腳強健，擅長在地面奔跑，其中有珍稀物種，體態雄健優美、顏色鮮豔；也有經濟物種，與...
1,13037,教會剛建立時為解決內部的一些問題，使徒們寫下許多便條，其中有八卷不是保羅寫的為大公書信。
2,18652,羅伯·昆蘭於明尼蘇達州聖保羅市出生。


In [9]:
claim = list(test['claim'])

In [None]:
docs = []
for sen in tqdm(claim):
    docs.append([i[0] for i in BM25F(sen)[0:5]])
#     docs.append([wiki['Article'][i[0]] for i in BM25F(sen)[0:5]])

  1%|▎                                        | 50/8049 [00:09<29:24,  4.53it/s]

In [16]:
df_j = test
df_j['label'] = 2

In [17]:
evidence = []
for i in docs:
    evs = []
    if i == []:
        evs.append({"Article":'None', "Text":[]})
    else:
        for ev in i:
            evs.append({"Article":wiki['Article'][ev], "Text":eval(wiki['Text'][ev])})
    evidence.append(dict(enumerate(evs)))

In [18]:
claim_data = df_j[['id', 'claim', 'label']].to_dict('records')

In [19]:
for i in range(len(claim_data)):
    claim_data[i].update({'evidence':evidence[i]})

In [20]:
with open("./Predict/claim_test_BM25F_v3.json", "w", encoding='utf-8') as outfile:
# with open("./Private/claim_private_test_BM25F_v3.json", "w", encoding='utf-8') as outfile:
#     json.dump(claim_data, outfile, indent=4, ensure_ascii=False)
    json.dump(claim_data, outfile, ensure_ascii=False)

## Check

In [12]:
func = lambda x:[y for l in x for y in func(l)] if type(x)==list else [x]

evidence = []
for i in train['evidence']:
    ev_title = []
    evs = func(i)
    for ev in evs:
        if type(ev) == str:
            ev_title.append(ev)
    evidence.append(list(set(ev_title)))

In [16]:
docs2 = []
for i in docs:
    temp = []
    for d in i:
        a = wiki['Article'][d]
        temp.append(a)
    docs2.append(temp)

In [17]:
not_catch = []
for i in range(len(evidence)):
    tem = []
#     if train['label'][i] != 'NOT ENOUGH INFO':
    for ev in evidence[i]:
        if ev not in docs2[i]:
            tem.append(ev)
    not_catch.append(tem)

In [15]:
a = 0
for i in train['label']:
    if i != 'NOT ENOUGH INFO':
        a += 1
a

5131

In [16]:
b = 0
for i in not_catch:
    if i == []:
        b+=1
print(b/a)

0.9056714090820502


In [17]:
check = test[['claim']]

In [30]:
check['evi'] = evidence
check['notcatch'] = not_catch

In [18]:
check['v3'] = docs2

In [19]:
check.to_csv('check.csv', index=False )