In [1]:
import os
import jieba
import jieba.posseg as pseg
import math
import operator
import sqlite3
import pandas as pd
import json
import utils

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

## Preprocess Data

In [2]:
if os.path.exists('../Wiki/'):
    print('Wiki file is exist.')
else:
    # 放置Wiki原始資料的資料夾
    os.mkdir('../Wiki/')
    print('處理Wiki檔案中...')
    wiki_sen = utils.wiki_num_sentence('../wiki-pages')
    wiki_doc = utils.wiki_doc('../wiki-pages')
    wiki_arctext = utils.wiki_arctext_doc('../wiki-pages')
    wiki_numtext = utils.wiki_numtext_doc('../wiki-pages')
    print('處理完成')

處理Wiki檔案中...
../wiki-pages//.DS_Store


ValueError: Expected object or value

In [3]:
def Claim_evidence(DataPath, SavePath):
    claim = pd.read_json(path_or_buf=DataPath, lines=True)
    wiki = pd.read_csv('../Wiki/wiki_clean.csv', keep_default_na=False, na_values=[' '])
    func = lambda x:[y for l in x for y in func(l)] if type(x[0]) == list else [x]
    text = []
    for i in tqdm(claim['evidence']):
        evs = func(i)
        txs = []
        for ev in evs:
            try:
                txs.append(wiki[(wiki['Article']==ev[2]) & (wiki['Num'] == ev[3])].Text.tolist()[0])
            except: pass
        text.append(txs)
    claim['text'] = text
    tem = []
    for s in claim['claim']:
        tem.append(utils.clean_space(s))
    claim['claim'] = tem
    claim.to_csv(SavePath, index=False)
    return claim

if os.path.exists('../data_preprocessing/'):
    print('data preprocessing file is exist.')
else:
    os.mkdir('../data_preprocessing/')
    print('處理訓練檔案中...')
    claim1 = Claim_evidence('./Data/public_train_0316.jsonl', '../data_preprocessing/claim_evidence_train1.csv')
    claim2 = Claim_evidence('./Data/public_train_0522.jsonl', '../data_preprocessing/claim_evidence_train2.csv')
    mix = pd.concat([claim1, claim2], axis=0, ignore_index=True) 
    mix.to_csv('../data_preprocessing/claim_evidence_train_all.csv', index=False )
    print('處理完成')

100%|███████████████████████████████████████| 3942/3942 [15:40<00:00,  4.19it/s]
100%|███████████████████████████████████████| 7678/7678 [27:25<00:00,  4.67it/s]


## Document Retrieval

In [4]:
def clean_list(seg_list):
    cleaned_dict = {}
    for i in seg_list:
        i = i.strip().lower()
        if i != '' and (i not in stop_words):
            if i in cleaned_dict:
                cleaned_dict[i] = cleaned_dict[i] + 1
            else:
                cleaned_dict[i] = 1
    return cleaned_dict

In [5]:
def fetch_from_db(db, term):
    c = db.cursor()
    c.execute('SELECT * FROM postings WHERE term=?', (term,))
    return(c.fetchone())

In [6]:
def BM25F(sentence):
    K1 = 1.2
    B = 0.75
    N = len(wiki)
    AVG_L = 130
    
    words = pseg.cut(sentence)
    seg_list = [w.word for w in words if (w.flag.startswith('n') or w.flag.startswith('v'))]
    cleaned_dict = clean_list(seg_list)
    BM25_scores = {}
    for term in cleaned_dict.keys():
        r = fetch_from_db(conn, term.lower())
        t = fetch_from_db(cont, term.lower())
        if (r is None) and (t is None):
            continue
        if t is None:
            title = []
        else:
            titles = t[2].split('\n')
            df_t = t[1]
            idf_t = math.log2((N - df_t + 0.5) / (df_t + 0.5)) #idf
            f = idf_t/(df_t + K1)
        try:
            docs = r[2].split('\n')
            df = r[1]
            idf = math.log2((N - df + 0.5) / (df + 0.5)) #idf
            for doc in docs:
                docid, tf, ld = doc.split('\t')
                docid = int(docid)
                tf = int(tf)
                ld = int(ld)
                if str(docid) in titles:
                    s = (K1 * tf * idf) / (tf + K1 * (1 - B + B * ld / AVG_L))
                    s = (s * (K1 + f)) / (f * B)
                else:
                    s = (K1 * tf * idf) / (tf + K1 * (1 - B + B * ld / AVG_L))
                if docid in BM25_scores:
                    BM25_scores[docid] = BM25_scores[docid] + s
                else:
                    BM25_scores[docid] = s
        except:pass
        try:
            for title in titles:
                title = int(title)
                if title not in BM25_scores:
                    BM25_scores[title] = f
        except:pass
    BM25_scores = sorted(BM25_scores.items(), key = operator.itemgetter(1))
    BM25_scores.reverse()
    if len(BM25_scores) == 0:
        return []
    else:
        return BM25_scores

In [7]:
def train_data(DataPath, SavePath):
    train = pd.read_json(path_or_buf=DataPath, lines=True)
    claim = list(train['claim'])
    docs = []
    for sen in tqdm(claim):
        docs.append([i[0] for i in BM25F(sen)[0:5]])
    df_j = train[['id', 'claim', 'label']]
    '''
    0:supports
    1:refutes
    2:NOT ENOUGH INFO
    '''
    df_j.loc[df_j['label'] == 'supports', 'label' ] = 0
    df_j.loc[df_j['label'] == 'refutes', 'label' ] = 1
    df_j.loc[df_j['label'] == 'NOT ENOUGH INFO', 'label' ] = 2
    evidence = []
    for i in docs:
        evs = []
        for ev in i:
            evs.append({ "Article":wiki['Article'][ev], "Text":eval(wiki['Text'][ev])})
        evidence.append(dict(enumerate(evs)))
    claim_data = df_j[['id', 'claim', 'label']].to_dict('records')
    for i in range(len(claim_data)):
        claim_data[i].update({'evidence':evidence[i]})
    with open(SavePath, "w", encoding='utf-8') as outfile:
#         json.dump(claim_data, outfile, indent=4, ensure_ascii=False)
        json.dump(claim_data, outfile, ensure_ascii=False)
    return claim_data

In [8]:
def test_data(DataPath, SavePath):
    test = pd.read_json(path_or_buf=DataPath, lines=True)
    claim = list(test['claim'])
    docs = []
    for sen in tqdm(claim):
        docs.append([i[0] for i in BM25F(sen)[0:5]])
    df_j = test
    df_j['label'] = 2
    evidence = []
    for i in docs:
        evs = []
        if i == []:
            evs.append({"Article":'None', "Text":[]})
        else:
            for ev in i:
                evs.append({"Article":wiki['Article'][ev], "Text":eval(wiki['Text'][ev])})
        evidence.append(dict(enumerate(evs)))
    claim_data = df_j[['id', 'claim', 'label']].to_dict('records')
    for i in range(len(claim_data)):
        claim_data[i].update({'evidence':evidence[i]})
    with open(SavePath, "w", encoding='utf-8') as outfile:
        json.dump(claim_data, outfile, ensure_ascii=False)
    return claim_data

In [9]:
if __name__ == "__main__":
    conn = sqlite3.connect('../ir.db')
    cont = sqlite3.connect('../ir_title.db')
    
    if os.path.exists('./userdict.txt'):
        jieba.load_userdict('./userdict.txt')
    else:
        utils.new_userdict('../Wiki/wiki_clean_doc.csv', './userdict.txt')
        jieba.load_userdict('./userdict.txt')
    f = open('./stopwords.txt', encoding = 'utf-8')
    words = f.read()
    stop_words = set(words.split('\n'))
    wiki = pd.read_csv('../Wiki/wiki_clean_numtext_doc.csv', keep_default_na=False, na_values=[' '])
    train_0316 = train_data('./Data/public_train_0316.jsonl', './Result/claim_train_BM25F_v3.json')
    train_0522 = train_data('./Data/public_train_0522.jsonl', './Result/claim_train2_BM25F_v3.json')
    test_public = test_data('./Data/public_test_data.jsonl', './Result/claim_test_BM25F_v3.json')
    test_private = test_data('./Data/private_test_data.jsonl', './Result/claim_private_test_BM25F_v3.json')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/46/bn7t4hx56ws0wqtm45j0m6_r0000gn/T/jieba.cache
Loading model cost 0.562 seconds.
Prefix dict has been built successfully.
100%|█████████████████████████████████████████| 989/989 [02:12<00:00,  7.48it/s]
