# Baseline
python: 3.8.*

use ```Ctrl + ]``` to collapse all section :)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !gdown --folder 1T6jpOtdf_i6XNYA6F_lqU4mRRh1xYPcl
# !mv baseline/* ./

!cp -r /content/drive/MyDrive/ShareDesktop/政大資管所/_課程/自然語言處理/final/report/* ./
# !cp -r /content/drive/MyDrive/ShareDesktop/政大資管所/_課程/自然語言處理/final/baseline/data2/wiki-pages ./

In [3]:
%pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting ipywidgets==8.0.5 (from -r requirements.txt (line 3))
  Downloading ipywidgets-8.0.5-py3-none-any.whl (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandarallel==1.6.4 (from -r requirements.txt (line 4))
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboard==2.11.0 (from -r requirements.txt (line 7))
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==1.13.1+cu116 (from -r requirements.txt (line 8))
  Downloading https://download.pytorch.org/whl/cu116/torch-1.13.1%2Bcu116-cp310-cp310-linux_x86_64.

notebook1
# PART 1. Document retrieval

In [4]:
# built-in libs
import json
import pickle
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

# 3rd party libs
import hanlp
import opencc
import pandas as pd
import wikipedia
from hanlp.components.pipeline import Pipeline
from pandarallel import pandarallel

# our own libs
from utils import load_json

pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=10)
wikipedia.set_lang("zh")

top_n_doc = 10

In [5]:
CONVERTER_T2S = opencc.OpenCC("t2s.json")
CONVERTER_S2T = opencc.OpenCC("s2t.json")

In [6]:
@dataclass
class Claim:
    data: str

@dataclass
class AnnotationID:
    id: int

@dataclass
class EvidenceID:
    id: int

@dataclass
class PageTitle:
    title: str

@dataclass
class SentenceID:
    id: int

@dataclass
class Evidence:
    data: List[List[Tuple[AnnotationID, EvidenceID, PageTitle, SentenceID]]]

In [7]:
def do_st_corrections(text: str) -> str:
    simplified = CONVERTER_T2S.convert(text)

    return CONVERTER_S2T.convert(simplified)

In [8]:
def get_nps_hanlp(
    predictor: Pipeline,
    d: Dict[str, Union[int, Claim, Evidence]],
) -> List[str]:
    claim = d["claim"]
    tree = predictor(claim)["con"]
    nps = [
        do_st_corrections("".join(subtree.leaves()))
        for subtree in tree.subtrees(lambda t: t.label() == "NP")
    ]

    return nps

In [9]:
def calculate_precision(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> None:
    precision = 0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Extract all ground truth of titles of the wikipedia pages
        # evidence[2] refers to the title of the wikipedia page
        gt_pages = set([
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        ])

        predicted_pages = predictions.iloc[i]
        hits = predicted_pages.intersection(gt_pages)
        if len(predicted_pages) != 0:
            precision += len(hits) / len(predicted_pages)

        count += 1

    # Macro precision
    print(f"Precision: {precision / count}")


def calculate_recall(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> None:
    recall = 0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        gt_pages = set([
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        ])
        predicted_pages = predictions.iloc[i]
        hits = predicted_pages.intersection(gt_pages)
        recall += len(hits) / len(gt_pages)
        count += 1

    print(f"Recall: {recall / count}")

In [10]:
def save_doc(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
    mode: str = "train",
) -> None:
    with open(
        f"data/{mode}_doc{top_n_doc}.jsonl",
        "w",
        encoding="utf8",
    ) as f:
        for i, d in enumerate(data):
            d["predicted_pages"] = list(predictions.iloc[i])
            f.write(json.dumps(d, ensure_ascii=False) + "\n")

In [11]:
def get_pred_pages(series_data: pd.Series) -> Set[Dict[int, str]]:
    results = []
    tmp_muji = []
    # wiki_page: its index showned in claim
    mapping = {}
    claim = series_data["claim"]
    nps = series_data["hanlp_results"]
    first_wiki_term = []

    for i, np in enumerate(nps):
        # Simplified Traditional Chinese Correction
        wiki_search_results = [
            do_st_corrections(w) for w in wikipedia.search(np)
        ]

        # Remove the wiki page's description in brackets
        wiki_set = [re.sub(r"\s\(\S+\)", "", w) for w in wiki_search_results]
        wiki_df = pd.DataFrame({
            "wiki_set": wiki_set,
            "wiki_results": wiki_search_results
        })

        # Elements in wiki_set --> index
        # Extracting only the first element is one way to avoid extracting
        # too many of the similar wiki pages
        grouped_df = wiki_df.groupby("wiki_set", sort=False).first()
        candidates = grouped_df["wiki_results"].tolist()
        # muji refers to wiki_set
        muji = grouped_df.index.tolist()

        for prefix, term in zip(muji, candidates):
            if prefix not in tmp_muji:
                matched = False

                # Take at least one term from the first noun phrase
                if i == 0:
                    first_wiki_term.append(term)

                # Walrus operator :=
                # https://docs.python.org/3/whatsnew/3.8.html#assignment-expressions
                # Through these filters, we are trying to figure out if the term
                # is within the claim
                if (((new_term := term) in claim) or
                    ((new_term := term.replace("·", "")) in claim) or
                    ((new_term := term.split(" ")[0]) in claim) or
                    ((new_term := term.replace("-", " ")) in claim)):
                    matched = True

                elif "·" in term:
                    splitted = term.split("·")
                    for split in splitted:
                        if (new_term := split) in claim:
                            matched = True
                            break

                if matched:
                    # post-processing
                    term = term.replace(" ", "_")
                    term = term.replace("-", "")
                    results.append(term)
                    mapping[term] = claim.find(new_term)
                    tmp_muji.append(new_term)

    # 5 is a hyperparameter
    if len(results) > top_n_doc:
        assert -1 not in mapping.values()
        results = sorted(mapping, key=mapping.get)[:top_n_doc]
    elif len(results) < 1:
        results = first_wiki_term

    return set(results)

google help function

In [12]:
from googlesearch import search
from urllib.parse import unquote
from tqdm import tqdm
from time import sleep

def google_search(query_list):
  # 指定要搜尋的網域
  site = " site:https://zh.wikipedia.org/"
  pages = []

  # 執行 Google 搜尋並設定搜尋結果的數量
  for query in query_list:
    search_results = search(query+site, num=10, lang='zh')

    # 迭代搜尋結果
    for i, result in enumerate(search_results):
      page = unquote(result.split('/')[-1])
      page = do_st_corrections(page)
      pages.append(page)
      if i >= 10-1:
        break
  
  sleep(1.5)
  return pages


def macro_doc_recall(data, predicted_pages):
    doc_recall = 0
    N = 0
    for d, pages in zip(data, predicted_pages):
        pages_set = set(pages)

        if d['label'] != "NOT ENOUGH INFO":
            N += 1
            # doc
            gt = []
            for evidence in d['evidence']:
                for sent in evidence:
                    gt.append(sent[2])
            gt = set(gt)
            doc_recall += len(gt.intersection(pages_set)) / len(gt)

    doc_recall /= N    
    print(f"Doc Recall: {doc_recall:.4f}")
    return doc_recall

### Step 1. Get noun phrases from hanlp consituency parsing tree

Setup [HanLP](https://github.com/hankcs/HanLP) predictor (1 min)

We will skip this process which for creating parsing tree when demo on class

In [13]:
%%time
train_data_name = 'public_train'
TRAIN_DATA = load_json(f"data/{train_data_name}.jsonl")

# train_data_name = 'public_train_0522'
# TRAIN_DATA = load_json(f"data/{train_data_name}.jsonl")
hanlp_file = f"data/hanlp_con_results_{train_data_name}.pkl"

if Path(hanlp_file).exists():
    with open(hanlp_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    predictor = (hanlp.pipeline().append(
        hanlp.load("FINE_ELECTRA_SMALL_ZH"),
        output_key="tok",
    ).append(
        hanlp.load("CTB9_CON_ELECTRA_SMALL"),
        output_key="con",
        input_key="tok",
    ))
    
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TRAIN_DATA]
    with open(hanlp_file, "wb") as f:
        pickle.dump(hanlp_results, f)

Downloading https://file.hankcs.com/hanlp/tok/fine_electra_small_20220615_231803.zip to /root/.hanlp/tok/fine_electra_small_20220615_231803.zip
Decompressing /root/.hanlp/tok/fine_electra_small_20220615_231803.zip to /root/.hanlp/tok
Downloading https://file.hankcs.com/hanlp/utils/char_table_20210602_202632.json.zip to /root/.hanlp/utils/char_table_20210602_202632.json.zip
Decompressing /root/.hanlp/utils/char_table_20210602_202632.json.zip to /root/.hanlp/utils
Downloading https://file.hankcs.com/hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip
Decompressing /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers
Downloading https://file.hankcs.com/hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip to /root/.hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip
Decompressing /root/.hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip to /root/.hanlp/

CPU times: user 5.68 s, sys: 1.9 s, total: 7.58 s
Wall time: 50.5 s


In [14]:
train_doc_path = f"./data/{train_data_name}_doc10.pkl"

train_df = pd.DataFrame(TRAIN_DATA)
train_df.loc[:, 'hanlp_results'] = hanlp_results

train_df = train_df.set_index('id')
train_df

Unnamed: 0_level_0,label,claim,evidence,hanlp_results
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2663,refutes,天衛三軌道在天王星內部的磁層，以《 仲夏夜之夢 》作者緹坦妮雅命名。,"[[[4209, 4331, 天衛三, 2]]]","[天衛三軌道在天王星內部的磁層, 天衛三軌道, 天衛, 軌道, 天王星內部, 磁層, 《仲夏..."
2399,refutes,信天翁科的活動範圍位於北冰洋以及南太平洋，牠的翼展可達到3.7米，是世界上現存的翼展最大的鳥類。,"[[[2719, 2928, 信天翁科, 2]]]","[信天翁科的活動範圍, 信天翁科, 活動範圍, 北冰洋以及南太平洋, 北冰洋, 南太平洋, ..."
8075,NOT ENOUGH INFO,F.I.R. 的 團員有主唱Faye飛 （ 詹雯婷 ） 、 吉他手Real阿沁 （ 黃漢青 ...,"[[7208, None, None, None]]","[F.I.R.的團員, F.I.R., 團員, 主唱Faye飛（詹雯婷）、吉他手Real阿沁..."
8931,NOT ENOUGH INFO,香港國際機場全年24小時運作，它從2001年起一直躋身世界最佳機場 ， 並8度獲評級爲全宇宙...,"[[8162, None, None, None]]","[香港國際機場, 24小時, 小時, 它, 2001年, 世界最佳機場, 世界, 機場, 全..."
332,NOT ENOUGH INFO,北理工是歷史上最後一批副部級高校，黨委書記和校長列入中央管理的高校 ， 簡稱中管高校 ， 俗...,"[[204, None, None, None]]","[北理工, 歷史上最後一批副部級高校，黨委書記和校長列入中央管理的高校, 歷史, 副部級高校..."
2177,refutes,南京大學附屬中學，從中國江蘇省遷移。,"[[[2654, 2877, 南大附中, 2]]]","[南京大學附屬中學, 南京大學, 中學, 中國江蘇省]"
7551,supports,毒魚豆的萃取物被西印度群島的原住民發掘可以導致魚麻醉安靜 ， 讓他們得以趁機徒手抓魚 。,"[[[6825, 6546, 毒魚豆, 4]]]","[毒魚豆的萃取物, 毒魚豆, 萃取物, 西印度羣島的原住民, 西印度羣島, 西, 印度羣島,..."
4688,supports,軟件開發是一項包括需求獲取 、 開發規劃 、 需求分析和設計 、 編程實現 、 軟件測試 、...,"[[[4631, 4702, 軟件開發, 1], [4631, 4702, 軟件開發, 2]]]","[軟件開發, 一項包括需求獲取、開發規劃、需求分析和設計、編程實現、軟件測試、版本控制的系統..."
8496,supports,國立臺灣大學應用力學研究所從1984年開始招收碩 、 博士班研究生 ， 首任所長爲理論及應用...,"[[[7667, 7249, 國立臺灣大學應用力學研究所, 3]]]","[國立臺灣大學應用力學研究所, 國立臺灣大學, 臺灣大學, 應用力學, 力學, 研究所, 1..."
873,supports,威廉·倫琴拒絕定名新電子波爲倫琴射線，堅持稱作X射線。,"[[[599, 594, 威廉·倫琴, 4]]]","[威廉·倫琴, 新電子波, 電子波, 倫琴射線, X射線]"


## Get pages via wiki online api and google search api

### Wiki API

In [15]:
from tqdm import tqdm
import time
predicted_results = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
  res = get_pred_pages(row)
  predicted_results.append(res)
  time.sleep(0.1)

predicted_results = pd.Series(predicted_results)
display(predicted_results)

save_doc(TRAIN_DATA, predicted_results, mode=train_data_name)
train_df.loc[:, 'predicted_pages'] = predicted_results.tolist()

100%|██████████| 10/10 [00:49<00:00,  4.95s/it]


0    {天王星, 天衛三, 磁層, 緹坦妮雅, 夢, 仲夏夜_(羅文專輯), 作者, 仲夏夜之夢_...
1                 {牠, 北冰洋, 南太平洋, 鳥, 信天翁科, 太平洋, 翼展, 世界}
2    {吉他, Faye_Disc, 陳建寧, 主唱, F._R._大衛, 阿沁, 鍵盤手, 詹雯...
3      {香港國際機場, 機場, 2001年, 24小時, 小時, 宇宙, 24_(電視劇), 世界}
4    {學校, 校長, 中央部屬高校, 中華人民共和國, 歷史, 高等學校, 黨委書記和校長列入中...
5                     {南京大學附屬中學, 南京大學, 江蘇省_(中華民國), 中學}
6                             {毒魚豆, 印度, 魚, 原住民, 西, 萃取}
7                     {軟件開發, 版本控制, 軟件, 軟件測試, 活動, 系統工程}
8    {研究所, 美國國家工程院, 理論, 研究生, 院士, 國立臺灣大學, 國立臺灣大學應用力學...
9    {電子, 威廉·亨利·布拉格, X射線, 威廉·倫琴, 倫琴_(單位), 威廉·維恩, 威廉...
dtype: object

### Google Search

In [16]:
all_claim_query = []
claim_sent = train_df['claim'].str.replace('。', '，').str.split('，')
for qlist, nps in zip(claim_sent, train_df["hanlp_results"]):
  tmp2 = []
  for q in qlist:
    if len(q.strip()) > 0:
      tmp2.append(q)

  # tmp2.extend(nps) # 是否把NP也丟到google搜尋
  all_claim_query.append(tmp2)

print(pd.Series(all_claim_query))

0                 [天衛三軌道在天王星內部的磁層, 以《 仲夏夜之夢 》作者緹坦妮雅命名]
1    [信天翁科的活動範圍位於北冰洋以及南太平洋, 牠的翼展可達到3.7米, 是世界上現存的翼展最...
2    [F.I.R. 的 團員有主唱Faye飛 （ 詹雯婷 ） 、 吉他手Real阿沁 （ 黃漢青...
3    [香港國際機場全年24小時運作, 它從2001年起一直躋身世界最佳機場 ,  並8度獲評級爲...
4    [北理工是歷史上最後一批副部級高校, 黨委書記和校長列入中央管理的高校 ,  簡稱中管高校 ...
5                                 [南京大學附屬中學, 從中國江蘇省遷移]
6       [毒魚豆的萃取物被西印度群島的原住民發掘可以導致魚麻醉安靜 ,  讓他們得以趁機徒手抓魚 ]
7    [軟件開發是一項包括需求獲取 、 開發規劃 、 需求分析和設計 、 編程實現 、 軟件測試 ...
8    [國立臺灣大學應用力學研究所從1984年開始招收碩 、 博士班研究生 ,  首任所長爲理論及...
9                        [威廉·倫琴拒絕定名新電子波爲倫琴射線, 堅持稱作X射線]
dtype: object


In [17]:
google_search_pages = []
all_pred_doc = []
for i, qlist in enumerate(tqdm(all_claim_query)):
  pages = google_search(qlist)
  google_search_pages.append(pages)

  all_pred = pages.copy()
  claim_id = TRAIN_DATA[i]['id']
  try:    
    wiki_pages = train_df.loc[claim_id]['predicted_pages']
    all_pred.extend(wiki_pages)
  except Exception as e:
    print('cannot get the wiki page result of the claim_id')

  all_pred_doc.append(all_pred)
    
  if i%20==0 and i!=0:
    print()
    macro_doc_recall(TRAIN_DATA, google_search_pages)
    macro_doc_recall(TRAIN_DATA, all_pred_doc)

100%|██████████| 10/10 [02:04<00:00, 12.45s/it]


In [18]:
train_df = train_df.reset_index()
train_df['google_search_pages'] = pd.Series(google_search_pages)
train_df['predicted_pages'] = train_df['predicted_pages'].map(list)
train_df.to_pickle(f'./data/{train_data_name}_doc10+google.pkl')

重複上面的步驟改用 public_train_0522

In [19]:
%%time
train_data_name = 'public_train_0522'
TRAIN_DATA = load_json(f"data/{train_data_name}.jsonl")
hanlp_file = f"data/hanlp_con_results_{train_data_name}.pkl"

if Path(hanlp_file).exists():
    with open(hanlp_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    predictor = (hanlp.pipeline().append(
        hanlp.load("FINE_ELECTRA_SMALL_ZH"),
        output_key="tok",
    ).append(
        hanlp.load("CTB9_CON_ELECTRA_SMALL"),
        output_key="con",
        input_key="tok",
    ))
    
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TRAIN_DATA]
    with open(hanlp_file, "wb") as f:
        pickle.dump(hanlp_results, f)



CPU times: user 1.07 s, sys: 95.2 ms, total: 1.17 s
Wall time: 1.08 s


In [20]:
train_doc_path = f"./data/{train_data_name}_doc10.pkl"

train_df = pd.DataFrame(TRAIN_DATA)
train_df.loc[:, 'hanlp_results'] = hanlp_results

train_df = train_df.set_index('id')
train_df

Unnamed: 0_level_0,label,claim,evidence,hanlp_results
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14301,NOT ENOUGH INFO,崔維斯·米勒是退役美國職業棒球大聯盟投手，目前于飛機擔任運動指導員。,"[[13414, None, None, None]]","[崔維斯·米勒, 退役美國職業棒球大聯盟投手, 美國, 職業棒球, 棒球, 聯盟, 投手, ..."
2469,supports,氣象衛星是一種人造衛星，可以呈現地球天氣的變化和大氣特徵。,"[[[2053, 2226, 氣象衛星, 0]]]","[氣象衛星, 一種人造衛星, 衛星, 地球天氣的變化和大氣特徵, 地球天氣的變化, 地球天氣..."
7137,supports,在臺灣臺南市東區虎尾寮重劃區裏面的臺南市東區復興國民小學，旁邊有復興國民中學。,"[[[6236, 6059, 臺南市東區復興國民小學, 0]]]","[在臺灣臺南市東區虎尾寮重劃區裏面的臺南市東區復興國民小學, 臺灣台南市東區虎尾寮重劃區裏面..."
13711,supports,郭杲是宋朝的政治人物，曾逼光宗退位並扶持光宗次子趙擴繼位，即爲宋寧宗。,"[[[13103, 11342, 郭杲, 6]]]","[郭杲, 宋朝的政治人物, 宋朝, 政治人物, 光宗, 光宗次子趙擴, 光宗次子, 光宗, ..."
6625,supports,英國皇家郵輪泰坦尼克號於1912年展開首航，目的地爲紐約。,"[[[13447, 11585, 泰坦尼克號, 0], [13447, 11585, 泰坦尼...","[英國皇家郵輪泰坦尼克號, 英國皇家郵輪, 英國, 皇家郵輪, 泰坦尼克號, 1912年, ..."
12058,NOT ENOUGH INFO,中日之間八年戰爭的開始使得1937年臺灣進入戰時體制。,"[[11155, None, None, None]]","[中日之間八年戰爭的開始, 中日, 八年戰爭, 戰爭, 開始, 1937年臺灣, 1937年..."
10981,supports,王莽所建立的新朝被擁立更始帝的綠林軍所亡，新朝開創了中國歷史上透過篡位取得政權的先例。,"[[[9952, 9050, 新朝, 0], [9952, 9050, 新朝, 12], [...","[王莽所建立的新朝, 王莽, 朝, 擁立更始帝的綠林軍, 更始帝, 更始, 始, 帝, 綠林..."
10965,refutes,阿登多夫在2018年的人口數據中爲女生多於男生。,"[[[10299, 9314, 阿登多夫, 1]]]","[阿登多夫, 2018年的人口數據, 2018年, 人口數據, 女生, 男生]"
3550,supports,流產是指胚胎或胎兒發育到之前的自然死亡 。,"[[[12651, 10987, 流產, 1]]]","[流產, 胚胎或胎兒, 死亡]"
5578,refutes,被軟禁的張學良跟隨著國民政府遷臺，並直到蔣經國逝世後才由李登輝釋放，後來留學至美國。,"[[[4582, 4664, 張學良, 7]]]","[被軟禁的張學良, 張學良, 國民政府, 臺, 蔣經國, 李登輝, 美國]"


## Get pages via wiki online api and google search api

### Wiki API

In [21]:
from tqdm import tqdm
import time
predicted_results = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
  res = get_pred_pages(row)
  predicted_results.append(res)
  time.sleep(0.1)

predicted_results = pd.Series(predicted_results)
display(predicted_results)

save_doc(TRAIN_DATA, predicted_results, mode=train_data_name)
train_df.loc[:, 'predicted_pages'] = predicted_results.tolist()

100%|██████████| 10/10 [00:35<00:00,  3.57s/it]


0    {馬克思·米勒, 聯盟, 崔維斯·米勒, 埃茲拉·米勒, 投手, 美國職業棒球大聯盟, 棒球...
1                             {氣象衛星, 地球, 衛星, 人造衛星, 天氣}
2    {臺灣, 虎尾寮_(臺南市), 東區_(臺南市), 臺南市_(州轄市), 臺南市東區復興國民小學}
3                              {郭杲, 政治人物, 光宗, 宋朝, 宋寧宗}
4             {紐約, 泰坦尼克號_(消歧義), 郵輪, 1912年, 英國, 英國皇家郵輪}
5                         {臺灣, 1937年臺灣, 1937年, 戰爭, 中日}
6      {王莽, 中國歷史, 帝, 更始帝, 朝, 綠林軍, 中國, 歷史, 新朝, 更始_(漢朝)}
7                                        {阿登多夫, 2018年}
8                                         {流產, 胎兒, 死亡}
9                        {國民政府, 蔣經國, 軟禁, 李登輝, 張學良, 美國}
dtype: object

### Google Search

In [22]:
all_claim_query = []
claim_sent = train_df['claim'].str.replace('。', '，').str.split('，')
for qlist, nps in zip(claim_sent, train_df["hanlp_results"]):
  tmp2 = []
  for q in qlist:
    if len(q.strip()) > 0:
      tmp2.append(q)

  # tmp2.extend(nps) # 是否把NP也丟到google搜尋
  all_claim_query.append(tmp2)

print(pd.Series(all_claim_query))

0             [崔維斯·米勒是退役美國職業棒球大聯盟投手, 目前于飛機擔任運動指導員]
1                  [氣象衛星是一種人造衛星, 可以呈現地球天氣的變化和大氣特徵]
2        [在臺灣臺南市東區虎尾寮重劃區裏面的臺南市東區復興國民小學, 旁邊有復興國民中學]
3           [郭杲是宋朝的政治人物, 曾逼光宗退位並扶持光宗次子趙擴繼位, 即爲宋寧宗]
4                  [英國皇家郵輪泰坦尼克號於1912年展開首航, 目的地爲紐約]
5                     [中日之間八年戰爭的開始使得1937年臺灣進入戰時體制]
6    [王莽所建立的新朝被擁立更始帝的綠林軍所亡, 新朝開創了中國歷史上透過篡位取得政權的先例]
7                        [阿登多夫在2018年的人口數據中爲女生多於男生]
8                           [流產是指胚胎或胎兒發育到之前的自然死亡 ]
9    [被軟禁的張學良跟隨著國民政府遷臺, 並直到蔣經國逝世後才由李登輝釋放, 後來留學至美國]
dtype: object


In [23]:
train_df = pd.DataFrame(TRAIN_DATA)
google_search_pages = []
all_pred_doc = []
for i, qlist in enumerate(tqdm(all_claim_query)):
  pages = google_search(qlist)
  google_search_pages.append(pages)

  all_pred = pages.copy()
  claim_id = TRAIN_DATA[i]['id']
  try:    
    wiki_pages = train_df.loc[claim_id]['predicted_pages']
    all_pred.extend(wiki_pages)
  except Exception as e:
    print('cannot get the wiki page result of the claim_id')

  all_pred_doc.append(all_pred)
    
  if i%20==0 and i!=0:
    print()
    macro_doc_recall(TRAIN_DATA, google_search_pages)
    macro_doc_recall(TRAIN_DATA, all_pred_doc)

 10%|█         | 1/10 [00:10<01:31, 10.22s/it]

cannot get the wiki page result of the claim_id


 20%|██        | 2/10 [00:20<01:21, 10.13s/it]

cannot get the wiki page result of the claim_id


 30%|███       | 3/10 [00:32<01:16, 10.91s/it]

cannot get the wiki page result of the claim_id


 40%|████      | 4/10 [00:44<01:09, 11.54s/it]

cannot get the wiki page result of the claim_id


 50%|█████     | 5/10 [00:52<00:50, 10.11s/it]

cannot get the wiki page result of the claim_id


 60%|██████    | 6/10 [00:56<00:32,  8.20s/it]

cannot get the wiki page result of the claim_id


 70%|███████   | 7/10 [01:06<00:26,  8.73s/it]

cannot get the wiki page result of the claim_id


 80%|████████  | 8/10 [01:11<00:14,  7.38s/it]

cannot get the wiki page result of the claim_id


 90%|█████████ | 9/10 [01:15<00:06,  6.46s/it]

cannot get the wiki page result of the claim_id


100%|██████████| 10/10 [01:25<00:00,  8.58s/it]

cannot get the wiki page result of the claim_id





In [24]:
train_df = train_df.reset_index()
train_df['google_search_pages'] = pd.Series(google_search_pages)
train_df['predicted_pages'] = train_df['predicted_pages'].map(list)
train_df.to_pickle(f'./data/{train_data_name}_doc10+google.pkl')

### Step 2. Calculate our results

In [25]:
calculate_precision(TRAIN_DATA, predicted_results)
calculate_recall(TRAIN_DATA, predicted_results)

Precision: 0.21250000000000002
Recall: 0.875


### Step 3. Repeat the same process on test set
Create parsing tree

In [26]:
test_data_name = 'public_test'
TEST_DATA = load_json(f"data/{test_data_name}.jsonl")

hanlp_test_file = f"data/hanlp_con_test_results.pkl"
if Path(hanlp_test_file).exists():
    with open(hanlp_test_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    predictor = (hanlp.pipeline().append(
        hanlp.load("FINE_ELECTRA_SMALL_ZH"),
        output_key="tok",
    ).append(
        hanlp.load("CTB9_CON_ELECTRA_SMALL"),
        output_key="con",
        input_key="tok",
    ))
    
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TEST_DATA]
    with open(hanlp_file, "wb") as f:
        pickle.dump(hanlp_results, f)



Get pages via wiki online api

In [27]:
test_doc_path = f"data/{test_data_name}_doc{top_n_doc}.jsonl"
if Path(test_doc_path).exists():
    with open(test_doc_path, "r", encoding="utf8") as f:
        test_results = pd.Series(
            [set(json.loads(line)["predicted_pages"]) for line in f])
else:
    test_df = pd.DataFrame(TEST_DATA)
    test_df.loc[:, "hanlp_results"] = hanlp_results

    from tqdm import tqdm
    import time
    predicted_results = []
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
        res = get_pred_pages(row)
        predicted_results.append(res)
        time.sleep(0.1)

    predicted_results = pd.Series(predicted_results)
    save_doc(TEST_DATA, predicted_results, mode=test_data_name)
    test_df.loc[:, 'predicted_pages'] = predicted_results

100%|██████████| 10/10 [00:33<00:00,  3.38s/it]


Get test data pages from google search

In [28]:
all_claim_query = []
claim_sent = test_df['claim'].str.replace('。', '，').str.split('，')
for qlist, nps in zip(claim_sent, test_df["hanlp_results"]):
  tmp2 = []
  for q in qlist:
    if len(q.strip()) > 0:
      tmp2.append(q)

  # tmp2.extend(nps) # 是否把NP也丟到google搜尋
  all_claim_query.append(tmp2)

print(pd.Series(all_claim_query))

0                 [光學顯微鏡是以電磁學原理來將不可見或難見的微小物放大至肉眼可見的儀器]
1                             [產絲的蠶或產蜜的蜜蜂爲提供間接經濟利益的昆蟲]
2                             [波蘭西部的綠山城縣平均每平方公里的土地有0人]
3                            [Vivien Leigh主演魂斷藍橋中的女配角]
4                         [侯孝賢改編自唐代文言文學的電影獲得金馬獎最佳劇情片獎]
5                        [國務院前副總理的姪子薄熙來在2012年9月被開除黨籍 ]
6                                           [水星凌日曾發生過]
7                       [馬克思在自己的作品中論述了馬克思主義政治經濟學的基本概念]
8                  [一貫道相信最高神祇無生老母派遣轉世成銀公祖師路中一的彌勒佛拯救凡間]
9    [回族世居內蒙古至山西 、 陝西 、 甘肅 ,  以至於新疆和中亞一帶, 受中亞與西亞中伊斯...
dtype: object


In [29]:
google_search_pages = []  

for i, qlist in enumerate(tqdm(all_claim_query)):
  pages = google_search(qlist)
  google_search_pages.append(pages)

test_df['google_search_pages'] = pd.Series(google_search_pages)
test_df['predicted_pages'] = test_df['predicted_pages'].map(list)

if test_data_name == 'public_test':
  test_df.to_pickle(f"./data/test_doc10+google.pkl")
elif test_data_name == 'private_test_data':
  test_df.to_pickle(f"./data/private_test_data_doc10+google.pkl")

100%|██████████| 10/10 [01:07<00:00,  6.75s/it]


重複上面的步驟 改用 private_test_data

In [30]:
test_data_name = 'private_test_data'
TEST_DATA = load_json(f"data/{test_data_name}.jsonl")

hanlp_test_file = f"data/hanlp_con_test_results.pkl"
if Path(hanlp_test_file).exists():
    with open(hanlp_test_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    predictor = (hanlp.pipeline().append(
        hanlp.load("FINE_ELECTRA_SMALL_ZH"),
        output_key="tok",
    ).append(
        hanlp.load("CTB9_CON_ELECTRA_SMALL"),
        output_key="con",
        input_key="tok",
    ))
    
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TEST_DATA]
    with open(hanlp_file, "wb") as f:
        pickle.dump(hanlp_results, f)



Get pages via wiki online api

In [31]:
test_doc_path = f"data/{test_data_name}_doc{top_n_doc}.jsonl"
if Path(test_doc_path).exists():
    with open(test_doc_path, "r", encoding="utf8") as f:
        test_results = pd.Series(
            [set(json.loads(line)["predicted_pages"]) for line in f])
else:
    test_df = pd.DataFrame(TEST_DATA)
    test_df.loc[:, "hanlp_results"] = hanlp_results

    from tqdm import tqdm
    import time
    predicted_results = []
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
        res = get_pred_pages(row)
        predicted_results.append(res)
        time.sleep(0.1)

    predicted_results = pd.Series(predicted_results)
    save_doc(TEST_DATA, predicted_results, mode=test_data_name)
    test_df.loc[:, 'predicted_pages'] = predicted_results

100%|██████████| 10/10 [00:39<00:00,  3.99s/it]


Get test data pages from google search

In [32]:
all_claim_query = []
claim_sent = test_df['claim'].str.replace('。', '，').str.split('，')
for qlist, nps in zip(claim_sent, test_df["hanlp_results"]):
  tmp2 = []
  for q in qlist:
    if len(q.strip()) > 0:
      tmp2.append(q)

  # tmp2.extend(nps) # 是否把NP也丟到google搜尋
  all_claim_query.append(tmp2)

print(pd.Series(all_claim_query))

0    [雞形目的鳥腿腳強健, 擅長在地面奔跑, 其中有珍稀物種, 體態雄健優美、顏色鮮豔；也有經濟...
1      [教會剛建立時為解決內部的一些問題, 使徒們寫下許多便條, 其中有八卷不是保羅寫的為大公書信]
2                                  [羅伯·昆蘭於明尼蘇達州聖保羅市出生]
3                     [2015年美國網球公開賽女子單打比賽裡小威廉絲是上一屆的冠軍]
4                         [南陽郡的地方豪族出身的漢光武帝劉秀為太祖劉邦的九世孫]
5                             [李昂是浙江杭州府仁和縣人, 為明朝的政治人物]
6                     [上承南北朝、下啟唐朝的隋朝由楊廣建立, 而楊堅開創出開皇之治]
7                   [香港電影導演會年度大獎最佳導演獎創立於2004年, 設有三個名額]
8                   [劉宋光祿大夫張茂度的曾孫是張嵊, 一位南北朝時從秘書郎起家的官員]
9             [四川人封從德是六四天安門事件的學生運動領袖, 現在已與妻子離婚, 居住於美國]
dtype: object


In [33]:
google_search_pages = []  

for i, qlist in enumerate(tqdm(all_claim_query)):
  pages = google_search(qlist)
  google_search_pages.append(pages)

test_df['google_search_pages'] = pd.Series(google_search_pages)
test_df['predicted_pages'] = test_df['predicted_pages'].map(list)

if test_data_name == 'public_test':
  test_df.to_pickle(f"./data/test_doc10+google.pkl")
elif test_data_name == 'private_test_data':
  test_df.to_pickle(f"./data/private_test_data_doc10+google.pkl")

100%|██████████| 10/10 [01:35<00:00,  9.53s/it]


notebook2
# PART 2. Sentence retrieval

Import some libs

In [9]:
# built-in libs
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

# third-party libs
import numpy as np
import pandas as pd
import json
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset, Dataset

# local libs
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=10)

Global variable

In [10]:
top_n_doc = 10

In [11]:
SEED = 42

TRAIN_DATA = load_json("data/public_train.jsonl")
TRAIN_DATA.extend(load_json("data/public_train_0522.jsonl"))

DOC_DATA = load_json(f"data/public_train_doc10.jsonl")
DOC_DATA.extend(load_json("data/public_train_0522_doc10.jsonl"))

LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}

_y = [LABEL2ID[data["label"]] for data in TRAIN_DATA]

assert(len(DOC_DATA) == len(TRAIN_DATA))

add google search result

In [12]:
google_df = pd.read_pickle('./data/public_train_doc10+google.pkl')
df_0522_google = pd.read_pickle('./data/public_train_0522_doc10+google.pkl')

google_df = pd.concat((google_df, df_0522_google))
google_df['google_search_pages'] = \
    google_df['google_search_pages'].apply(lambda x:list(map(lambda y:y.split('?')[0], x)))
google_df = google_df.set_index('id')
# google_df

In [13]:
for instance in DOC_DATA:
    cid = instance['id']
    if cid in google_df.index:
        g_pages = google_df.loc[cid]['google_search_pages']
        instance['predicted_pages'].extend(g_pages)        
        instance['predicted_pages'] = list(set(instance['predicted_pages']))
        
# 檢查有無重複的 pages
for instance in DOC_DATA:
    if len(set(instance['predicted_pages'])) < len(instance['predicted_pages']):
        print(instance)

In [14]:
DOC_DATA = pd.DataFrame(DOC_DATA)
DOC_DATA = DOC_DATA[DOC_DATA['id'].apply(lambda x:type(x)==int)] # 去除chatgpt生成的負樣本

train_id = pd.DataFrame(TRAIN_DATA)['id'].values
DOC_DATA = DOC_DATA[DOC_DATA['id'].isin(train_id)]
DOC_DATA
DOC_DATA = DOC_DATA.to_dict(orient='records')
len(DOC_DATA)

11620

In [15]:
# GT means Ground Truth
TRAIN_GT, DEV_GT = train_test_split(
    DOC_DATA,
    test_size=0.05,
    random_state=SEED,
    shuffle=True,
    stratify=_y,
)
print(len(DEV_GT))

581


In [16]:
# public test
test_data_public = pd.read_pickle(f"./data/test_doc10+google.pkl")
test_data_public['google_search_pages'] = \
    test_data_public['google_search_pages'].apply(lambda x:list(map(lambda y:y.split('?')[0], x)))

for i, instance in test_data_public.iterrows():
    cid = instance['id']
    instance['predicted_pages'].extend(instance['google_search_pages'])
    # 這邊改不會去重複
#     instance['predicted_pages'] = list(set(instance['predicted_pages'])) 

test_data_public['predicted_pages'] = test_data_public['predicted_pages'].apply(set).apply(list)

# private test
test_data_private = pd.read_pickle(f"./data/private_test_data_doc10+google.pkl")
test_data_private['google_search_pages'] = \
    test_data_private['google_search_pages'].apply(lambda x:list(map(lambda y:y.split('?')[0], x)))

for i, instance in test_data_private.iterrows():
    cid = instance['id']
    instance['predicted_pages'].extend(instance['google_search_pages'])
    # 這邊改不會去重複
#     instance['predicted_pages'] = list(set(instance['predicted_pages'])) 

test_data_private['predicted_pages'] = test_data_private['predicted_pages'].apply(set).apply(list)

In [17]:
test_data = pd.concat((test_data_public, test_data_private))
test_data = test_data.reset_index(drop=True)
test_data.shape

(9038, 5)

Preload wiki database (1 min)

In [18]:
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages)
del wiki_pages

Reading and concatenating jsonl files in data/wiki-pages
Generate parse mapping
Transform to id to evidence_map mapping


### Helper function

Calculate precision for sentence retrieval

In [19]:
def evidence_macro_precision(
    instance: Dict,
    top_rows: pd.DataFrame,
) -> Tuple[float, float]:
    """Calculate precision for sentence retrieval
    This function is modified from fever-scorer.
    https://github.com/sheffieldnlp/fever-scorer/blob/master/src/fever/scorer.py

    Args:
        instance (dict): a row of the dev set (dev.jsonl) of test set (test.jsonl)
        top_rows (pd.DataFrame): our predictions with the top probabilities

        IMPORTANT!!!
        instance (dict) should have the key of `evidence`.
        top_rows (pd.DataFrame) should have a column `predicted_evidence`.

    Returns:
        Tuple[float, float]:
        [1]: relevant and retrieved (numerator of precision)
        [2]: retrieved (denominator of precision)
    """
    this_precision = 0.0
    this_precision_hits = 0.0

    # Return 0, 0 if label is not enough info since not enough info does not
    # contain any evidence.
    if instance["label"].upper() != "NOT ENOUGH INFO":
        # e[2] is the page title, e[3] is the sentence index
        all_evi = [[e[2], e[3]]
                   for eg in instance["evidence"]
                   for e in eg
                   if e[3] is not None]
        claim = instance["claim"]
        predicted_evidence = top_rows[top_rows["claim"] ==
                                      claim]["predicted_evidence"].tolist()

        for prediction in predicted_evidence:
            if prediction in all_evi:
                this_precision += 1.0
            this_precision_hits += 1.0

        return (this_precision /
                this_precision_hits) if this_precision_hits > 0 else 1.0, 1.0

    return 0.0, 0.0

Calculate recall for sentence retrieval

In [20]:
def evidence_macro_recall(
    instance: Dict,
    top_rows: pd.DataFrame,
) -> Tuple[float, float]:
    """Calculate recall for sentence retrieval
    This function is modified from fever-scorer.
    https://github.com/sheffieldnlp/fever-scorer/blob/master/src/fever/scorer.py

    Args:
        instance (dict): a row of the dev set (dev.jsonl) of test set (test.jsonl)
        top_rows (pd.DataFrame): our predictions with the top probabilities

        IMPORTANT!!!
        instance (dict) should have the key of `evidence`.
        top_rows (pd.DataFrame) should have a column `predicted_evidence`.

    Returns:
        Tuple[float, float]:
        [1]: relevant and retrieved (numerator of recall)
        [2]: relevant (denominator of recall)
    """
    # We only want to score F1/Precision/Recall of recalled evidence for NEI claims
    if instance["label"].upper() != "NOT ENOUGH INFO":
        # If there's no evidence to predict, return 1
        if len(instance["evidence"]) == 0 or all(
            [len(eg) == 0 for eg in instance]):
            return 1.0, 1.0

        claim = instance["claim"]

        predicted_evidence = top_rows[top_rows["claim"] ==
                                      claim]["predicted_evidence"].tolist()

        for evidence_group in instance["evidence"]:
            evidence = [[e[2], e[3]] for e in evidence_group]
            if all([item in predicted_evidence for item in evidence]):
                # We only want to score complete groups of evidence. Incomplete
                # groups are worthless.
                return 1.0, 1.0
        return 0.0, 1.0
    return 0.0, 0.0

Calculate the scores of sentence retrieval

In [21]:
def evaluate_retrieval(
    probs: np.ndarray,
    df_evidences: pd.DataFrame,
    ground_truths: pd.DataFrame,
    top_n: int = 5,
    cal_scores: bool = True,
    save_name: str = None,
) -> Dict[str, float]:
    """Calculate the scores of sentence retrieval

    Args:
        probs (np.ndarray): probabilities of the candidate retrieved sentences
        df_evidences (pd.DataFrame): the candiate evidence sentences paired with claims
        ground_truths (pd.DataFrame): the loaded data of dev.jsonl or test.jsonl
        top_n (int, optional): the number of the retrieved sentences. Defaults to 2.

    Returns:
        Dict[str, float]: F1 score, precision, and recall
    """
    df_evidences["prob"] = probs
    top_rows = (
        df_evidences.groupby("id").apply(
        lambda x: x.nlargest(top_n, "prob"))
        .reset_index(drop=True)
    )

    if cal_scores:
        macro_precision = 0
        macro_precision_hits = 0
        macro_recall = 0
        macro_recall_hits = 0

        for i, instance in enumerate(ground_truths):
            macro_prec = evidence_macro_precision(instance, top_rows)
            macro_precision += macro_prec[0]
            macro_precision_hits += macro_prec[1]

            macro_rec = evidence_macro_recall(instance, top_rows)
            macro_recall += macro_rec[0]
            macro_recall_hits += macro_rec[1]

        pr = (macro_precision /
              macro_precision_hits) if macro_precision_hits > 0 else 1.0
        rec = (macro_recall /
               macro_recall_hits) if macro_recall_hits > 0 else 0.0
        f1 = 2.0 * pr * rec / (pr + rec)

    if save_name is not None:
        # write doc7_sent5 file
        with open(f"data/{save_name}", "w", encoding='utf8') as f:
            for instance in ground_truths:
                claim = instance["claim"]
                cid = instance["id"]
                predicted_evidence = top_rows[
                    top_rows["id"] == cid]["predicted_evidence"].tolist()
                instance["predicted_evidence"] = predicted_evidence
                f.write(json.dumps(instance, ensure_ascii=False) + "\n")

    if cal_scores:
        return {"F1 score": f1, "Precision": pr, "Recall": rec}

Inference script to get probabilites for the candidate evidence sentences

In [22]:
def get_predicted_probs(
    model: nn.Module,
    dataloader: Dataset,
    device: torch.device,
) -> np.ndarray:
    """Inference script to get probabilites for the candidate evidence sentences

    Args:
        model: the one from HuggingFace Transformers
        dataloader: devset or testset in torch dataloader

    Returns:
        np.ndarray: probabilites of the candidate evidence sentences
    """
    model.eval()
    probs = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            probs.extend(torch.softmax(logits, dim=1)[:, 1].tolist())

    return np.array(probs)

AicupTopkEvidenceBERTDataset class for AICUP dataset with top-k evidence sentences

In [23]:
class SentRetrievalBERTDataset(BERTDataset):
    """AicupTopkEvidenceBERTDataset class for AICUP dataset with top-k evidence sentences."""

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Tuple[Dict[str, torch.Tensor], int]:
        item = self.data.iloc[idx]
        sentA = item["claim"]
        sentB = item["text"]

        # claim [SEP] text
        concat = self.tokenizer(
            sentA,
            sentB,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        concat_ten = {k: torch.tensor(v) for k, v in concat.items()}
        if "label" in item:
            concat_ten["labels"] = torch.tensor(item["label"])

        return concat_ten

### Main function for sentence retrieval

In [24]:
def pair_with_wiki_sentences(
    mapping: Dict[str, Dict[int, str]],
    df: pd.DataFrame,
    negative_ratio: float,
) -> pd.DataFrame:
    """Only for creating train sentences."""
    claims = []
    sentences = []
    labels = []

    # positive
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue

        claim = df["claim"].iloc[i]
        evidence_sets = df["evidence"].iloc[i]
        for evidence_set in evidence_sets:
            sents = []
            for evidence in evidence_set:
                # evidence[2] is the page title
                page = evidence[2].replace(" ", "_")
                # the only page with weird name
                if page == "臺灣海峽危機#第二次臺灣海峽危機（1958）":
                    continue
                # evidence[3] is in form of int however, mapping requires str
                sent_idx = str(evidence[3])
                sents.append(mapping[page][sent_idx])

            whole_evidence = " ".join(sents)

            claims.append(claim)
            sentences.append(whole_evidence)
            labels.append(1)

    # negative
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue
        claim = df["claim"].iloc[i]

        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in df["evidence"][i]
                            for evidence in evidences])
        predicted_pages = df["predicted_pages"][i]
        for page in predicted_pages:
            page = page.replace(" ", "_")
            try:
                page_sent_id_pairs = [
                    (page, sent_idx) for sent_idx in mapping[page].keys()
                ]
            except KeyError:
                # print(f"{page} is not in our Wiki db.")
                continue

            for pair in page_sent_id_pairs:
                if pair in evidence_set:
                    continue
                text = mapping[page][pair[1]]
                # `np.random.rand(1) <= 0.05`: Control not to add too many negative samples
                if text != "" and np.random.rand(1) <= negative_ratio:
                    claims.append(claim)
                    sentences.append(text)
                    labels.append(0)

    return pd.DataFrame({"claim": claims, "text": sentences, "label": labels})


def pair_with_wiki_sentences_eval(
    mapping: Dict[str, Dict[int, str]],
    df: pd.DataFrame,
    is_testset: bool = False,
) -> pd.DataFrame:
    """Only for creating dev and test sentences."""
    ids = []
    claims = []
    sentences = []
    evidence = []
    predicted_evidence = []

    # negative
    for i in range(len(df)):
        # if df["label"].iloc[i] == "NOT ENOUGH INFO":
        #     continue
        cid = df["id"].iloc[i]
        claim = df["claim"].iloc[i]        

        predicted_pages = df["predicted_pages"][i]
        for page in predicted_pages:
            page = page.replace(" ", "_")
            try:
                page_sent_id_pairs = [(page, k) for k in mapping[page]]
            except KeyError:
                # print(f"{page} is not in our Wiki db.")
                continue

            for page_name, sentence_id in page_sent_id_pairs:
                text = mapping[page][sentence_id]
                if text != "":
                    ids.append(cid)
                    claims.append(claim)
                    sentences.append(text)
                    if not is_testset:
                        evidence.append(df["evidence"].iloc[i])
                    predicted_evidence.append([page_name, int(sentence_id)])

    return pd.DataFrame({
        "id": ids,
        "claim": claims,
        "text": sentences,
        "evidence": evidence if not is_testset else None,
        "predicted_evidence": predicted_evidence,
    })

### Step 1. Setup training environment

Hyperparams

In [25]:
#@title  { display-mode: "form" }

# MODEL_NAME = "bert-base-chinese"  #@param {type:"string"}

MODEL_NAME = "ckiplab/bert-base-chinese"  #@param {type:"string"}
# MODEL_NAME = "ckiplab/albert-tiny-chinese"  #@param {type:"string"}

NUM_EPOCHS = 5  #@param {type:"integer"}
LR = 5e-6  #@param {type:"number"}
TRAIN_BATCH_SIZE = 128  #@param {type:"integer"}
TEST_BATCH_SIZE = 256  #@param {type:"integer"}
NEGATIVE_RATIO = 1  #@param {type:"number"}
VALIDATION_STEP = 100  #@param {type:"integer"}
TOP_N = 100  #@param {type:"integer"}

Experiment Directory

In [26]:
EXP_DIR = f"sent_retrieval/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_neg{NEGATIVE_RATIO}_top{TOP_N}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

if not Path(LOG_DIR).exists():
    Path(LOG_DIR).mkdir(parents=True)

if not Path(CKPT_DIR).exists():
    Path(CKPT_DIR).mkdir(parents=True)

### Step 2. Combine claims and evidences

In [27]:
train_df = pair_with_wiki_sentences(
    mapping,
    pd.DataFrame(TRAIN_GT),
    NEGATIVE_RATIO,
)
counts = train_df["label"].value_counts()
print("Now using the following train data with 0 (Negative) and 1 (Positive)")
print(counts)

dev_evidences = pair_with_wiki_sentences_eval(mapping, pd.DataFrame(DEV_GT))

Now using the following train data with 0 (Negative) and 1 (Positive)
label
0    947038
1      8351
Name: count, dtype: int64


In [28]:
# 计算类别权重
def compute_class_weights(labels):
    class_samples = torch.bincount(labels)
    total_samples = len(labels)
    class_weights = total_samples / (len(class_samples) * class_samples)
    return class_weights

# 创建样本标签和权重张量
tmp = torch.Tensor(train_df["label"]).to(torch.int8)  # 样本标签
class_weights = compute_class_weights(tmp)  # 类别权重

### Step 3. Start training

Dataloader things

In [29]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = SentRetrievalBERTDataset(train_df, tokenizer=tokenizer, max_length=312)
val_dataset = SentRetrievalBERTDataset(dev_evidences, tokenizer=tokenizer, max_length=312)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

Trainer

In [30]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=1e-3)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

# 定义损失函数
ce_loss = nn.CrossEntropyLoss(weight=class_weights.to(device))

writer = SummaryWriter(LOG_DIR)

Some weights of the model checkpoint at ckiplab/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-c

Please make sure that you are using gpu when training (5 min)

In [40]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0
pred_count = None

for epoch in range(NUM_EPOCHS):
    model.train()

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        loss = ce_loss(outputs.logits, batch['labels'])

        # loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        y_pred = torch.argmax(outputs.logits, dim=1).tolist()
        y_true = batch["labels"].tolist()

        current_steps += 1

        if current_steps % VALIDATION_STEP == 0 and current_steps > 0:
            print("Start validation")
            probs = get_predicted_probs(model, eval_dataloader, device)

            val_results = evaluate_retrieval(
                probs=probs,
                df_evidences=dev_evidences,
                ground_truths=DEV_GT,
                top_n=TOP_N,
            )
            
            val_results['tls'] = loss.item()
                
            print(current_steps)
            print(val_results)
            
            if val_results['Recall'] >= 0.8:
                save_checkpoint(model, CKPT_DIR, current_steps)

#         if current_steps % 100 == 0 and current_steps > 0:
#             save_checkpoint(model, CKPT_DIR, current_steps)
            
            
        bar_postfix = dict()
        bar_postfix['tls'] = loss.item()
        bar_postfix['pred'] = pred_count

        progress_bar.set_postfix(bar_postfix)

print("Finished training!")
# 100 0.25
# 200 0.41
# 300 0.76
# 400 0.77
# 500 0.784
# 600 0.787
# 800 0.79
# 950 0.8

  0%|          | 0/875 [00:00<?, ?it/s]

Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

100
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.03284212574362755}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

200
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.0892091765999794}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

300
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.031749095767736435}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

400
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.025462709367275238}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

500
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.0085136154666543}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

600
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.010556897148489952}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

700
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.016002792865037918}
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

800
{'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0, 'tls': 0.008799192495644093}
Finished training!


Validation part (15 mins)

In [31]:
model = load_model(model, 'model.15600_tls015.pt', 'checkpoints/sent_retrieval/e5_bs128_5e-06_neg1_top5')

In [41]:
print("Start final evaluations and write prediction files.")

print("Start validation")
probs = get_predicted_probs(model, eval_dataloader, device)
val_results = evaluate_retrieval(
    probs=probs,
    df_evidences=dev_evidences,
    ground_truths=DEV_GT,
    top_n=TOP_N,
    save_name=f"dev_doc{top_n_doc}sent{TOP_N}.jsonl",
)

print(f"Validation scores => {val_results}")


train_evidences = pair_with_wiki_sentences_eval(
    mapping=mapping,
    df=pd.DataFrame(TRAIN_GT),
)
train_set = SentRetrievalBERTDataset(train_evidences, tokenizer)
train_dataloader = DataLoader(train_set, batch_size=TEST_BATCH_SIZE)

print("Start calculating training scores")
probs = get_predicted_probs(model, train_dataloader, device)
train_results = evaluate_retrieval(
    probs=probs,
    df_evidences=train_evidences,
    ground_truths=TRAIN_GT,
    top_n=TOP_N,
    save_name=f"train_doc{top_n_doc}sent{TOP_N}.jsonl",
)
print(f"Training scores => {train_results}")


# Validation scores => {'F1 score': 0.03867961666946859, 'Precision': 0.019779400610371572, 'Recall': 0.8701923076923077}

Start final evaluations and write prediction files.
Start validation


  0%|          | 0/124 [00:00<?, ?it/s]

Validation scores => {'F1 score': 0.023715415019762848, 'Precision': 0.012, 'Recall': 1.0}
Start calculating training scores


  0%|          | 0/264 [00:00<?, ?it/s]

Training scores => {'F1 score': 0.02867259116600482, 'Precision': 0.014544814340588987, 'Recall': 1.0}


In [42]:
tmp = pd.DataFrame(load_json(f"data/train_doc{top_n_doc}sent{TOP_N}.jsonl"))
tmp['predicted_evidence'] = tmp['predicted_evidence'].apply(lambda x:list(map(tuple, x)))
(tmp['predicted_evidence'].apply(set).apply(len) == tmp['predicted_evidence'].apply(len)).mean()

1.0

### 只看supports 跟 refutes 的證據的正確率

In [43]:
val_doc_df = pd.DataFrame(load_json(f"data/dev_doc{top_n_doc}sent{TOP_N}.jsonl"))

In [44]:
correct_count = 0
N = 0
for i, row in val_doc_df.iterrows():
    if row['label'].upper() == 'NOT ENOUGH INFO':        
        continue
        
    N += 1
    pre_evi = set(map(tuple, row['predicted_evidence']))
    for evidence in row['evidence']:
        evidence = [(x[2], x[3]) for x in evidence]
        evidence = set(evidence)
        intersection = evidence.intersection(pre_evi)
        if len(intersection) == len(evidence):
            correct_count += 1
            break

val_evidence_acc = correct_count / N
print(f'val_evidence_acc: {val_evidence_acc:.4f}') # 應該要跟上面的 recall 一樣

val_evidence_acc: 1.0000


### Step 4. Check on our test data
(5 min)

In [45]:
# test_data = load_json(f"data/test_doc{top_n_doc}.jsonl")

test_evidences = pair_with_wiki_sentences_eval(
    mapping,
    test_data,
    is_testset=True,
)
test_set = SentRetrievalBERTDataset(test_evidences, tokenizer)
test_dataloader = DataLoader(test_set, batch_size=TEST_BATCH_SIZE)

print("Start predicting the test data")
probs = get_predicted_probs(model, test_dataloader, device)

test_result = evaluate_retrieval(
                probs=probs,
                df_evidences=test_evidences,
                ground_truths=test_data.to_dict(orient='records'),
                top_n=TOP_N,
                cal_scores=False,
                save_name=f"public+private_test_doc{top_n_doc}sent{TOP_N}.jsonl",
            )

Start predicting the test data


  0%|          | 0/241 [00:00<?, ?it/s]

In [46]:
tmp = pd.DataFrame(load_json(f"data/public+private_test_doc{top_n_doc}sent{TOP_N}.jsonl"))
tmp['predicted_evidence'] = tmp['predicted_evidence'].apply(lambda x:list(map(tuple, x)))
(tmp['predicted_evidence'].apply(set).apply(len) == tmp['predicted_evidence'].apply(len)).mean()

1.0

notebook3
# PART 3. Claim verification

import libs

In [1]:
import pickle
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm

import torch
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    BertModel,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

top_n_doc = 10
top_n_sent = 5

Global variables

In [2]:
LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}

TRAIN_DATA = load_json(f"data/train_doc{top_n_doc}sent100.jsonl")
DEV_DATA = load_json(f"data/dev_doc{top_n_doc}sent100.jsonl")

TRAIN_PKL_FILE = Path(f"data/train_doc{top_n_doc}sent100.pkl")
DEV_PKL_FILE = Path(f"data/dev_doc{top_n_doc}sent100.pkl")

In [3]:
# 找出預測出來但不是證據句的，後續負採樣可以用
def get_wrong_pred_evidence(row):
    gt_all_evidence = []
    if row['label'] != "NOT ENOUGH INFO":
        for evidence in row['evidence']:
            for sent in evidence:
                gt_all_evidence.append(tuple(sent))
    else:
        return list(map(tuple, row['predicted_evidence']))
    
    gt_all_evidence = set(gt_all_evidence)
    pred_evidence = set(map(tuple, row['predicted_evidence']))    
    return list(pred_evidence - gt_all_evidence)
             
tmp = pd.DataFrame(TRAIN_DATA)
tmp['wrong_pred_evidence'] = tmp.apply(get_wrong_pred_evidence, axis=1)
tmp

TRAIN_DATA = tmp.to_dict(orient='records')
print("len TRAIN_DATA", len(TRAIN_DATA))
del tmp

len TRAIN_DATA 11039


In [4]:
def split_multi_evidence_to_instances(data):
    new_instances = []
    for instance in data:
        evidences = instance['evidence']
        if instance['label'] in ['supports', 'refutes']:
            for evidence in evidences:                
                _instance = instance.copy()
                _instance['evidence'] = [evidence]
                new_instances.append(_instance)
        else:
            new_instances.append(instance)
    
    return new_instances
    
TRAIN_DATA = split_multi_evidence_to_instances(TRAIN_DATA)
DEV_DATA = split_multi_evidence_to_instances(DEV_DATA)

train_df = pd.DataFrame(TRAIN_DATA)
train_df['evidence'].apply(lambda x:len(x[0])).value_counts()

# 去除單一個證據 超過5句的訓練資料
print(len(train_df))
train_df = train_df[train_df['evidence'].apply(lambda x:len(x[0])) <= 5]
train_df['evidence'].apply(lambda x:len(x[0])).value_counts()
TRAIN_DATA = train_df.to_dict(orient='records')
print(len(train_df))

11499
11488


In [5]:
# # 把 top10的sent變成top5
for instance in TRAIN_DATA:
    instance['predicted_evidence'] = instance['predicted_evidence'][:top_n_sent]
    
for instance in DEV_DATA:
    instance['predicted_evidence'] = instance['predicted_evidence'][:top_n_sent]
    

for instance in TRAIN_DATA:
    if len(instance['predicted_evidence'])>5:
        print(instance)
    
for instance in DEV_DATA:
    if len(instance['predicted_evidence'])>5:
        print(instance)

Preload wiki database (same as part 2.)

In [6]:
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages,)
del wiki_pages

Reading and concatenating jsonl files in data/wiki-pages
Generate parse mapping
Transform to id to evidence_map mapping


### Helper function

AICUP dataset with top-k evidence sentences.

Evaluation function

In [7]:
def calculate_strict_acc(y_true, y_pred, val_df):
    correct_count = 0
    for i, row in val_df[y_true == y_pred].iterrows():
        if row['label'].upper() == 'NOT ENOUGH INFO':
            correct_count += 1
            continue

        pre_evi = set(map(tuple, row['predicted_evidence']))
        for evidence in row['evidence']:
            evidence = [(x[2], x[3]) for x in evidence]
            evidence = set(evidence)
            intersection = evidence.intersection(pre_evi)
            if len(intersection) == len(evidence):
                correct_count += 1
                break

    val_strict_acc = correct_count / len(val_df)
    return val_strict_acc

def run_evaluation(model: torch.nn.Module, dataloader: DataLoader, device, use_tqdm=True):
    model.eval()

    loss = 0
    y_true = []
    y_pred = []
    
    def tqdm_wrapper(iterable):
        if use_tqdm:
            tqdm(iterable)
        return iterable
    
    with torch.no_grad():
        for batch in tqdm_wrapper(dataloader):
            y_true.extend(batch["labels"].tolist())

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
#             loss += outputs.loss.item()
#             logits = outputs.logits

            loss += ce_loss(outputs, batch['labels'])
            logits = outputs
            y_pred.extend(torch.argmax(logits, dim=1).tolist())

    acc = accuracy_score(y_true, y_pred)
    
    val_df = dataloader.dataset.data
    strict_acc = calculate_strict_acc(np.array(y_true), np.array(y_pred), val_df)
    
    y_pred_count = [0] * 3
    for yp in y_pred:
        y_pred_count[yp] += 1
        
    return {"val_loss": loss / len(dataloader), "val_acc": acc, "val_strict_acc": strict_acc, 
            'y_pred':y_pred_count, 'y_pred_label':y_pred}

Prediction

In [8]:
def run_predict(model: torch.nn.Module, test_dl: DataLoader, device) -> list:
    model.eval()

    preds = []
    for batch in tqdm(test_dl,
                      total=len(test_dl),
                      leave=False,
                      desc="Predicting"):
        batch = {k: v.to(device) for k, v in batch.items()}
#         pred = model(**batch).logits
        pred = model(**batch)
        pred = torch.argmax(pred, dim=1)
        preds.extend(pred.tolist())
    return preds

### Main function

In [9]:
def join_with_topk_evidence(
    df: pd.DataFrame,
    mapping: dict,
    mode: str = "train",
    topk: int = 5,
) -> pd.DataFrame:
    """join_with_topk_evidence join the dataset with topk evidence.

    Note:
        After extraction, the dataset will be like this:
               id     label         claim                           evidence            evidence_list
        0    4604  supports       高行健...     [[[3393, 3552, 高行健, 0], [...  [高行健 （ ）江西赣州出...
        ..    ...       ...            ...                                ...                     ...
        945  2095  supports       美國總...  [[[1879, 2032, 吉米·卡特, 16], [...  [卸任后 ， 卡特積極參與...
        停各种战争及人質危機的斡旋工作 ， 反对美国小布什政府攻打伊拉克...

        [946 rows x 5 columns]

    Args:
        df (pd.DataFrame): The dataset with evidence.
        wiki_pages (pd.DataFrame): The wiki pages dataframe
        topk (int, optional): The topk evidence. Defaults to 5.
        cache(Union[Path, str], optional): The cache file path. Defaults to None.
            If cache is None, return the result directly.

    Returns:
        pd.DataFrame: The dataset with topk evidence_list.
            The `evidence_list` column will be: List[str]
    """

    # format evidence column to List[List[Tuple[str, str, str, str]]]
    if "evidence" in df.columns:
        df["evidence"] = df["evidence"].map(
            lambda x: [[x]] if not isinstance(x[0], list) else [x]
            if not isinstance(x[0][0], list) else x)

    print(f"Extracting evidence_list for the {mode} mode ...")
    if mode == "eval":
        # extract evidence
        df["evidence_list"] = df["predicted_evidence"].map(lambda x: [
            mapping.get(evi_id, {}).get(str(evi_idx), "")
            for evi_id, evi_idx in x  # for each evidence list
        ][:topk] if isinstance(x, list) else [])
        print(df["evidence_list"][:5])
    else:
        # extract evidence
        df["evidence_list"] = df["evidence"].map(lambda x: [
            " ".join([  # join evidence
                mapping.get(evi_id, {}).get(str(evi_idx), "")
                for _, _, evi_id, evi_idx in evi_list
            ]) if isinstance(evi_list, list) else ""
            for evi_list in x  # for each evidence list
        ][:topk] if isinstance(x, list) else [])

    return df

### Step 1. Setup training environment

Hyperparams

In [10]:
#@title  { display-mode: "form" }

# MODEL_NAME = "bert-base-chinese"  #@param {type:"string"}
# MODEL_NAME = "xlm-roberta-base"  #@param {type:"string"}

MODEL_NAME = "ckiplab/bert-base-chinese"  #@param {type:"string"}
# MODEL_NAME = "ckiplab/albert-base-chinese"  #@param {type:"string"}
# MODEL_NAME = "ckiplab/albert-tiny-chinese"  #@param {type:"string"}


TRAIN_BATCH_SIZE = 40  #@param {type:"integer"}
TEST_BATCH_SIZE = 32  #@param {type:"integer"}
SEED = 1028  #@param {type:"integer"}
LR = 1e-5  #@param {type:"number"}
NUM_EPOCHS = 20  #@param {type:"integer"}
MAX_SEQ_LEN = 512  #@param {type:"integer"}
EVIDENCE_TOPK = 5  #@param {type:"integer"}
VALIDATION_STEP = 50  #@param {type:"integer"}

Experiment Directory

In [11]:
OUTPUT_FILENAME = "submission.jsonl"

EXP_DIR = f"claim_verification/{MODEL_NAME}-e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_{LR}_top{EVIDENCE_TOPK}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

if not Path(LOG_DIR).exists():
    Path(LOG_DIR).mkdir(parents=True)

if not Path(CKPT_DIR).exists():
    Path(CKPT_DIR).mkdir(parents=True)

### Step 2. Concat claim and evidences
join topk evidence

In [12]:
def gen_neg_evidences(evidences):
    from itertools import combinations
    neg_evidences = []
    if len(evidences) > 1:        
        for take_num in range(1, len(evidences)):
            for neg_evidence in combinations(evidences, len(evidences)-take_num):  
                neg_evidence = list(neg_evidence)
                neg_evidences.append(neg_evidence)
    return neg_evidences

gen_neg_evidences([['a'], ['b'], ['c']])

[[['a'], ['b']], [['a'], ['c']], [['b'], ['c']], [['a']], [['b']], [['c']]]

In [13]:
def gen_neg_instances_from_data(data):
    neg_instances = []
    for instance in data:
        evidences = instance['evidence']
        if instance['label'] in ['supports', 'refutes']:
            for evidence in evidences:
                negs = gen_neg_evidences(evidence)
                for neg in negs:
                    neg_instance = instance.copy()
                    neg_instance['evidence'] = [neg]
                    neg_instance['label'] = 'NOT ENOUGH INFO'
                    neg_instances.append(neg_instance)
    print('產生的負樣本數:', len(neg_instances))
    return neg_instances
    
train_neg = pd.DataFrame(gen_neg_instances_from_data(TRAIN_DATA))
display(train_neg.head(3))

產生的負樣本數: 11122


Unnamed: 0,id,label,claim,evidence,predicted_pages,predicted_evidence,wrong_pred_evidence
0,3976,NOT ENOUGH INFO,在傳播史中屬於電子傳播的廣播，因爲廣播本身發送成本低 、 接收容易的特性，在二戰電視事業中斷...,"[[[2937, 3132, 傳播史, 34]]]","[電視, 臺灣媒體史, TBS電視臺, 短波廣播, 控制器區域網路, 美國電臺, 電臺廣播,...","[[傳播史, 34], [廣播, 0], [廣播, 14], [傳播史, 16], [電臺廣...","[(大日本帝國, 24), (傳播史, 6), (六七暴動, 6), (廣告, 4), (朴..."
1,3976,NOT ENOUGH INFO,在傳播史中屬於電子傳播的廣播，因爲廣播本身發送成本低 、 接收容易的特性，在二戰電視事業中斷...,"[[[2937, 3132, 廣播, 0]]]","[電視, 臺灣媒體史, TBS電視臺, 短波廣播, 控制器區域網路, 美國電臺, 電臺廣播,...","[[傳播史, 34], [廣播, 0], [廣播, 14], [傳播史, 16], [電臺廣...","[(大日本帝國, 24), (傳播史, 6), (六七暴動, 6), (廣告, 4), (朴..."
2,11139,NOT ENOUGH INFO,由中國近代兩位教育家創辦的南開大學位於中國天津市共三個校區。,"[[[10030, 9111, 南開大學, 48]]]","[南開大學校史, 張伯苓, 南開大學, 中國, 天津大學, 天津教育, 教育家, 近代, 天...","[[南開大學校史, 0], [南開大學, 48], [天津教育, 4], [南開大學, 65...","[(南開大學校史, 13), (張伯苓, 7), (南開大學校史, 22), (南開大學, ..."


In [14]:
def id2sent(evidences):
    sents = []
    for evidence in evidences:        
        for e in evidence:
            if type(e) is list:
                sent = mapping.get(e[2], {}).get(str(e[3]), "")
                sents.append(sent)    
    return sents

def id2sent_2(evidence):
    sents = []
    for e in evidence:        
        if type(e) is tuple:
            sent = mapping.get(e[0], {}).get(str(e[1]), "")
            sents.append(sent)
    return sents

In [15]:
train_neg_df = train_neg.copy()
# train_neg_df["evidence_list"] = train_neg_df["evidence"].apply(id2sent)
# train_neg_df['evidence_list'].apply(len).value_counts()

In [16]:
df = pd.DataFrame(TRAIN_DATA)
no_evidence = join_with_topk_evidence(
    df.loc[df['label']=='NOT ENOUGH INFO', :].copy(),
    mapping,
    topk=EVIDENCE_TOPK,
    mode='eval'
)

train_df = df.copy()
# train_df["evidence_list"] = train_df["evidence"].apply(id2sent)

# # train_df.loc[
# #     train_df['label']=='NOT ENOUGH INFO', 'evidence_list'] = no_evidence['evidence_list']
# # train_df = train_df[~(train_df['evidence_list'].apply(len) > top_n_sent)]

# train_df['wrong_pred_evidence'] = train_df["wrong_pred_evidence"].apply(id2sent_2)
# train_df['evidence_list'].apply(len).value_counts()

Extracting evidence_list for the eval mode ...
1     [2010臺北國際花卉博覽會 ， 簡稱臺北花博 、 臺北國際花博 ， 2010年11月6日至...
5     [臺中中央公園 （ 英文 ： Taichung Central Park ） 是位於臺灣台中...
6     [後因東線戰場上魯格手槍彈顯得威力不足 ， 遂以 7.92 × 57毫米毛瑟步槍彈爲基礎 ，...
7     [小行星4833  是一顆圍繞太陽公轉的小行星 。, 特洛伊群小行星是與木星共用軌道 ， 一...
12    [奧斯卡 · 卡爾 · 奧洛夫  ， 是一位瑞典王子 ， 他是維多利亞女王儲和丹尼爾親王的第...
Name: evidence_list, dtype: object


In [17]:
dev_df = join_with_topk_evidence(
    pd.DataFrame(DEV_DATA),
    mapping,
    mode="eval",
    topk=EVIDENCE_TOPK,
)
# df = pd.DataFrame(DEV_DATA)
# no_evidence = join_with_topk_evidence(
#     df.loc[df['label']=='NOT ENOUGH INFO', :].copy(),
#     mapping,
#     topk=EVIDENCE_TOPK,
#     mode='eval'
# )

# dev_df = df.copy()
# dev_df["evidence_list"] = dev_df["evidence"].apply(id2sent)
# dev_df.loc[
#     dev_df['label']=='NOT ENOUGH INFO', 'evidence_list'] = no_evidence['evidence_list']
# dev_df = dev_df[~(dev_df['evidence_list'].apply(len) > 5)]

dev_df['evidence_list'].apply(len).value_counts()

Extracting evidence_list for the eval mode ...
0    [多米尼加共和國 （ República Dominicana ） 是位於加勒比海的島國 ，...
1    [澳門特別行政區2010年度勳章 、 獎章和獎狀名單于2010年12月19日由澳門特別行政區...
2    [乃王母頤養生息之天庭別府 ， 名爲 — — 別有洞天 ， 此亦是瑤池之所在 。, 瑤池 （...
3    [“ 野果 ” 則是相對於水果 ， 生長於非果園或野地植物所結的果實 ， 有些可食但水分不多...
4    [韭菜採收容易 ， 可用於園藝 ， 在世界廣泛種植 ， 並在一些國家被列爲入侵植物 。, 韭...
Name: evidence_list, dtype: object


evidence_list
5    601
Name: count, dtype: int64

In [18]:
train_pos_neg_df = pd.concat((train_df, train_neg_df)).reset_index(drop=True)
train_pos_neg_df["evidence_list"] = train_pos_neg_df["evidence"].apply(id2sent)
train_pos_neg_df['wrong_pred_evidence'] = train_pos_neg_df["wrong_pred_evidence"].apply(id2sent_2)
train_pos_neg_df['evidence_list'].apply(len).value_counts()

evidence_list
1    12370
2     5313
0     3148
3     1508
4      248
5       23
Name: count, dtype: int64

In [19]:
# 有抽樣預測的證據句 到 evidence list中 並shuffle
class AicupTopkEvidenceBERTDataset(BERTDataset):
    """AICUP dataset with top-k evidence sentences."""

    def __init__(self, is_eval, **kwargs):
        super().__init__(**kwargs)
        self.is_eval = is_eval
    
    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Tuple[Dict[str, torch.Tensor], int]:
        item = self.data.iloc[idx]
        claim = item["claim"]
        evidence = item["evidence_list"].copy()
        
        if not self.is_eval:
            wrong_pred_evidence = item["wrong_pred_evidence"]
            sample_num = self.topk - len(evidence)
            sample_num = min(sample_num, len(wrong_pred_evidence))
            noise = np.random.choice(wrong_pred_evidence, sample_num, replace=False)
            evidence.extend(noise)
            np.random.shuffle(evidence)
                    
        # In case there are less than topk evidence sentences
        pad_num = self.topk - len(evidence)
        pad = ["[PAD]"] * (pad_num)
        evidence += pad
        
        claim_evidence_list = [claim, *evidence]
        concat_claim_evidence = " [SEP] ".join(claim_evidence_list)
#         print(concat_claim_evidence)

        concat = self.tokenizer(
            concat_claim_evidence,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            truncation=True,
        )
        label = LABEL2ID[item["label"]] if "label" in item else -1
        concat_ten = {k: torch.tensor(v) for k, v in concat.items()}

        if "label" in item:
            concat_ten["labels"] = torch.tensor(label)

        return concat_ten

### Step 3. Training

Prevent CUDA out of memory

In [20]:
torch.cuda.empty_cache()

In [21]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = AicupTopkEvidenceBERTDataset(
    is_eval = False,
    data=train_pos_neg_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,    
)
val_dataset = AicupTopkEvidenceBERTDataset(
    is_eval = True,
    data=dev_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

In [22]:
train_pos_neg_df['evidence_list'].apply(len).value_counts()

evidence_list
1    12370
2     5313
0     3148
3     1508
4      248
5       23
Name: count, dtype: int64

In [23]:
from torch import nn

def MLP_bn_relu_drop(neuron_num, drop_p, out_ac='relu', batch_norm=True):
        """
        Example:
            MLP_bn_relu_drop(neuron_num=[64, 32, 16], drop_p=0.2)
            代表2層的全連接層，輸入為64，經過第一層變32，再來變16。
            然後每層有dropout 0.2，及BatchNorm+ReLU

            out_ac: 'relu' or 'tanh' 代表輸出層要用的activation function
        
        """
        dropout = nn.Dropout(drop_p)
        modules = []
        for i, n in enumerate(neuron_num):
            if i == 0: continue
            fc = nn.Linear(neuron_num[i-1], n)
            nn.init.kaiming_normal_(fc.weight)
            
            modules.append(fc)

            if batch_norm:
                modules.append(nn.BatchNorm1d(n))

            if i == len(neuron_num)-1: # output layer
                if out_ac == 'relu':
                    modules.append(nn.ReLU())
                elif out_ac == 'tanh':
                    modules.append(nn.Tanh())
                else:
                    raise 'Not supported output activation function'
            else:
                modules.append(nn.ReLU())

            modules.append(dropout)
            
        model = nn.Sequential(*modules)
        return model

class Classifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super(Classifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)        
        self.dropout = nn.Dropout(0.1)
        self.mlp = MLP_bn_relu_drop([self.bert.config.hidden_size, 768], drop_p=0.2)
        self.out = nn.Linear(768, num_classes)
        
        nn.init.kaiming_normal_(self.out.weight)

    def forward(self, **kwargs):
        bert_out = self.bert.forward(input_ids=kwargs['input_ids'],
                                     attention_mask=kwargs['attention_mask'],
                                     token_type_ids=kwargs['token_type_ids'])
        
        pooled_output = bert_out.pooler_output
        output = self.dropout(pooled_output)
        output = self.mlp(pooled_output)
        output = self.out(output)
        return output

In [24]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

model = Classifier(MODEL_NAME, 3)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=1e-4)

num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

writer = SummaryWriter(LOG_DIR)

Some weights of the model checkpoint at ckiplab/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training (30 mins)

In [25]:
display(train_pos_neg_df['label'].value_counts())
display(dev_df['label'].value_counts() / len(dev_df))

label
NOT ENOUGH INFO    14270
supports            5117
refutes             3223
Name: count, dtype: int64

label
supports           0.445923
refutes            0.279534
NOT ENOUGH INFO    0.274542
Name: count, dtype: float64

In [26]:
# 计算类别权重
def compute_class_weights(labels):
    class_samples = torch.bincount(labels)
    total_samples = len(labels)
    class_weights = total_samples / (len(class_samples) * class_samples)
    return class_weights

# 创建样本标签和权重张量
tmp = torch.Tensor(train_pos_neg_df["label"].apply(lambda x:LABEL2ID[x])).to(torch.int8)  # 样本标签
class_weights = compute_class_weights(tmp)  # 类别权重

# 定义损失函数
from torch import nn
# class_weights[2] = class_weights[2]*10
ce_loss = nn.CrossEntropyLoss(weight=class_weights.to(device))
# ce_loss = nn.CrossEntropyLoss()
class_weights

tensor([1.4729, 2.3384, 0.5281])

In [73]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0
val_loss = None
val_acc = None
pred_count = None

for epoch in range(NUM_EPOCHS):
    model.train()

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
#         loss = outputs.loss
#         loss = ce_loss(outputs.logits, batch['labels'])
        
        loss = ce_loss(outputs, batch['labels'])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()        
        progress_bar.update(1)
        writer.add_scalar("training_loss", loss.item(), current_steps)        

#         y_pred = torch.argmax(outputs.logits, dim=1).tolist()
        y_pred = torch.argmax(outputs, dim=1).tolist()
        y_true = batch["labels"].tolist()

        current_steps += 1

        
        if current_steps % VALIDATION_STEP == 0 and current_steps > 0:
            print("Start validation")
            val_results = run_evaluation(
                model, eval_dataloader, device, use_tqdm=False)
            
            val_loss = float(val_results['val_loss'].detach())
            val_acc = val_results['val_acc']
            val_strict_acc = val_results['val_strict_acc']
            pred_count = str(val_results['y_pred'])
            print(current_steps,
                  f"tls= {loss.item():.4f}, v_loss={val_loss:.4f}",
                  f"v_acc={val_acc:.4f}, v_s_acc={val_strict_acc:.4f}, count={pred_count}, ")
            
            if val_acc>=0.69:
                save_checkpoint(
                    model,
                    CKPT_DIR,
                    current_steps,
                    mark=f"val_acc={val_results['val_acc']:.4f}",
                )
    
#         if current_steps % 100 == 0 and current_steps > 0: 
#             save_checkpoint(
#                 model,
#                 CKPT_DIR,
#                 current_steps,
#                 mark=f"val_acc={val_results['val_acc']:.4f}",
#             )
    
        bar_postfix = dict()
        bar_postfix['tls'] = loss.item()
        bar_postfix['val_loss'] = val_loss
        bar_postfix['val_acc'] = val_acc
        bar_postfix['pred'] = pred_count

        progress_bar.set_postfix(bar_postfix)


print("Finished training!")

  0%|          | 0/80 [00:00<?, ?it/s]

Start validation
50 tls= 0.1781, v_loss=1.3426 v_acc=0.1667, v_s_acc=0.1667, count=[2, 3, 1], 
Finished training!


## 讀取最好的model看validation的strict acc

In [30]:
ckpt_name = 'val_acc=0.7038+vloss=1.04_model.3700.pt'
model = load_model(model, ckpt_name,
                   'checkpoints/claim_verification/ckiplab/bert-base-chinese-e20_bs40_1e-05_top5')

In [31]:
val_results = run_evaluation(
                model, eval_dataloader, device, use_tqdm=False)

In [32]:
val_df = eval_dataloader.dataset.data
val_y_true = val_df['label'].apply(lambda x:LABEL2ID[x]).values
val_y_pred = np.array(val_results['y_pred_label'])
val_acc = (val_y_true == val_y_pred).mean()

In [33]:
set_len = val_df['predicted_evidence'].apply(lambda x:set(map(tuple, x))).apply(lambda x:len(x))
original_len = val_df['predicted_evidence'].apply(lambda x:len(x))
redundent_evi_num = (set_len != original_len).sum()
print("有多少筆資料證據句有預測到重覆的:", redundent_evi_num)
redundent_evi_num / len(val_df)

val_df[set_len != original_len]

有多少筆資料證據句有預測到重覆的: 0


Unnamed: 0,id,label,claim,evidence,predicted_pages,predicted_evidence,evidence_list


In [34]:
correct_count = 0
for i, row in val_df[val_y_true == val_y_pred].iterrows():
    if row['label'].upper() == 'NOT ENOUGH INFO':
        correct_count += 1
        continue
     
    pre_evi = set(map(tuple, row['predicted_evidence']))
    for evidence in row['evidence']:
        evidence = [(x[2], x[3]) for x in evidence]
        evidence = set(evidence)
        intersection = evidence.intersection(pre_evi)
        if len(intersection) == len(evidence):
            correct_count += 1
            break

val_strict_acc = correct_count / len(val_df)
print(f'label acc: {val_acc:.4f}')
print(f'strict acc: {val_strict_acc:.4f}')

label acc: 0.7038
strict acc: 0.6339


### Step 4. Make your submission

In [35]:
TEST_DATA = load_json(f"data/public+private_test_doc10sent100.jsonl")

# 把 top10的sent變成top5
for instance in TEST_DATA:
    instance['predicted_evidence'] = instance['predicted_evidence'][:top_n_sent]

for instance in TEST_DATA:
    if len(instance['predicted_evidence'])>top_n_sent:
        print(instance)

test_df = join_with_topk_evidence(
    pd.DataFrame(TEST_DATA),
    mapping,
    mode="eval",
    topk=EVIDENCE_TOPK,
)

test_dataset = AicupTopkEvidenceBERTDataset(
    is_eval = True,
    data=test_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)
test_dataloader = DataLoader(test_dataset, batch_size=32)

Extracting evidence_list for the eval mode ...
0    [顯微鏡泛指將微小不可見或難見物品之影像放大 ， 而能被肉眼或其他成像儀器觀察之工具 。, ...
1    [蠶產絲 ， 蜜蜂產蜂蜜 ， 兩者都已被人類馴化 。, 許多昆蟲被認爲是對生態有益的捕食者 ...
2    [綠山城縣  ， 是波蘭的縣份 ， 位於該國西部 ， 由盧布斯卡省負責管轄 ， 首府設於綠山...
3    [《 魂斷藍橋 》 （ Waterloo Bridge ） 是美國黑白電影 ， 由米高梅電影...
4    [2015年以 《 刺客聶隱娘 》 獲得第68屆坎城影展最佳導演獎及第52屆金馬獎最佳導演獎...
Name: evidence_list, dtype: object


In [36]:
set_len = test_df['predicted_evidence'].apply(lambda x:set(map(tuple, x))).apply(lambda x:len(x))
original_len = test_df['predicted_evidence'].apply(lambda x:len(x))
redundent_evi_num = (set_len != original_len).sum()
print("有多少筆資料證據句有預測到重覆的:", redundent_evi_num)
redundent_evi_num / len(test_df)

test_df[set_len != original_len]

有多少筆資料證據句有預測到重覆的: 1


Unnamed: 0,id,claim,predicted_pages,hanlp_results,google_search_pages,predicted_evidence,evidence_list
6802,12924,知道自己處於妊娠狀態但流產的婦女有一到兩成。,"[狼, 女性健康, 性傾向, 美國非道德人體實驗, 林鄭月娥, 吸血新世紀4：破曉傳奇上集,...","[知道自己處於妊娠狀態但流產的婦女, 自己, 妊娠狀態, 婦女]","[女性健康, 表親婚, 性取向, 美國非道德人體實驗, 安妮_(大不列顛君主), 性傾向, ...","[[女性健康, 13], [安妮_(英國女王), 6], [安妮_(英國女王), 6], [...",[就算已開發國家 ， 妊娠及分娩仍對女性有一定的風險 ， 每世界每年孕產婦死亡的人數爲25萬...


Prediction

In [None]:
predicted_label = run_predict(model, test_dataloader, device)

Predicting:   0%|          | 0/283 [00:00<?, ?it/s]

In [None]:
display(pd.Series(predicted_label).value_counts())
pd.Series(predicted_label).value_counts() / len(predicted_label)

# DEV
# supports           0.445923
# refutes            0.279534
# NOT ENOUGH INFO    0.274542

# MAX SEP LEN = 312
# 1    537
# 0    344
# 2    108

# MAX SEP LEN = 512 publci + private
# 0    4399
# 1    2352
# 2    2287

In [None]:
predict_dataset = test_df.copy()
predict_dataset["predicted_label"] = list(map(ID2LABEL.get, predicted_label))

predict_dataset.loc[predict_dataset['predicted_evidence'].apply(len)==0, 'predicted_label'] = 'NOT ENOUGH INFO'
predict_dataset[["id", "predicted_label", "predicted_evidence"]].to_json(
    f"{ckpt_name}-{OUTPUT_FILENAME}",
    orient="records",
    lines=True,
    force_ascii=False,
)