In [7]:
import os
import re
import json
import random
import pickle
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Set, Tuple, Union

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

# Configure pandarallel: progress bars on, verbose=0, 4 workers.
# NOTE(review): no cell visible in this notebook calls a pandarallel method
# afterwards -- confirm this initialisation is still needed.
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

In [8]:
seed = 42


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds ``random``, NumPy and PyTorch.
    When CUDA is available it additionally calls ``torch.cuda.manual_seed``
    and sets cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)  # if using multiple GPUs
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(seed)

In [9]:
# Label string -> class index; the index is the position in the tuple below.
LABEL2ID: Dict[str, int] = {
    name: idx
    for idx, name in enumerate(("supports", "refutes", "NOT ENOUGH INFO"))
}
# Inverse lookup: class index -> label string.
ID2LABEL: Dict[int, str] = {idx: name for name, idx in LABEL2ID.items()}

In [10]:
def run_predict(model: torch.nn.Module, test_dl: DataLoader, device) -> np.ndarray:
    """Run ``model`` over every batch of ``test_dl`` and collect its logits.

    Args:
        model: Module whose forward call accepts the batch as keyword
            arguments and returns an object exposing a ``.logits`` tensor.
        test_dl: Iterable of batches; each batch is a dict of tensors.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        A NumPy array holding the logits of all batches stacked in iteration
        order (one row per example).
    """
    model.eval()

    preds = []
    # Inference only: without no_grad() autograd records a graph for every
    # batch, which wastes memory and time for results that are never
    # back-propagated.
    with torch.no_grad():
        for batch in tqdm(test_dl,
                          total=len(test_dl),
                          leave=False,
                          desc="Predicting"):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # tolist() yields plain Python floats, so the final array is float64.
            preds.extend(logits.tolist())
    return np.array(preds)

In [11]:
# Batch sizes (train and test use the same value), seed and tokenizer length cap.
TRAIN_BATCH_SIZE = TEST_BATCH_SIZE = 8
SEED = 42
MAX_SEQ_LEN = 256

In [12]:
class AicupTopkEvidenceBERTDataset(BERTDataset):
    """AICUP dataset with top-k evidence sentences."""

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Tokenize the claim and its evidence sentences for row ``idx``.

        The claim and evidence sentences are joined with `` [SEP] `` and
        tokenized to exactly ``self.max_length`` tokens (padded / truncated).
        Relies on ``self.data``, ``self.topk``, ``self.tokenizer`` and
        ``self.max_length``, which are presumably set by ``BERTDataset``
        (not visible here).

        Args:
            idx: Positional row index into ``self.data``.

        Returns:
            The tokenizer outputs as tensors, plus a ``"labels"`` tensor when
            the row has a ``"label"`` entry.
        """
        item = self.data.iloc[idx]
        claim = item["claim"]

        # Copy before padding: the row hands back the very list object stored
        # in the DataFrame, so extending it in place would write "[PAD]"
        # entries into the source data shared by every dataset built on it.
        evidence = list(item["evidence_list"])
        # In case there are less than topk evidence sentences
        evidence.extend(["[PAD]"] * (self.topk - len(evidence)))
        concat_claim_evidence = " [SEP] ".join([claim, *evidence])

        concat = self.tokenizer(
            concat_claim_evidence,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        concat_ten = {k: torch.tensor(v) for k, v in concat.items()}

        # Unlabelled rows (e.g. test data) simply carry no "labels" key.
        if "label" in item:
            concat_ten["labels"] = torch.tensor(LABEL2ID[item["label"]])

        return concat_ten

In [None]:
# Pretrained backbones; position i is the source of tokenizer_{i+1}.
BACKBONE_NAMES = (
    'hfl/chinese-roberta-wwm-ext-large',
    'bert-base-chinese',
    'hfl/chinese-bert-wwm-ext',
    'hfl/chinese-macbert-large',
    'hfl/chinese-lert-large',
)
# Number of ensemble members built on each backbone, in the same order:
# model_1..3, model_4..5, model_6, model_7..8, model_9..10.
MEMBERS_PER_BACKBONE = (3, 2, 1, 2, 2)

tokenizer_1, tokenizer_2, tokenizer_3, tokenizer_4, tokenizer_5 = (
    AutoTokenizer.from_pretrained(name) for name in BACKBONE_NAMES
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def _new_classifier(pretrained_name: str):
    """Build a ``len(LABEL2ID)``-way classifier from ``pretrained_name`` on ``device``."""
    classifier = AutoModelForSequenceClassification.from_pretrained(
        pretrained_name,
        num_labels=len(LABEL2ID),
    )
    # Module.to() returns the module itself, so the moved model can be returned directly.
    return classifier.to(device)


# Same variable names as before so later cells keep working; the cell no
# longer ends in a bare `.to(device)` expression, so no model repr is printed.
(
    model_1, model_2, model_3,
    model_4, model_5,
    model_6,
    model_7, model_8,
    model_9, model_10,
) = (
    _new_classifier(name)
    for name, member_count in zip(BACKBONE_NAMES, MEMBERS_PER_BACKBONE)
    for _ in range(member_count)
)

In [15]:
# Checkpoint folders carry a "(model_k)" suffix that follows the tokenizer /
# backbone numbering (1..5), not the ensemble member numbering (1..10).
ckpt_dir_1 = './checkpoints/claim_verification/e10_bs48_5e-05_top5(model_1)'
ckpt_dir_2 = './checkpoints/claim_verification/e10_bs48_5e-05_top5(model_2)'
ckpt_dir_3 = './checkpoints/claim_verification/e10_bs48_5e-05_top5(model_3)'
ckpt_dir_4 = './checkpoints/claim_verification/e10_bs48_5e-05_top5(model_4)'
ckpt_dir_5 = './checkpoints/claim_verification/e10_bs48_5e-05_top5(model_5)'

# Members 1-3 share folder 1.
model_1 = load_model(model_1, 'val_acc=0.7083_model.575.pt', ckpt_dir_1)
model_2 = load_model(model_2, 'val_acc=0.7250_model.1375.pt', ckpt_dir_1)
model_3 = load_model(model_3, 'val_acc=0.7233_model.800.pt', ckpt_dir_1)
# Members 4-5 share folder 2.
model_4 = load_model(model_4, 'val_acc=0.6971_model.875.pt', ckpt_dir_2)
model_5 = load_model(model_5, 'val_acc=0.6923_model.800.pt', ckpt_dir_2)
# Member 6 is the only one in folder 3.
model_6 = load_model(model_6, 'val_acc=0.6966_model.1725.pt', ckpt_dir_3)
# Members 7-8 share folder 4.
model_7 = load_model(model_7, 'val_acc=0.6984_model.1225.pt', ckpt_dir_4)
model_8 = load_model(model_8, 'val_acc=0.6979_model.725.pt', ckpt_dir_4)
# Members 9-10 share folder 5.
model_9 = load_model(model_9, 'val_acc=0.7276_model.1275.pt', ckpt_dir_5)
model_10 = load_model(model_10, 'val_acc=0.7276_model.850.pt', ckpt_dir_5)

In [16]:
TEST_PKL_FILE = Path("./data/test_doc7sent5.pkl")

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with TEST_PKL_FILE.open("rb") as f:
    test_df = pickle.load(f)

# One dataset per tokenizer, all built on the same DataFrame.
(
    test_dataset_1,
    test_dataset_2,
    test_dataset_3,
    test_dataset_4,
    test_dataset_5,
) = (
    AicupTopkEvidenceBERTDataset(
        test_df,
        tokenizer=tok,
        max_length=MAX_SEQ_LEN,
    )
    for tok in (tokenizer_1, tokenizer_2, tokenizer_3, tokenizer_4, tokenizer_5)
)

# Matching dataloaders; no shuffling is requested.
(
    test_dataloader_1,
    test_dataloader_2,
    test_dataloader_3,
    test_dataloader_4,
    test_dataloader_5,
) = (
    DataLoader(ds, batch_size=TEST_BATCH_SIZE)
    for ds in (
        test_dataset_1,
        test_dataset_2,
        test_dataset_3,
        test_dataset_4,
        test_dataset_5,
    )
)

In [None]:
# Each model is paired with the dataloader built from its own tokenizer.
(
    logit_1, logit_2, logit_3,
    logit_4, logit_5,
    logit_6,
    logit_7, logit_8,
    logit_9, logit_10,
) = (
    run_predict(member, loader, device)
    for member, loader in (
        (model_1, test_dataloader_1),
        (model_2, test_dataloader_1),
        (model_3, test_dataloader_1),
        (model_4, test_dataloader_2),
        (model_5, test_dataloader_2),
        (model_6, test_dataloader_3),
        (model_7, test_dataloader_4),
        (model_8, test_dataloader_4),
        (model_9, test_dataloader_5),
        (model_10, test_dataloader_5),
    )
)

In [18]:
# Weighted mean of the ten logit arrays: members 1, 2 and 8 count twice,
# the others once (weights sum to 13).
ensemble_weights = (2, 2, 1, 1, 1, 1, 1, 2, 1, 1)
member_logits = (
    logit_1, logit_2, logit_3, logit_4, logit_5,
    logit_6, logit_7, logit_8, logit_9, logit_10,
)
pred = sum(
    logits * weight for logits, weight in zip(member_logits, ensemble_weights)
) / sum(ensemble_weights)
# Index of the highest averaged logit per row.
predicted_label = torch.Tensor(pred).argmax(dim=1).tolist()

In [19]:
OUTPUT_FILENAME = 'submission(public_test).jsonl'

# Attach the label strings to a copy so test_df itself stays untouched.
predict_dataset = test_df.copy()
predict_dataset["predicted_label"] = [
    ID2LABEL.get(label_id) for label_id in predicted_label
]

# One JSON record per line, non-ASCII characters written as-is.
public_submission = predict_dataset[["id", "predicted_label", "predicted_evidence"]]
public_submission.to_json(
    OUTPUT_FILENAME,
    orient="records",
    lines=True,
    force_ascii=False,
)

In [21]:
PRIVATE_PKL_FILE = Path("./data/private_doc7sent5.pkl")

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with PRIVATE_PKL_FILE.open("rb") as f:
    private_df = pickle.load(f)

# One dataset per tokenizer, all built on the same DataFrame.
(
    private_dataset_1,
    private_dataset_2,
    private_dataset_3,
    private_dataset_4,
    private_dataset_5,
) = (
    AicupTopkEvidenceBERTDataset(
        private_df,
        tokenizer=tok,
        max_length=MAX_SEQ_LEN,
    )
    for tok in (tokenizer_1, tokenizer_2, tokenizer_3, tokenizer_4, tokenizer_5)
)

# Matching dataloaders; no shuffling is requested.
(
    private_dataloader_1,
    private_dataloader_2,
    private_dataloader_3,
    private_dataloader_4,
    private_dataloader_5,
) = (
    DataLoader(ds, batch_size=TEST_BATCH_SIZE)
    for ds in (
        private_dataset_1,
        private_dataset_2,
        private_dataset_3,
        private_dataset_4,
        private_dataset_5,
    )
)

In [None]:
# Each model is paired with the dataloader built from its own tokenizer.
# This rebinds logit_1..logit_10, replacing any earlier values under those names.
(
    logit_1, logit_2, logit_3,
    logit_4, logit_5,
    logit_6,
    logit_7, logit_8,
    logit_9, logit_10,
) = (
    run_predict(member, loader, device)
    for member, loader in (
        (model_1, private_dataloader_1),
        (model_2, private_dataloader_1),
        (model_3, private_dataloader_1),
        (model_4, private_dataloader_2),
        (model_5, private_dataloader_2),
        (model_6, private_dataloader_3),
        (model_7, private_dataloader_4),
        (model_8, private_dataloader_4),
        (model_9, private_dataloader_5),
        (model_10, private_dataloader_5),
    )
)

In [23]:
# Weighted mean of the ten logit arrays: members 1, 2 and 8 count twice,
# the others once (weights sum to 13).
ensemble_weights = (2, 2, 1, 1, 1, 1, 1, 2, 1, 1)
member_logits = (
    logit_1, logit_2, logit_3, logit_4, logit_5,
    logit_6, logit_7, logit_8, logit_9, logit_10,
)
pred = sum(
    logits * weight for logits, weight in zip(member_logits, ensemble_weights)
) / sum(ensemble_weights)
# Index of the highest averaged logit per row.
predicted_label = torch.Tensor(pred).argmax(dim=1).tolist()

In [24]:
OUTPUT_FILENAME = 'submission(private_test).jsonl'

# Attach the label strings to a copy so private_df itself stays untouched.
private_predict_dataset = private_df.copy()
private_predict_dataset["predicted_label"] = [
    ID2LABEL.get(label_id) for label_id in predicted_label
]

# One JSON record per line, non-ASCII characters written as-is.
private_submission = private_predict_dataset[
    ["id", "predicted_label", "predicted_evidence"]
]
private_submission.to_json(
    OUTPUT_FILENAME,
    orient="records",
    lines=True,
    force_ascii=False,
)

In [26]:
# Stack the public rows on top of the private rows and renumber from 0.
submission_columns = ["id", "predicted_label", "predicted_evidence"]
test_all = pd.concat(
    [
        predict_dataset[submission_columns],
        private_predict_dataset[submission_columns],
    ]
).reset_index(drop=True)

In [27]:
OUTPUT_FILENAME = 'submission(all_test).jsonl'

# test_all already holds the combined predictions, so it is written as-is.
# The copy-pasted rebuild of private_predict_dataset that used to sit here
# had no effect on test_all or on the written file and has been dropped.
# One JSON record per line, non-ASCII characters written as-is.
test_all.to_json(
    OUTPUT_FILENAME,
    orient="records",
    lines=True,
    force_ascii=False,
)