In [1]:
import re
import jieba
import scipy
import numpy as np
import pandas as pd
from pathlib import Path
from functools import partial

# Download "dict.txt.big" from https://github.com/fxsjy/jieba
jieba.set_dictionary("../data/dict.txt.big")

from sklearn.feature_extraction.text import TfidfVectorizer

from pandarallel import pandarallel
# Adjust the number of workers if you want
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

from tqdm import tqdm
tqdm.pandas() # for progress_apply

from tfidf_utils import (
    load_json,
    jsonl_dir_to_df,
    calculate_precision,
    calculate_recall,
)

In [2]:
# Get the stopwords
# https://github.com/bryanchw/Traditional-Chinese-Stopwords-and-Punctuations-Library
from TCSP import read_stopwords_list

stopwords = read_stopwords_list()

In [3]:
def tokenize(text: str, stopwords: list) -> str:

    tokens = jieba.lcut(text)

    return " ".join([w for w in tokens if w not in stopwords])

In [4]:
def get_pred_docs_sklearn(
    claim: str,
    tokenizing_method: callable,
    vectorizer: TfidfVectorizer,
    tf_idf_matrix: scipy.sparse.csr_matrix,
    wiki_pages: pd.DataFrame,
    topk: int,
) -> set:
    
    tokens = tokenizing_method(claim)
    claim_vector = vectorizer.transform([tokens])
    similarity_scores = tf_idf_matrix.dot(claim_vector.T)

    # `similarity_scores` shape: (num_wiki_pages x 1)
    similarity_scores = similarity_scores.toarray()[:, 0]  # flatten the array

    # Sort the similarity scores in descending order
    sorted_indices = np.argsort(similarity_scores)[::-1]
    topk_sorted_indices = sorted_indices[:topk]

    # Get the wiki page names based on the topk sorted indices 
    results = wiki_pages.iloc[topk_sorted_indices]["id"]

    exact_matchs = []
    
    # Basically, we check if a result is exactly not mentioned in the claim.
    for result in results:
        if (
            (result not in claim)
            or (result not in claim.replace(" ", "")) # E.g., MS DOS -> MSDOS
            or (result.replace("·", "") not in claim) # E.g., 湯姆·克魯斯 -> 湯姆克魯斯
            or (result.replace("-", "") not in claim) # E.g., X-SAMPA -> XSAMPA
        ):
            exact_matchs.append(result)
        elif "·" not in result:
            splitted = result.split("·") # E.g., 阿爾伯特·愛因斯坦 -> 愛因斯坦
            for split in splitted:
                if split not in claim:
                    exact_matchs.append(result)
                    break

    return set(exact_matchs)

In [5]:
wiki_cache = "wiki"
target_column = "text"
wiki_path = "../data/wiki-pages"

wiki_cache_path = Path(f"../data/{wiki_cache}.pkl")
if wiki_cache_path.exists():
    wiki_pages = pd.read_pickle(wiki_cache_path)
else:
    # You need to download `wiki-pages.zip` from the AICUP website
    wiki_pages = jsonl_dir_to_df(wiki_path)
    # wiki_pages are combined into one dataframe, so we need to reset the index
    wiki_pages = wiki_pages.reset_index(drop=True)

    # tokenize the text and keep the result in a new column `processed_text`
    wiki_pages["processed_text"] = wiki_pages[target_column].parallel_apply(
        partial(tokenize, stopwords=stopwords)
    )
    # save the result to a pickle file
    wiki_pages.to_pickle(wiki_cache_path, protocol=4)

In [6]:
# Hyperparameters

min_wiki_length = 5
topk = 1
min_df = 0
max_df = 0.8
use_idf = True
sublinear_tf = True
norm = None

# Build the TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=max_df,
    min_df=min_df,
    use_idf=use_idf,
    sublinear_tf=sublinear_tf,
    norm=norm,
    stop_words=stopwords,
    token_pattern=r"(?u)\b\w+\b",
)

In [7]:
wiki_pages = wiki_pages[
    wiki_pages['processed_text'].str.len() > min_wiki_length
]
corpus = wiki_pages["processed_text"].tolist()

In [8]:
# Start to encode the corpus with TF-IDF
X = vectorizer.fit_transform(corpus)

In [None]:
train = load_json("../data/public_train_new.jsonl")
train_df = pd.DataFrame(train)

# Perform the prediction for document retrieval
train_df["predictions"] = train_df["claim"].progress_apply(
    partial(
        get_pred_docs_sklearn,
        tokenizing_method=partial(tokenize, stopwords=stopwords),
        vectorizer=vectorizer,
        tf_idf_matrix=X,
        wiki_pages=wiki_pages,
        topk=topk,
    )
)

In [10]:
precision = calculate_precision(train, train_df["predictions"])
recall = calculate_recall(train, train_df["predictions"])

Precision: 0.14421572168051042
Recall: 0.13107419445447616


In [26]:
train_df["predictions"].to_pickle('../data/train_tfidf_doc1.pkl')

In [24]:
test = load_json("../data/public_test.jsonl")
test_df = pd.DataFrame(test)

test_df["predictions"] = test_df["claim"].progress_apply(
    partial(
        get_pred_docs_sklearn,
        tokenizing_method=partial(tokenize, stopwords=stopwords),
        vectorizer=vectorizer,
        tf_idf_matrix=X,
        wiki_pages=wiki_pages,
        topk=topk,
    )
)

100%|██████████| 989/989 [08:00<00:00,  2.06it/s]


In [27]:
test_df["predictions"].to_pickle('../data/test_tfidf_doc1.pkl')

In [31]:
private = load_json("../data/private_test_data.jsonl")
private_df = pd.DataFrame(private)

private_df["predictions"] = private_df["claim"].progress_apply(
    partial(
        get_pred_docs_sklearn,
        tokenizing_method=partial(tokenize, stopwords=stopwords),
        vectorizer=vectorizer,
        tf_idf_matrix=X,
        wiki_pages=wiki_pages,
        topk=topk,
    )
)

100%|██████████| 8049/8049 [1:05:58<00:00,  2.03it/s]


In [32]:
private_df["predictions"].to_pickle('../data/private_tfidf_doc1.pkl')

In [28]:
mix = pd.read_pickle('../data/train_tfidf_doc5.pkl')

for i in range(len(mix)):
    mix.iloc[i].update(train_df["predictions"].iloc[i])

mix.to_pickle('../data/train_tfidf_doc6_mix.pkl')

In [29]:
precision = calculate_precision(train, mix)
recall = calculate_recall(train, mix)

Precision: 0.701597046667467
Recall: 0.8595170739536935


In [30]:
test_mix = pd.read_pickle('../data/test_tfidf_doc5.pkl')

for i in range(len(test_mix)):
    test_mix.iloc[i].update(test_df["predictions"].iloc[i])

test_mix.to_pickle('../data/test_tfidf_doc6_mix.pkl')

In [34]:
private_mix = pd.read_pickle('../data/private_tfidf_doc5.pkl')

for i in range(len(private_mix)):
    private_mix.iloc[i].update(private_df["predictions"].iloc[i])

private_mix.to_pickle('../data/private_tfidf_doc6_mix.pkl')