In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from collections import defaultdict
import pickle
import os
import json
from tqdm import tqdm
import nltk
from konlpy.tag import Okt
import joblib
import string
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.util import ngrams
import gc
from joblib import dump
from joblib import load
from multiprocessing import Pool, cpu_count


# Fetch the NLTK resources this notebook relies on; nltk.download reports
# when a package is already up to date.
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Module-level NLP helpers shared by every preprocessing call.
okt = Okt()
lemmatizer = WordNetLemmatizer()

# One Snowball stemmer per language code handled by the stemming branch.
stemmer_dict = {
    code: SnowballStemmer(snowball_name)
    for code, snowball_name in (
        ('fr', 'french'),
        ('de', 'german'),
        ('es', 'spanish'),
        ('it', 'italian'),
        ('en', 'english'),
    )
}

def load_stopwords(languages=['english', 'french', 'german', 'spanish', 'italian']):
    """Return the union of the NLTK stop-word lists for ``languages``.

    Args:
        languages: Iterable of NLTK stop-word corpus names. The default list
            is only read here, never mutated.

    Returns:
        A single ``set`` holding the stop words of every requested language.
    """
    per_language_words = (nltk.corpus.stopwords.words(name) for name in languages)
    return set().union(*per_language_words)

# One merged stop-word set, applied to tokens of every language during preprocessing.
stop_words = load_stopwords(
    languages=['english', 'french', 'german', 'spanish', 'italian'],
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# tokenize and 2-grams on 'fr', 'de', 'es', 'it'
def preprocess_text(text, lang):
    """Normalize one document or query into a space-joined token string.

    Steps, in order: strip ASCII punctuation, tokenize, drop stop words,
    lemmatize (``'en'``) or stem (``'fr'``, ``'de'``, ``'es'``, ``'it'``),
    and for the four stemmed languages append ``_``-joined bigrams.

    Args:
        text: Raw text. Anything that is not a ``str`` is treated as ``""``.
        lang: Language code; selects the tokenizer and the normalization step.

    Returns:
        The processed tokens joined by single spaces.
    """
    raw = text if isinstance(text, str) else ""
    stripped = raw.translate(str.maketrans('', '', string.punctuation))

    # Tokenizer choice: Okt morphemes for Korean, NLTK for the European
    # languages, plain whitespace splitting for everything else.
    if lang == 'ko':
        words = okt.morphs(stripped)
    elif lang in ('en', 'fr', 'de', 'es', 'it'):
        words = nltk.word_tokenize(stripped)
    else:
        words = stripped.split()

    # Stop-word membership is tested case-insensitively, but the surviving
    # tokens keep their original casing.
    kept = [token for token in words if token.lower() not in stop_words]

    stemmed_langs = ('fr', 'de', 'es', 'it')
    if lang == 'en':
        kept = [lemmatizer.lemmatize(token) for token in kept]
    elif lang in stemmed_langs:
        stemmer = stemmer_dict.get(lang)
        if stemmer:
            kept = [stemmer.stem(token) for token in kept]

    # Bigrams are appended after the unigrams, only for the stemmed languages.
    if lang in stemmed_langs and len(kept) > 1:
        bigrams = ['_'.join(pair) for pair in ngrams(kept, 2)]
        kept = kept + bigrams

    return ' '.join(kept)

def split_into_batches(data, batch_size):
    """Lazily yield consecutive slices of ``data`` of at most ``batch_size`` items.

    Args:
        data: A sliceable sequence with a length.
        batch_size: Step between slice starts; the last slice may be shorter.

    Yields:
        ``data[start:start + batch_size]`` for each start offset.
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in offsets)

def preprocess_batch(batch):
    """Preprocess one batch of ``(text, lang)`` pairs.

    The batches handed to this worker are slices of ``list(zip(texts, langs))``,
    i.e. a list of pairs. Unpacking the batch itself into two names
    (``texts, langs = batch``) raises ``ValueError`` unless the batch happens
    to contain exactly two pairs, so each pair is unpacked individually.

    Args:
        batch: Iterable of ``(text, lang)`` pairs.

    Returns:
        A list with the preprocessed string for every pair, in input order.
    """
    return [preprocess_text(text, lang) for text, lang in batch]

def preprocess_texts(texts, langs, batch_size=10000, processes=None):
    """Preprocess many texts in batches, optionally across worker processes.

    Args:
        texts: Sequence of raw texts.
        langs: Sequence of language codes, aligned with ``texts``.
        batch_size: Number of ``(text, lang)`` pairs per batch.
        processes: Number of worker processes. ``None`` (the default) uses
            ``cpu_count()``. A value of 1 or less runs every batch serially
            in the current process and never creates a pool.

    Returns:
        A flat list of preprocessed strings, in the same order as ``texts``.
    """
    pairs = list(zip(texts, langs))
    if not pairs:
        # Nothing to do; avoid starting worker processes for no work.
        return []

    batches = list(split_into_batches(pairs, batch_size))
    if processes is None:
        processes = cpu_count()

    if processes <= 1:
        # Serial path. NOTE(review): with the "spawn" start method (the default
        # on Windows) worker processes must be able to import the worker
        # function, which functions defined in a notebook cell are not; pass
        # processes=1 when the pool stalls at 0%.
        results = [
            preprocess_batch(batch)
            for batch in tqdm(batches, total=len(batches), desc="Preprocess texts")
        ]
    else:
        with Pool(processes) as pool:
            # imap (unlike imap_unordered) yields results in submission order,
            # which keeps the output aligned with the inputs.
            results = list(tqdm(pool.imap(preprocess_batch, batches), total=len(batches), desc="Preprocess texts"))

    # Flatten the list of per-batch lists.
    return [item for sublist in results for item in sublist]

In [3]:
# load and save
def load_corpus(corpus_path='corpus.json'):
    """Read a UTF-8 JSON corpus file and return its content as a DataFrame.

    Args:
        corpus_path: Path of the JSON file to read.

    Returns:
        ``pd.DataFrame`` built from the decoded JSON value.
    """
    with open(corpus_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

def load_data(train_path='train.csv', test_path='test.csv'):
    """Read the train and test CSV files.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    train_frame, test_frame = (pd.read_csv(path) for path in (train_path, test_path))
    return train_frame, test_frame

def save_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path``, overwriting any existing file.

    Args:
        obj: Any picklable object.
        path: Destination file path; opened in binary write mode.
    """
    with open(path, mode='wb') as sink:
        pickle.dump(obj, sink)

def load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Args:
        path: Source file path; opened in binary read mode.

    Returns:
        The deserialized object.
    """
    # Unpickling can execute arbitrary code; only use this on trusted files.
    with open(path, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [4]:
# algorithm
class BM25:
    """Okapi BM25 scorer over a tokenized corpus, backed by an inverted index.

    Attributes:
        k1: Term-frequency saturation parameter.
        b: Document-length normalization strength.
        corpus_size: Number of documents.
        doc_lens: Token count of every document, indexed by document position.
        avgdl: Mean document length (0.0 for an empty corpus).
        df: Term -> number of documents containing the term.
        idf: Term -> inverse document frequency weight.
        inverted_index: Term -> list of positions of documents containing it.
        term_freqs: Per-document mapping of term -> occurrence count.
    """

    def __init__(self, tokenized_corpus, k1=1.5, b=0.75):
        """Index ``tokenized_corpus``, a sized sequence of token lists.

        Args:
            tokenized_corpus: Sequence of documents, each a sequence of tokens.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.
        """
        self.k1 = k1
        self.b = b
        self.corpus_size = len(tokenized_corpus)
        # Lengths are stored once so scoring does not re-sum term counts.
        self.doc_lens = [len(doc) for doc in tokenized_corpus]
        # Guard the empty corpus, which would otherwise divide by zero.
        self.avgdl = sum(self.doc_lens) / self.corpus_size if self.corpus_size else 0.0
        self.df = defaultdict(int)
        self.idf = {}
        self.inverted_index = defaultdict(list)
        self.term_freqs = []
        self.build(tokenized_corpus)

    def build(self, tokenized_corpus):
        """Populate term frequencies, document frequencies, postings and IDF."""
        for doc_id, document in enumerate(tqdm(tokenized_corpus, desc="Creating BM25 index")):
            freq = defaultdict(int)
            for word in document:
                freq[word] += 1
            self.term_freqs.append(freq)
            for word in freq.keys():
                self.df[word] += 1
                self.inverted_index[word].append(doc_id)

        # The "1 +" inside the log keeps every IDF weight positive.
        for word, freq in self.df.items():
            self.idf[word] = math.log(1 + (self.corpus_size - freq + 0.5) / (freq + 0.5))

    def _document_lengths(self):
        """Return per-document token counts, deriving them if not stored.

        Instances restored from a file written before ``doc_lens`` existed do
        not carry the attribute; it is rebuilt from ``term_freqs`` on first use.
        """
        lengths = getattr(self, 'doc_lens', None)
        if lengths is None:
            lengths = [sum(freq.values()) for freq in self.term_freqs]
            self.doc_lens = lengths
        return lengths

    def get_scores(self, query):
        """Score every document against ``query``.

        Args:
            query: Iterable of query tokens; duplicates count once.

        Returns:
            ``np.ndarray`` of length ``corpus_size``; documents sharing no
            term with the query score 0.
        """
        scores = np.zeros(self.corpus_size)
        doc_lens = self._document_lengths()
        for word in set(query):
            if word not in self.idf:
                continue
            idf = self.idf[word]
            for doc_id in self.inverted_index[word]:
                tf = self.term_freqs[doc_id][word]
                # BM25 length normalization: 1 - b + b * dl / avgdl. The
                # relative length must be scaled by b, otherwise b = 0 would
                # not switch normalization off.
                norm = 1 - self.b + self.b * doc_lens[doc_id] / self.avgdl
                scores[doc_id] += idf * ((tf * (self.k1 + 1)) / (tf + self.k1 * norm))
        return scores

    def retrieve_top_n(self, query, n=10):
        """Return positions of the ``n`` best-scoring documents, best first.

        Args:
            query: Iterable of query tokens.
            n: Maximum number of results. Values <= 0 give an empty result.

        Returns:
            ``np.ndarray`` of document positions sorted by descending score.
        """
        if n <= 0:
            # -0 == 0, so slicing with [-n:] would return every document.
            return np.empty(0, dtype=np.intp)
        scores = self.get_scores(query)
        if n >= len(scores):
            top_n_indices = np.argsort(scores)[::-1]
        else:
            # Partial selection first, then order only the n selected entries.
            top_n_indices = np.argpartition(scores, -n)[-n:]
            top_n_indices = top_n_indices[np.argsort(scores[top_n_indices])[::-1]]
        return top_n_indices

In [None]:
corpus_df = load_corpus('./data/corpus.json/corpus.json')
train_df, test_df = load_data('./data/train.csv', './data/test.csv')

# Hold out 10% of the training queries for validation (fixed seed).
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)
print(f"Train size: {len(train_data)}, Val size: {len(val_data)}")

# Per-language artifacts are cached in this directory; create it up front so
# the first save does not fail on a missing folder.
os.makedirs('./preprocessed_data', exist_ok=True)

document_langs = corpus_df['lang'].tolist()
document_ids = corpus_df['docid'].tolist()

# Generate or load the preprocessed corpus.
preprocessed_corpus_path = './preprocessed_corpus.pkl'
if os.path.exists(preprocessed_corpus_path):
    print(f"Loading exist preprocessed corpus: {preprocessed_corpus_path}")
    preprocessed_corpus = load_pickle(preprocessed_corpus_path)
else:
    print(f"Preprocessed corpus not found, generating ...")
    document_texts = corpus_df['text'].tolist()
    preprocessed_corpus = preprocess_texts(document_texts, document_langs)
    save_pickle(preprocessed_corpus, preprocessed_corpus_path)
    print(f"Preprocessed corpus saved '{preprocessed_corpus_path}'。")

# Group corpus row positions by language so each language gets its own index.
lang_to_doc_indices = defaultdict(list)
for idx, lang in enumerate(document_langs):
    lang_to_doc_indices[lang].append(idx)

bm25_models = {}
doc_id_maps = {}

for lang, indices in tqdm(lang_to_doc_indices.items(), desc="Processing languages"):
    print(f"Processing '{lang}' ...")
    lang_texts = [preprocessed_corpus[idx] for idx in indices]
    lang_docids = [document_ids[idx] for idx in indices]

    # Tokens: reuse the cached file when present, otherwise tokenize and cache.
    tokenized_lang_path = f'./preprocessed_data/tokenized_{lang}.pkl'
    if os.path.exists(tokenized_lang_path):
        print(f"token {tokenized_lang_path} already exit，skip generation")
        lang_tokenized_corpus = load_pickle(tokenized_lang_path)
    else:
        print(f"Generating tokens for '{lang}' ...")
        if lang == 'ko':
            lang_tokenized_corpus = [okt.morphs(text) for text in tqdm(lang_texts, desc=f"tokenizaion {lang}")]
        else:
            lang_tokenized_corpus = [text.split() for text in lang_texts]
        save_pickle(lang_tokenized_corpus, tokenized_lang_path)
        print(f"Tokens saved to '{tokenized_lang_path}'。")

    # Model: the same path is used for the existence check, the load and the
    # save, so a model written on one run is found again on the next.
    bm25_model_path = f'./preprocessed_data/bm25_model_{lang}.joblib'
    if os.path.exists(bm25_model_path):
        print(f"BM25 model '{bm25_model_path}' already exist，skip generation")
        bm25_model = joblib.load(bm25_model_path)
    else:
        print(f"Generating BM25 model for  '{lang}' ...")
        bm25_model = BM25(lang_tokenized_corpus, k1=1.5, b=0.75)
        # Persist the model that was just built (the undefined name `model`
        # previously raised NameError here).
        dump(bm25_model, bm25_model_path, compress=3)
        print(f"BM25 model saved at '{bm25_model_path}'。")

    bm25_models[lang] = bm25_model
    doc_id_maps[lang] = lang_docids

    # Drop the loop-local references; the model itself stays alive through
    # bm25_models, only the tokenized corpus becomes collectable.
    del lang_tokenized_corpus, bm25_model
    gc.collect()

Train size: 19687, Val size: 2188
Preprocessed corpus not found, generating ...


Preprocess texts:   0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
def evaluate_recall_at_k(bm25_models, doc_id_maps, val_data, k=10):
    """Fraction of validation queries whose positive document is in the top ``k``.

    Args:
        bm25_models: Mapping of language code -> BM25 model for that language.
        doc_id_maps: Mapping of language code -> list of document ids, indexed
            by the positions the model returns.
        val_data: DataFrame with ``query``, ``lang`` and ``positive_docs`` columns.
        k: Number of documents retrieved per query.

    Returns:
        Hits divided by the total number of rows in ``val_data``. Rows whose
        language has no model stay in the denominator, so they count as
        misses. Returns 0 when ``val_data`` is empty.
    """
    n_queries = len(val_data)
    hits = 0

    for _, record in val_data.iterrows():
        query_text = record['query']
        query_lang = record['lang']
        gold_doc = record['positive_docs']

        if query_lang in bm25_models:
            ranker = bm25_models[query_lang]
            lang_doc_ids = doc_id_maps[query_lang]

            # Queries go through the same preprocessing as the corpus.
            cleaned = preprocess_text(query_text, query_lang)
            query_tokens = okt.morphs(cleaned) if query_lang == 'ko' else cleaned.split()

            ranked_positions = ranker.retrieve_top_n(query_tokens, n=k)
            ranked_doc_ids = [lang_doc_ids[pos] for pos in ranked_positions]
            if gold_doc in ranked_doc_ids:
                hits += 1

    if n_queries > 0:
        return hits / n_queries
    return 0

# Score the held-out validation split with the per-language BM25 models.
print("Computing Val Recall@10...")
recall_at_10 = evaluate_recall_at_k(
    bm25_models=bm25_models,
    doc_id_maps=doc_id_maps,
    val_data=val_data,
    k=10,
)
print(f"Val Recall@10: {recall_at_10:.4f}")

In [None]:
def retrieve_test_queries(bm25_models, doc_id_maps, test_df, k=10):
    """Retrieve the top ``k`` document ids for every row of ``test_df``.

    Args:
        bm25_models: Mapping of language code -> BM25 model for that language.
        doc_id_maps: Mapping of language code -> list of document ids, indexed
            by the positions the model returns.
        test_df: DataFrame with ``query`` and ``lang`` columns.
        k: Number of documents retrieved per query.

    Returns:
        A list with one entry per row, in row order: the retrieved document
        ids, best first, or an empty list when the row's language has no model.
    """
    retrieved_docs = []
    # Only `query` and `lang` are read; the previously unused `query_id`
    # lookup made the function fail on frames without that column.
    for _, row in test_df.iterrows():
        query = row['query']
        lang = row['lang']
        if lang not in bm25_models:
            retrieved_docs.append([])
            continue
        bm25_model = bm25_models[lang]
        doc_ids = doc_id_maps[lang]

        # Queries go through the same preprocessing as the corpus.
        query_clean = preprocess_text(query, lang)
        if lang == 'ko':
            tokenized_query = okt.morphs(query_clean)
        else:
            tokenized_query = query_clean.split()

        top_n_indices = bm25_model.retrieve_top_n(tokenized_query, n=k)
        retrieved_docs.append([doc_ids[i] for i in top_n_indices])
    return retrieved_docs

# Run retrieval for every test query with the per-language BM25 models.
print("Retrive on test queries...")
retrieved_docs_test = retrieve_test_queries(
    bm25_models=bm25_models,
    doc_id_maps=doc_id_maps,
    test_df=test_df,
    k=10,
)

In [None]:
# One row per test query: a running integer id and the list of retrieved
# document ids, written without the DataFrame index.
submission_df = pd.DataFrame(
    data={
        'id': np.arange(len(test_df)),
        'docids': retrieved_docs_test,
    }
)
submission_df.to_csv('submission.csv', index=False)
print("'submission.csv' done")