In [11]:
from underthesea import sent_tokenize
import os
from tqdm import tqdm

def split_text_into_chunks(text, chunk_size=100, window_size=50):
    """Split a long text into multiple chunks (passages) with managable sizes.
    
    Args:
        chunk_size (int): Maximum size of a chunk.
        window_size (int): Decide how many words are overlapped between two consecutive chunks. Basically #overlapped_words = chunk_size - window_size.
    Returns:
        str: Multiple chunks of text splitted from initial document text.
    """
    words = text.split()
    num_words = len(words)
    chunks = []
    start_idx = 0

    while True:
        end_idx = start_idx + chunk_size
        chunk = " ".join(words[start_idx:end_idx])
        chunks.append(chunk)
        if end_idx >= num_words:
            break
        start_idx += window_size

    return chunks

def get_corpus(data_dir="data/hainong_raw/"):
    """Transform a corpus of documents into a corpus of passages.
    
    Args:
        data_dir (str): directory that contains .txt files, each file contains text content of a wikipedia page.
    Returns:
        str: A corpus of chunks splitted from multiple initial documents. Each chunk will contain information about (id, title, passage)
    """
    corpus = []
    meta_corpus = []
    data_dir = "data/hainong_raw/"
    filenames = os.listdir(data_dir)
    filenames = sorted(filenames)
    
    _id = 0
    docs = {}
    for filename in tqdm(filenames):
        filepath = data_dir + filename
        title = filename.strip(".md")
        with open(filepath, "r") as f:
            text = f.read()
            docs[title] = text
            text = text.lstrip(title).strip()

            # No overlap.
            chunks = split_text_into_chunks(text, chunk_size=150, window_size=100)
            chunks = [f"{chunk}" for chunk in chunks]
            meta_chunks = [{
                "title": title,
                "passage": chunks[i],
                "id": _id + i,
                "len": len(chunks[i].split())
            } for i in range(len(chunks))]
            _id += len(chunks)
            corpus.extend(chunks)
            meta_corpus.extend(meta_chunks)
    return meta_corpus

In [12]:
meta_corpus = get_corpus("../../app/api/data/training_data/")
print(f">>> Corpus size: {len(meta_corpus)}")
print(f">>> Example passage")


100%|██████████| 52/52 [00:00<00:00, 164.63it/s]

>>> Corpus size: 184
>>> Example passage





In [17]:
import json
chunk_size = 150
chunk_overlap = 50
with open(f"data/corpus_chunks_{chunk_size}_{chunk_overlap}.jsonl", "w") as outfile:
    for chunk in meta_corpus:
        d = json.dumps(chunk, ensure_ascii=False) + "\n"
        outfile.write(d)

In [15]:
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize
from tqdm import tqdm 
import numpy as np
# model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base')

segmented_corpus = [tokenize(example["passage"]) for example in tqdm(meta_corpus)]
embeddings_output = model.encode(segmented_corpus)
embeddings = embeddings_output/(np.linalg.norm(embeddings_output, axis=1)[:, np.newaxis])

No sentence-transformers model found with name VoVanPhuc/sup-SimCSE-VietNamese-phobert-base. Creating a new one with MEAN pooling.


100%|██████████| 184/184 [00:00<00:00, 330.22it/s]


In [16]:
chunk_size
chunk_overlap
import pickle
with open(f'data/corpus_embedding_w150_{chunk_size}_{chunk_overlap}.pkl', 'wb') as f:
    pickle.dump(embeddings, f)