In [1]:
import glob

import pandas as pd
import torch
from loguru import logger
from transformers import AutoModelForCausalLM, BarthezTokenizer

from utils.compute_ppl import compute_ppl_for_documents
from utils.filter import filter_by_image, filter_by_lang
from utils.utils import get_current_datetime

In [2]:
# One suffix per run: it is embedded in the name of every CSV written below,
# so the rule-filter, perplexity and final-filter outputs of a run share it.
# NOTE(review): the exact format comes from utils.utils.get_current_datetime
# and is not visible here.
current_datetime_suffix = get_current_datetime()

In [3]:
# Collect the extracted markdown files and the JSON files that sit beside them.
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting
# makes the row order of every CSV produced later reproducible across runs.
md_files = sorted(glob.glob("../documents/extract_result/*/*/*.md"))
json_files = sorted(glob.glob("../documents/extract_result/*/*/*.json"))

print(f"Num pdf: {len(md_files)}")
# Sanity check: the later cells assume one JSON file per markdown file.
assert len(md_files) == len(json_files), (
    f"Expected one JSON file per markdown file, "
    f"got {len(md_files)} .md and {len(json_files)} .json"
)

Num pdf: 3


In [4]:
# Stage 1: apply the image-based rule to the JSON files.
# NOTE(review): presumably returns the JSON paths that passed — confirm in
# utils.filter.filter_by_image.
json_kept = filter_by_image(json_files, logger)

# Stage 2: map each surviving "*_meta.json" path to its markdown sibling and
# apply the language rule to those markdown files.
md_candidates = [json_path.replace("_meta.json", ".md") for json_path in json_kept]
remain = filter_by_lang(md_files=md_candidates, logger=logger)

  0%|          | 0/3 [00:00<?, ?it/s]

[32m2025-10-06 19:43:43.452[0m | [1mINFO    [0m | [36mutils.filter[0m:[36mfilter_by_image[0m:[36m29[0m - [1mNum remain: 3:,[0m


  0%|          | 0/3 [00:00<?, ?it/s]

[32m2025-10-06 19:43:43.485[0m | [1mINFO    [0m | [36mutils.filter[0m:[36mfilter_by_lang[0m:[36m46[0m - [1mNum remain: 3[0m


In [5]:
# Persist the paths that survived the rule-based filters, one per row.
rule_survivors = pd.Series(remain)
rules_csv_path = (
    f"../documents/filtered_documents/filter_by_rules_{current_datetime_suffix}.csv"
)
rule_survivors.to_csv(rules_csv_path, index=False)

In [6]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
    gpu_name = torch.cuda.get_device_name()
    logger.info(f"Using CUDA GPU: {gpu_name}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    logger.info(f"GPU memory: {total_gb:.1f}GB")
else:
    # Older torch builds have no `torch.backends.mps` attribute at all, so
    # look it up defensively before asking whether it is available.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = "mps"
        logger.info("Using Apple MPS")
    else:
        device = "cpu"
        logger.info("Using CPU - you will need to use a GPU to train models")

[32m2025-10-06 19:43:43.521[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mUsing Apple MPS[0m


In [7]:
# Load the language model and its tokenizer onto the device chosen above.
model_name = "airesearch/wangchanbart-base"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
tokenizer = BarthezTokenizer.from_pretrained(model_name)

# Context window size as declared by the model's own configuration.
max_length = model.config.max_position_embeddings

# Positional arguments for the scoring helper, kept in their original order.
# NOTE(review): 512 is presumably the sliding-window stride — confirm against
# the signature of utils.compute_ppl.compute_ppl_for_documents.
ppl_args = (max_length, 512, remain, model, tokenizer, logger, device)
ppl_list = compute_ppl_for_documents(*ppl_args)

  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Tabulate the scores as (file_path, ppl_score) rows and save them for this run.
ppl_columns = ["file_path", "ppl_score"]
ppldf = pd.DataFrame(data=ppl_list, columns=ppl_columns)

ppl_csv_path = f"../documents/ppl/perplexity_score_{current_datetime_suffix}.csv"
ppldf.to_csv(ppl_csv_path, index=False)

In [9]:
# Keep only the documents whose perplexity is strictly below the cut-off.
# NOTE(review): 1500 is an unexplained threshold; record how it was chosen.
PPL_THRESHOLD = 1500
filtered = ppldf.loc[ppldf["ppl_score"].lt(PPL_THRESHOLD)].copy()

# Drop only the leading "../" from each path. The previous
# str.replace("../", "") removed every occurrence, which would also rewrite a
# path that contains an interior ".." segment.
# NOTE(review): assumes the paths still carry the "../" prefix used when the
# files were globbed — confirm compute_ppl_for_documents returns them unchanged.
filtered["file_path"] = filtered["file_path"].str.removeprefix("../")

filtered.to_csv(
    f"../documents/filtered_documents/filter_by_ppl_{current_datetime_suffix}.csv",
    index=False,
)