In [15]:
import os
import re
import underthesea
from tqdm import tqdm

In [34]:
def normalize_vietnamese_text(text):
   """Lower-case text, drop unsupported characters and tidy punctuation spacing.

   Steps, in order:
     1. Lower-case, trim, and collapse every whitespace run (newlines and tabs
        included) into a single space.
     2. Delete every character that is not an ASCII letter, a character in the
        À-ỹ code-point range, a digit, one of ``,._!?;:/()`` or a space.
     3. Collapse spaces again, because step 2 can leave two spaces side by side
        (``"a @ b"`` would otherwise become ``"a  b"``).
     4. Attach ``,`` ``;`` ``:`` and ``.`` to the preceding token and follow each
        with exactly one space. A ``,`` ``:`` or ``.`` that sits directly between
        two digits is left untouched so that numbers and times such as ``3,14``,
        ``1.000`` and ``10:30`` are not split apart.

   Args:
      text (str): Raw text to normalize.

   Returns:
      str: The normalized text, without leading or trailing whitespace.
   """
   text = text.lower().strip()
   text = re.sub(r"\s+", " ", text)  # Must run first: newlines/tabs become spaces, not deletions

   # Keep only allowed characters (letters, numbers, punctuation)
   text = re.sub(r"[^a-zA-ZÀ-Ỹà-ỹ0-9,._!?;:/() ]+", "", text)

   # Removing a character that stood between two spaces leaves a double space
   text = re.sub(r" {2,}", " ", text)

   # Fix punctuation spacing (no space before, exactly one space after).
   # The flag says whether the mark must be preserved when it is directly
   # between two digits (decimal/thousands separators, clock times).
   for mark, keep_between_digits in ((",", True), (";", False), (":", True), (".", True)):
      escaped = re.escape(mark)
      loose = rf"\s*{escaped}\s*"
      if keep_between_digits:
         # First alternative captures the digit-bound mark; only the second
         # alternative (group 1 unset) gets its spacing rewritten.
         pattern = rf"(?<=\d)({escaped})(?=\d)|{loose}"
         text = re.sub(
            pattern,
            lambda m, mark=mark: m.group(0) if m.group(1) else mark + " ",
            text,
         )
      else:
         text = re.sub(loose, mark + " ", text)

   # A mark at the very end leaves one trailing space behind
   return text.strip()

def tokenize_vietnamese_text(text):
   """Word-segment ``text`` with underthesea and return it as one string.

   ``underthesea.word_tokenize`` is called with ``format="text"`` so that it
   returns the segmented text as a string; that string is then split on
   whitespace and re-joined so the tokens are separated by single spaces.

   Args:
      text (str): Text to segment.

   Returns:
      str: The segmented tokens joined by single spaces.
   """
   segmented = underthesea.word_tokenize(text, format="text")
   # split()/join squeezes any whitespace run into exactly one space
   return " ".join(segmented.split())

def preprocess_vietnamese_folder(input_folder, output_folder):
   """Normalize and word-segment every ``.txt`` file found in a folder.

   Each regular file directly inside ``input_folder`` whose name ends with
   ``.txt`` is read as UTF-8, passed through ``normalize_vietnamese_text`` and
   then ``tokenize_vietnamese_text``, and written as UTF-8 under the same file
   name in ``output_folder``. One "Processed: ..." line is printed per file.

   Args:
      input_folder (str): Folder containing the source ``.txt`` files.
      output_folder (str): Destination folder; created (with parents) if missing.

   Returns:
      None
   """
   # exist_ok removes the check-then-create race of exists() + makedirs()
   os.makedirs(output_folder, exist_ok=True)

   # os.listdir returns entries in arbitrary order; sort for a reproducible run/log
   for filename in sorted(os.listdir(input_folder)):
      input_path = os.path.join(input_folder, filename)

      # Skip other extensions, and directories that merely carry a ".txt" name
      # (open() would raise on them)
      if not filename.endswith(".txt") or not os.path.isfile(input_path):
         continue

      output_path = os.path.join(output_folder, filename)

      with open(input_path, "r", encoding="utf-8") as file:
         raw_text = file.read()

      # Normalize text
      cleaned_text = normalize_vietnamese_text(raw_text)

      # Tokenize text
      processed_text = tokenize_vietnamese_text(cleaned_text)

      with open(output_path, "w", encoding="utf-8") as file:
         file.write(processed_text)

      print(f"Processed: {filename} → Saved to {output_path}")

In [35]:
# Source and destination folders for the cleaning stage
input_folder, output_folder = (
   r"D:\ds_b3\NLP\xml_data\under2k",
   r"D:\ds_b3\NLP\xml_data\under2k_cleaned",
)

# Normalize + word-segment every .txt file of input_folder into output_folder
preprocess_vietnamese_folder(input_folder=input_folder, output_folder=output_folder)

Processed: 10015.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10015.txt
Processed: 10047.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10047.txt
Processed: 10106.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10106.txt
Processed: 10135.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10135.txt
Processed: 10185.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10185.txt
Processed: 10186.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10186.txt
Processed: 10317.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10317.txt
Processed: 10418.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10418.txt
Processed: 10461.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10461.txt
Processed: 10564.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10564.txt
Processed: 10573.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10573.txt
Processed: 10576.txt → Saved to D:\ds_b3\NLP\xml_data\under2k_cleaned\10576.txt
Processed: 10599.txt → Saved to D:\ds_b3

In [3]:
from underthesea import sent_tokenize, word_tokenize

In [30]:
def split_into_chunks(text, max_words=500):
   """Group the sentences of ``text`` into chunks of at most ``max_words`` words.

   The text is split with ``sent_tokenize``; each sentence's size is the number
   of tokens ``word_tokenize`` returns for it. Sentences are appended to the
   current chunk until the next one would push the running total above
   ``max_words``; at that point the current chunk is closed and the sentence
   starts a new one. Sentences are never split, so a single sentence longer
   than ``max_words`` still forms a chunk of its own.

   Args:
      text (str): Text to split.
      max_words (int): Word budget per chunk.

   Returns:
      list[str]: Chunks, each made of whole sentences joined by single spaces.
   """
   finished = []
   pending = []
   pending_words = 0

   for sent in sent_tokenize(text):
      n_words = len(word_tokenize(sent))

      # Still within budget: keep growing the open chunk
      if pending_words + n_words <= max_words:
         pending.append(sent)
         pending_words += n_words
         continue

      # Budget exceeded: close the open chunk (if any) and restart with this sentence
      if pending:
         finished.append(" ".join(pending))
      pending = [sent]
      pending_words = n_words

   # Flush whatever is left after the last sentence
   if pending:
      finished.append(" ".join(pending))

   return finished

In [31]:
def chunk_and_token(input_folder, output_folder, max_words=300):
   """Split every ``.txt`` file of a folder into word-limited chunks.

   Each regular file directly inside ``input_folder`` whose name ends with
   ``.txt`` is read as UTF-8 and passed to ``split_into_chunks``. The chunks are
   written as UTF-8 to a file of the same name in ``output_folder``, each one
   preceded by a ``### Chunk N ###`` header line (N starts at 1) and followed
   by a blank line. Progress is shown with a tqdm bar and a summary line is
   printed at the end.

   Args:
      input_folder (str): Folder containing the source ``.txt`` files.
      output_folder (str): Destination folder; created (with parents) if missing.
      max_words (int): Word budget per chunk, forwarded to ``split_into_chunks``.

   Returns:
      None
   """
   # exist_ok removes the check-then-create race of exists() + makedirs()
   os.makedirs(output_folder, exist_ok=True)

   # Sorted for a reproducible order; directories named "*.txt" are excluded
   # because open() would raise on them
   file_list = [
      f
      for f in sorted(os.listdir(input_folder))
      if f.endswith(".txt") and os.path.isfile(os.path.join(input_folder, f))
   ]

   for filename in tqdm(file_list, desc="Processing files"):
      input_path = os.path.join(input_folder, filename)
      output_path = os.path.join(output_folder, filename)

      with open(input_path, "r", encoding="utf-8") as file:
         text = file.read()

      chunks = split_into_chunks(text, max_words)

      with open(output_path, "w", encoding="utf-8") as file:
         for i, chunk in enumerate(chunks):
            file.write(f"### Chunk {i+1} ###\n{chunk}\n\n")

   print(f"Processed {len(file_list)} files and saved to '{output_folder}'.")

In [36]:
# Source and destination folders for the chunking stage
input_folder, output_folder = (
   r"D:\ds_b3\NLP\xml_data\under2k_cleaned",
   r"D:\ds_b3\NLP\xml_data\under2k_chunked",
)
# Chunk every .txt file of input_folder using chunk_and_token's default max_words
chunk_and_token(input_folder=input_folder, output_folder=output_folder)

Processing files: 100%|██████████| 2676/2676 [01:44<00:00, 25.55it/s]

Processed 2676 files and saved to 'D:\ds_b3\NLP\xml_data\under2k_chunked'.



