In [None]:
!pip install vinorm underthesea

Collecting vinorm
  Downloading vinorm-2.0.7-py3-none-any.whl.metadata (2.1 kB)
Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading vinorm-2.0.7-py3-none-any.whl (40.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[

In [None]:
# Mount Google Drive at /content/drive so the dataset folders used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Walk through every subfolder of the datasets directory and rewrite each subfolder's metadata.csv

import os
import re
import unicodedata

import pandas as pd
from underthesea import word_tokenize
from vinorm import TTSnorm

def text_cleaning(raw_text):
    """Clean one transcript string.

    Steps, in order: Unicode NFC normalization, ``TTSnorm``, lowercasing,
    ``word_tokenize(..., format='text')``, punctuation stripping (only ``.``
    and ``?`` survive, with the whitespace around them removed), and
    whitespace collapsing.

    Args:
        raw_text: The transcript. ``None`` / NaN (what pandas yields for an
            empty cell) and blank strings are treated as "no text"; any other
            non-string scalar is converted with ``str`` first.

    Returns:
        str: The cleaned text, or ``''`` when there is no text to clean.
    """
    # Empty CSV cells reach us as None/NaN; TTSnorm expects a string, so bail out early.
    if raw_text is None or (not isinstance(raw_text, str) and pd.isna(raw_text)):
        return ''

    # Compose decomposed (NFD) characters up front: combining marks are not
    # matched by \w, so the punctuation filter below would otherwise strip
    # Vietnamese tone/diacritic marks from NFD input.
    raw_text = unicodedata.normalize('NFC', str(raw_text))
    if not raw_text.strip():
        return ''

    # Normalize numbers and convert dates to text
    normalized_text = TTSnorm(raw_text)

    # Convert to lowercase
    normalized_text = normalized_text.lower()

    # Tokenize the text
    # NOTE(review): format='text' presumably joins multi-syllable words with '_',
    # and \w below keeps underscores -- confirm that is the desired output.
    normalized_text = word_tokenize(normalized_text, format='text')

    # Re-compose in case either external tool emitted decomposed characters.
    normalized_text = unicodedata.normalize('NFC', normalized_text)

    # Glue '.' and '?' to their neighbours ("a . b" -> "a.b"), then drop every
    # other character that is not a word character or whitespace.
    normalized_text = re.sub(r'\s*\.\s*', '.', normalized_text)
    normalized_text = re.sub(r'\s*\?\s*', '?', normalized_text)
    normalized_text = re.sub(r'[^\w\s?.]', '', normalized_text)  # Remove all punctuation except . and ?

    # Collapse runs of whitespace and trim both ends
    return ' '.join(normalized_text.split())

def process_all_metadata_files(folder_path):
    """Clean the transcripts in every ``<folder_path>/<subfolder>/metadata.csv``.

    Each file is read as a header-less, ``|``-separated table with the columns
    ``audio_filename`` and ``raw_text``; ``raw_text`` is passed through
    ``text_cleaning`` and the file is overwritten in place.

    Args:
        folder_path: Directory whose immediate subdirectories may each contain
            a ``metadata.csv``. Entries without one are skipped.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Sorted so the processing order (and the printed log) is the same on every run.
    for subfolder in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, subfolder, 'metadata.csv')

        # False both for plain files directly under folder_path and for
        # subfolders that have no metadata.csv.
        if not os.path.isfile(file_path):
            continue

        print(f"Processing file: {file_path}")

        # Read everything as text and disable NA sniffing: with the defaults,
        # an empty transcript or a literal "NA"/"null"/"None" becomes float NaN
        # and cannot be cleaned as text.
        df = pd.read_csv(
            file_path,
            header=None,
            sep='|',
            names=['audio_filename', 'raw_text'],
            dtype=str,
            keep_default_na=False,
            encoding='utf-8',
        )
        # Guard against rows with no second field at all.
        df["raw_text"] = df["raw_text"].fillna("")

        # Only run the cleaner on rows that actually contain text;
        # blank transcripts are written back unchanged.
        has_text = df["raw_text"].str.strip() != ""
        df.loc[has_text, "raw_text"] = df.loc[has_text, "raw_text"].apply(text_cleaning)

        # Save the cleaned file, overwriting the original
        df.to_csv(file_path, header=False, sep='|', index=False, encoding='utf-8')
        print(f"Processed and saved: {file_path}")

# Example usage
# NOTE: every metadata.csv found one level below this folder is overwritten in place by the call below.
folder_path = r'/content/drive/MyDrive/Bản sao datasets'  # Main directory containing subdirectories
process_all_metadata_files(folder_path)
print("Done")


Processing file: /content/drive/MyDrive/Bản sao datasets/TẤT TẦN TẬT về ĐẦU TƯ CƠ BẢN/metadata.csv
Processed and saved: /content/drive/MyDrive/Bản sao datasets/TẤT TẦN TẬT về ĐẦU TƯ CƠ BẢN/metadata.csv
Processing file: /content/drive/MyDrive/Bản sao datasets/Cách quản lý tiền bạc - Cơ bản/metadata.csv
Processed and saved: /content/drive/MyDrive/Bản sao datasets/Cách quản lý tiền bạc - Cơ bản/metadata.csv
Done
