In [None]:
!pip install underthesea==6.0.0


Collecting underthesea==6.0.0
  Downloading underthesea-6.0.0-py3-none-any.whl.metadata (10 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea==6.0.0)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==0.0.5a2 (from underthesea==6.0.0)
  Downloading underthesea_core-0.0.5_alpha.2-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.1 kB)
Downloading underthesea-6.0.0-py3-none-any.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-0.0.5_alpha.2-cp311-cp311-manylinux2010_x86_64.whl (599 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.3/599.3 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/

In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata
import os
import csv
from underthesea import word_tokenize


In [None]:
# --- VIETNAMESE STOPWORDS ---
# Function words, pronouns, question words, prepositions and sentence-final
# particles. Stored as a set for O(1) membership tests.
VIETNAMESE_STOP_WORDS = {
    # conjunctions / auxiliaries / common function words
    "và", "hoặc", "là", "có", "của", "trong", "theo", "này", "đây", "với",
    "cho", "mà", "được", "cùng", "bởi", "từ", "nếu", "cũng", "sẽ", "khi",
    "không", "để", "đi", "vì", "mới", "cả", "hơn", "nhiều", "ít", "thì",
    "như", "các", "vào", "bằng", "ra", "lên", "xuống", "qua", "lại",
    # pronouns
    "anh", "em", "chị", "bạn", "tôi", "mình", "nó", "họ",
    "chúng ta", "chúng tôi", "chúng nó",
    # question words
    "ai", "gì", "đâu", "nào", "sao", "bao nhiêu", "lúc nào", "tại sao",
    # prepositions / determiners / numerals
    "ở", "tại", "trên", "dưới", "trước", "sau", "ấy", "những", "một",
    "hai", "ba", "vài", "rằng",
    # particles and interjections
    "ạ", "à", "ừ", "dạ", "vâng", "ơi", "nhỉ", "nhé", "nha", "đó", "kia",
}

# --- TEENCODE DICTIONARY ---
# Maps informal chat abbreviations to their standard spelling. Key order is
# kept stable because it determines the order of alternatives in the regex.
TEENCODE_MAP = {
    # negation
    "k": "không",
    "ko": "không",
    "khum": "không",
    "hok": "không",
    "hem": "không",
    "hong": "không",
    # "what" / "so" / "into"
    "j": "gì",
    "g": "gì",
    "z": "gì",
    "zậy": "vậy",
    "zay": "vậy",
    "v": "vậy",
    "zô": "vào",
    "zo": "vào",
    # "already" / "too" / "love"
    "r": "rồi",
    "roi": "rồi",
    "wá": "quá",
    "wa": "quá",
    "iu": "yêu",
    "luv": "yêu",
    # thanks
    "thks": "cảm ơn",
    "tks": "cảm ơn",
    "thanks": "cảm ơn",
    "ty": "cảm ơn",
    # agreement
    "ok": "được",
    "oke": "được",
    "oki": "được",
    "okie": "được",
    "dc": "được",
    "đc": "được",
    # intensifiers / "with" / "everyone"
    "vl": "rất",
    "vkl": "rất",
    "vcl": "rất",
    "vch": "rất",
    "vs": "với",
    "mn": "mọi người",
    # "know" / "goodbye"
    "bik": "biết",
    "bjt": "biết",
    "bit": "biết",
    "bb": "tạm biệt",
    "bye": "tạm biệt",
    # "hour" / "person" / "how"
    "h": "giờ",
    "hjo": "giờ",
    "ng": "người",
    "nguoi": "người",
    "ntn": "như thế nào",
    # pronouns / messaging / misc
    "a": "anh",
    "e": "em",
    "ib": "nhắn tin",
    "inbox": "nhắn tin",
    "s": "sao",
    "đk": "được không",
    "dk": "được không",
    "t": "tôi",
    "b": "bạn",
    "m": "mày",
}

# One case-insensitive alternation over every (escaped) teencode key, anchored
# on word boundaries so only whole tokens are matched. Compiled once up front.
teencode_pattern = re.compile(
    r'\b(%s)\b' % '|'.join(map(re.escape, TEENCODE_MAP)),
    re.IGNORECASE,
)

print("Stopwords and teencode dictionaries are loaded.")

Stopwords and teencode dictionaries are loaded.


In [None]:
def normalize_teencode(text: str) -> str:
    """Replace every teencode token in ``text`` with its standard spelling.

    Tokens are located with the module-level ``teencode_pattern`` (whole-word,
    case-insensitive) and looked up in ``TEENCODE_MAP`` by their lowercase form.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with each matched token replaced by its ``TEENCODE_MAP`` value.
    """
    def _expand(match):
        token = match.group(0)
        # Case-insensitive Unicode matching also accepts a few look-alike
        # characters (e.g. 'ſ' for 's', 'ı' for 'i') whose .lower() is not a
        # dictionary key. Leave such tokens untouched instead of raising
        # KeyError and aborting the whole batch.
        return TEENCODE_MAP.get(token.lower(), token)

    return teencode_pattern.sub(_expand, text)

def has_vietnamese_diacritics(text: str) -> bool:
    """Return True if ``text`` contains at least one Vietnamese accented letter.

    The check is case-insensitive and covers the letter ``đ`` as well as every
    vowel carrying a tone or vowel-quality mark.

    Args:
        text: The string to inspect.

    Returns:
        True when at least one such letter is present, False otherwise.
    """
    # The character class lists precomposed letters only, so text stored in
    # decomposed form (base letter + combining marks) must be composed first;
    # otherwise genuinely accented sentences would be reported as unaccented.
    composed = unicodedata.normalize("NFC", text)
    return re.search(r'[áàảãạăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ]', composed, re.IGNORECASE) is not None

def preprocess_pipeline(text: str) -> str:
    """Clean a raw sentence and return it as space-separated tokens.

    Steps, in order:
      1. lowercase and compose to Unicode NFC;
      2. strip HTML tags and URLs;
      3. expand teencode abbreviations via ``normalize_teencode``;
      4. replace every character that is not a Latin/Vietnamese letter, digit,
         whitespace or basic punctuation (``.,?!:;'"``) with a space;
      5. collapse whitespace and trim leading/trailing punctuation;
      6. tokenize with ``word_tokenize`` and re-join the tokens with spaces.

    Args:
        text: The raw sentence. Non-string values are accepted and yield "".

    Returns:
        The cleaned, tokenized sentence, or "" when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase, then compose. NFC must run before teencode expansion: in
    # decomposed text a letter such as "à" is "a" followed by a combining mark,
    # and the word-boundary regex would mistake that bare "a" for a teencode.
    s = unicodedata.normalize("NFC", text.lower())

    # Remove HTML tags and URLs before teencode expansion. Some expansions
    # contain spaces (e.g. "mn"), which would otherwise split a URL in two and
    # leave its tail behind after the URL regex runs.
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'http\S+|www\.\S+', ' ', s)  # dot escaped: match a literal "www."

    # Expand chat abbreviations to their standard spelling
    s = normalize_teencode(s)

    # Replace disallowed characters with a space, keeping letters, digits,
    # whitespace and standard punctuation
    vietnamese_chars = 'áàảãạăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ'
    disallowed = r'[^a-z0-9' + vietnamese_chars + r'\s.,?!:;\'"]'
    s = re.sub(disallowed, ' ', s)

    # Collapse runs of whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    # Trim leading/trailing punctuation and whitespace
    punctuation_to_trim = r'.,?!:;\'"\s'
    s = re.sub(r'^[' + punctuation_to_trim + r']+|[' + punctuation_to_trim + r']+$', '', s)

    # Nothing left to tokenize
    if not s:
        return ""

    # Tokenize, then join the tokens back into a single space-separated string
    tokens = word_tokenize(s)
    return " ".join(tokens)




In [None]:
# --- CONFIGURATION ---
INPUT_EXCEL_FILE = '/content/data_preprocessed4.xlsx' # Path to your input Excel file
OUTPUT_EXCEL_FILE = '/content/data_preprocessed5.xlsx' # Path for the output Excel file
TEXT_COLUMN = 'Sentence' # Name of the column containing text
LABEL_COLUMN = 'Emotion' # Name of the column containing labels

# --- PROCESSING SCRIPT ---
# Reads the input workbook, filters and preprocesses the text column,
# canonicalizes the labels, and writes a two-column workbook (text, label).
try:
    print(f"Reading Excel file from: {INPUT_EXCEL_FILE}")
    df = pd.read_excel(INPUT_EXCEL_FILE, engine='openpyxl')
    print(f"File read successfully. Initial rows: {len(df)}")

    # --- STAGE 1: ROW FILTERING ---
    # Drop rows missing text or label, drop repeated sentences (first one
    # wins), then force the text column to str. Reassignment is used instead of
    # inplace=True so each step yields a fresh frame.
    df = (
        df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
          .drop_duplicates(subset=[TEXT_COLUMN], keep='first')
          .assign(**{TEXT_COLUMN: lambda frame: frame[TEXT_COLUMN].astype(str)})
    )

    # NOTE: initial_rows is measured after the null/duplicate filtering above,
    # so rows_removed counts only sentences rejected by the diacritics check.
    initial_rows = len(df)
    df = df[df[TEXT_COLUMN].apply(has_vietnamese_diacritics)].copy()
    rows_removed = initial_rows - len(df)
    print(f"Data cleaned. Removed {rows_removed} sentences without diacritics. Remaining rows: {len(df)}")

    # --- STAGE 2: TEXT PREPROCESSING ---
    print("\nApplying preprocessing pipeline...")
    df['processed_text'] = df[TEXT_COLUMN].apply(preprocess_pipeline)

    # --- STAGE 3: FINAL CLEANUP AND EXPORT ---
    # Drop rows whose text became empty after preprocessing. Only the
    # processed column is inspected; other columns are left untouched.
    df = df[df['processed_text'].ne('')].copy()
    print(f"Rows after final cleaning: {len(df)}")

    # Canonical label spellings, including their unaccented variants
    label_map = {
        "buồn bã": "buồn bã", "buon ba": "buồn bã",
        "tức giận": "tức giận", "tuc gian": "tức giận",
        "vui vẻ": "vui vẻ", "vui ve": "vui vẻ",
        "sợ hãi": "sợ hãi", "so hai": "sợ hãi",
        "ngạc nhiên": "ngạc nhiên", "ngac nhien": "ngạc nhiên",
        "ghê tởm": "ghê tởm", "ghe tom": "ghê tởm",
    }
    # Trim, lowercase and NFC-compose labels before the lookup so that
    # decomposed spellings still match the precomposed keys above.
    df[LABEL_COLUMN] = (
        df[LABEL_COLUMN]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.normalize("NFC")
        .replace(label_map)
    )

    # Export the processed text under the original text column name
    final_df = df[['processed_text', LABEL_COLUMN]].rename(
        columns={'processed_text': TEXT_COLUMN}
    )

    final_df.to_excel(
        OUTPUT_EXCEL_FILE,
        index=False
    )

    print("\n--- PREPROCESSING COMPLETE ---")
    print(f"Processed data saved to Excel file: {OUTPUT_EXCEL_FILE}")
    print(f"Final dataset contains {len(final_df)} rows.")

except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_EXCEL_FILE}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    # Re-raise so the traceback is shown and a "Run All" stops here instead of
    # continuing with a missing or stale output file.
    raise

Reading Excel file from: /content/data_preprocessed4.xlsx
File read successfully. Initial rows: 36965
Data cleaned. Removed 0 sentences without diacritics. Remaining rows: 33616

Applying preprocessing pipeline...
Rows after final cleaning: 33616

--- PREPROCESSING COMPLETE ---
Processed data saved to Excel file: /content/data_preprocessed5.xlsx
Final dataset contains 33616 rows.
