In [23]:
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torchcrf import CRF

import sys, os

# Resolve the directory three levels above the current working directory and
# make it importable, so that project-local packages can be found.
_three_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
project_root = os.path.abspath(_three_levels_up)
if sys.path.count(project_root) == 0:
    # Appended (not prepended): already-importable modules keep priority.
    sys.path.append(project_root)

from shared_functions.gg_sheet_drive import *

In [51]:
import re

def process_ner_file(input_path):
    """Read a column-formatted NER file and return aligned sentence/label strings.

    Input format, as consumed by this function: one token per line with
    whitespace-separated columns; the first column is the token and the last
    column is its label. A blank (whitespace-only) line ends a sentence. Lines
    with fewer than two columns are ignored.

    Processing steps, applied per sentence on token/label lists:
      1. Tokens starting with a lowercase Vietnamese vowel are glued onto the
         previous token (the previous token's label is kept).
      2. Tokens separated only by slashes are glued together
         (``14 / 2023 / QH15`` -> ``14/2023/QH15``), keeping the first label.
      3. Leading/trailing pure-punctuation tokens are dropped.
    Sentences left empty by these steps are discarded.

    Then, across sentences (in file order):
      * a sentence shorter than 5 tokens is merged with the sentence after it;
      * otherwise, a sentence whose first letter is lowercase is merged into
        the previously emitted sentence.

    All merging is done on token lists, so every returned pair has exactly as
    many space-separated tokens as space-separated labels.

    NOTE(review): marker lines such as ``-DOCSTART-`` get no special handling
    here; if they have two or more columns they are treated as ordinary tokens.

    Args:
        input_path: Path of the UTF-8 encoded input file.

    Returns:
        list[tuple[str, str]]: ``(sentence, labels)`` pairs, both
        space-separated strings.

    Raises:
        OSError: If ``input_path`` cannot be opened.
    """
    vowel_chars = "aăâeêioôơuưyáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ"
    edge_punct_re = re.compile(r"[\-.,;:!?]+")
    min_sentence_tokens = 5  # sentences shorter than this are merged forward

    # --- Token-level preprocessing functions ---
    def merge_fragmented_vietnamese_tokens(tokens, labels):
        """Merge tokens starting with lowercase vowel into previous token; keep labels aligned."""
        out_tokens, out_labels = [], []
        for tok, lab in zip(tokens, labels):
            if out_tokens and tok and tok[0].lower() in vowel_chars and not tok[0].isupper():
                out_tokens[-1] += tok  # merge text, keep the previous label
            else:
                out_tokens.append(tok)
                out_labels.append(lab)
        return out_tokens, out_labels

    def normalize_slash_tokens(tokens, labels):
        """Glue tokens that are separated only by a slash; keep the first label.

        Two neighbouring tokens are merged when the left one ends with "/" or
        the right one starts with "/". On whitespace-free tokens this is
        exactly what ``re.sub(r"\\s*/\\s*", "/", " ".join(tokens))`` does to the
        text, but performed here so the label list shrinks together with the
        token list.
        """
        out_tokens, out_labels = [], []
        for tok, lab in zip(tokens, labels):
            if out_tokens and (out_tokens[-1].endswith("/") or tok.startswith("/")):
                out_tokens[-1] += tok
            else:
                out_tokens.append(tok)
                out_labels.append(lab)
        return out_tokens, out_labels

    def remove_edge_punctuation(tokens, labels):
        """Return copies without leading/trailing pure punctuation tokens."""
        start, end = 0, len(tokens)
        while start < end and edge_punct_re.fullmatch(tokens[start]):
            start += 1
        while end > start and edge_punct_re.fullmatch(tokens[end - 1]):
            end -= 1
        return tokens[start:end], labels[start:end]

    def starts_with_lowercase_word(s):
        """Check if first alphabetic character is lowercase."""
        m = re.search(r"[A-Za-zÀ-ỹ]", s)
        return bool(m and m.group(0).islower())

    def preprocess_sentence(tokens, labels):
        """Run the three token-level steps in order on one raw sentence."""
        toks, labs = merge_fragmented_vietnamese_tokens(tokens, labels)
        toks, labs = normalize_slash_tokens(toks, labs)
        return remove_edge_punctuation(toks, labs)

    # --- Read raw sentences (token list, label list) ---
    raw_sentences = []
    cur_tokens, cur_labels = [], []
    with open(input_path, encoding="utf-8") as f:
        for raw in f:
            parts = raw.split()
            if not parts:
                # Blank line: sentence boundary.
                if cur_tokens:
                    raw_sentences.append((cur_tokens, cur_labels))
                cur_tokens, cur_labels = [], []
                continue
            if len(parts) < 2:
                continue  # no label column on this line
            cur_tokens.append(parts[0])
            cur_labels.append(parts[-1])
    if cur_tokens:
        # File did not end with a blank line: flush the last sentence.
        raw_sentences.append((cur_tokens, cur_labels))

    # --- Token-level preprocessing; drop sentences that end up empty ---
    sentences = []
    for tokens, labels in raw_sentences:
        toks, labs = preprocess_sentence(tokens, labels)
        if toks:
            sentences.append((toks, labs))

    # --- Sentence-level merging (still on lists so labels stay aligned) ---
    merged = []
    i = 0
    while i < len(sentences):
        toks, labs = sentences[i]

        # Merge short sentences with the next one (takes priority over the
        # lowercase rule below).
        if len(toks) < min_sentence_tokens and i + 1 < len(sentences):
            next_toks, next_labs = sentences[i + 1]
            # Re-run slash merging: the seam may put a "/" next to a token.
            merged.append(normalize_slash_tokens(toks + next_toks, labs + next_labs))
            i += 2
            continue

        # Merge a lowercase-starting sentence into the previous output.
        if merged and starts_with_lowercase_word(" ".join(toks)):
            prev_toks, prev_labs = merged.pop()
            merged.append(normalize_slash_tokens(prev_toks + toks, prev_labs + labs))
            i += 1
            continue

        merged.append((toks, labs))
        i += 1

    return [(" ".join(toks), " ".join(labs)) for toks, labs in merged]


In [57]:
# Convert the CoNLL-style test file into (sentence, labels) string pairs.
results = process_ner_file('D:/Study/Education/Projects/Group_Project/rag_model/model/NER/artifact/test.conll')

# Build the frame in a single call instead of concatenating one-row frames in
# a loop: that pattern copies the whole frame on every iteration (quadratic)
# and concatenating onto an empty frame is deprecated in recent pandas.
df = pd.DataFrame(results, columns=['input_text', 'label'])

In [58]:
# Display the frame to eyeball the sentence / label-string pairs.
df

Unnamed: 0,input_text,label
0,-DOCSTART- CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM,O B-LOC I-LOC I-LOC I-LOC I-LOC I-LOC I-LOC I-LOC
1,LUẬT SỐ 14/2023/QH15 BAN HÀNH NGÀY 15 THÁNG 6 ...,B-TYP O I-DOCID O O B-DAT I-DAT I-DAT I-DAT I-...
2,CHỦ TỊCH NƯỚC NGUYỄN XUÂN PHÚC KÝ,B-PER I-PER I-PER B-PER I-PER I-PER O
3,ĐIỀU 1 PHẠM VI ĐIỀU CHỈNH SỬA LUẬT GIÁO DỤC,O O O O O O O B-TIT I-TIT I-TIT
4,CƠ QUAN BỘ GIÁO DỤC VÀ ĐÀO TẠO THỰC HIỆN,B-DEP I-DEP I-DEP I-DEP I-DEP I-DEP I-DEP I-DE...
5,ĐIỀU 2 NGÀY BAN HÀNH LUẬT NÀY LÀ 01/01/2024,O O B-DAT O O B-TYP O O B-DAT
6,"Luật số : 76/2025/QH15 Hà Nội , ngày 17 tháng ...",O O O B-DOCID B-LOC I-LOC O B-DAT I-DAT I-DAT ...
7,"LUẬT SỬA ĐỔI , BỔ SUNG MỘT SỐ ĐIỀU CỦA LUẬT DO...",B-TYP O O O O O O O O O B-TIT I-TIT I-TIT
8,Căn cứ Hiến pháp nước Cộng hòa xã hội chủ nghĩ...,O O B-TIT I-TIT I-TIT I-TIT I-TIT I-TIT I-TIT ...
9,"Quốc hội ban hành Luật sửa đổi , bổ sung một s...",B-DEP I-DEP O O B-TIT I-TIT I-TIT I-TIT I-TIT ...


In [54]:
import json

# File holding the label -> index mapping used to encode the label strings.
label2idx_path = 'D:/Study/Education/Projects/Group_Project/rag_model/model/NER/artifact/label2idx.json'

# Read the whole file and parse it; label_dict is whatever JSON object it holds.
with open(label2idx_path, mode='r', encoding='utf-8') as f:
    label_dict = json.loads(f.read())


In [59]:
def _labels_to_indices(label_str):
    """Map each space-separated label to its ``label_dict`` value, re-joined with spaces.

    A label missing from ``label_dict`` raises ``KeyError``.
    """
    indices = [str(label_dict[name]) for name in label_str.split()]
    return ' '.join(indices)


# Adds the encoded labels as a new column on the existing frame.
df['label_idx'] = df['label'].apply(_labels_to_indices)

In [60]:
# Push the frame to the 'testing_ner' tab of a Google Sheet.
# NOTE(review): write_df_to_gs is presumably provided by the star import from
# shared_functions.gg_sheet_drive; which spreadsheet it targets and how it
# authenticates is not visible here — confirm before re-running.
write_df_to_gs(df, 'testing_ner')

'New tab created and DataFrame written to Google Sheet: testing_ner'