In [3]:
import logging
import re
import os
from typing import Any, Dict, List, Tuple
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ConfigurationError, InvalidName
from sklearn.model_selection import train_test_split


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def fetch_data_from_mongo(mongo_uri: str, collection: str, db_name: str):
    logging.info(f"Input params: mongo_uri={mongo_uri}, db={db_name}, collection={collection}")
    try:
        client = MongoClient(mongo_uri)
        logging.info("MongoClient created successfully.")

        db = client[db_name]
        logging.info(f"Database '{db_name}' accessed successfully.")
        
        col = db[collection]
        logging.info(f"Collection '{collection}' ready to use.")

        return col

    except ConfigurationError as e:
        logging.error(f"MongoDB Configuration Error: {e}")
    except ConnectionFailure as e:
        logging.error(f"Cannot connect to MongoDB server: {e}")
    except InvalidName as e:
        logging.error(f"Invalid database or collection name: {e}")
    except Exception as e:
        logging.error(f"Unexpected error occurred: {e}")

    logging.warning("Returning None due to previous errors.")
    return None

def convert_to_bio(tasks):
    bio_samples = []
    for task in tasks:
        text = task.get("data", {}).get("text")
        if not text:
            continue
        spans = _get_spans(task)
        tokens_with_tags = _find_bio_in_text(text, spans)
    
        if tokens_with_tags:
            bio_samples.append(tokens_with_tags)
            
    return bio_samples        
    
        

def _get_spans(task):
    annotation = task.get("annotations")[0].get('result')
    spans = []
    for anno in annotation:
        start = anno.get('value').get("start")
        end = anno.get('value').get("end")
        label = anno.get('value').get("labels") 
        if 'O' not in label:
            spans.append((start,end,label))
            
    return spans

def _find_bio_in_text(text, spans):
    tokens_w_tags = []
    for match in re.finditer(r"[^,;\s.]+", text):
        token = match.group()
        start_tok = match.start()
        end_tok = match.end()
        
        tag = 'O'
        
        for start, end, label in spans:
            if start_tok >= start and end_tok <= end+1:
                if start_tok== start:
                    tag = f'B-{label[0]}'
                else: 
                    tag = f'I-{label[0]}'
                continue
        
        tokens_w_tags.append((token, tag))
        
    return tokens_w_tags
        
    

def split_train_valid(
    bio_samples: List[Dict[str, Any]], 
    valid_ratio: float = 0.1,
    seed: int = 42
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    if not bio_samples:
        return [], []
    
    train_samples, valid_samples = train_test_split(
        bio_samples,
        test_size=valid_ratio,
        random_state=seed,
        shuffle=True
    )
    
    return train_samples, valid_samples

In [6]:

col = fetch_data_from_mongo(
    mongo_uri="mongodb+srv://admin:admin123@label-studio.w0lrldm.mongodb.net/?appName=label-studio",
    db_name="label-studio",
    collection="ner-labeled-output",
)

cursor= col.find({})
tasks = cursor.to_list()

bio_samples = convert_to_bio(tasks)
train_set, valid_set = split_train_valid(bio_samples, valid_ratio=0.1)


2025-12-10 21:32:13,926 - INFO - Input params: mongo_uri=mongodb+srv://admin:admin123@label-studio.w0lrldm.mongodb.net/?appName=label-studio, db=label-studio, collection=ner-labeled-output
2025-12-10 21:32:13,993 - INFO - MongoClient created successfully.
2025-12-10 21:32:13,994 - INFO - Database 'label-studio' accessed successfully.
2025-12-10 21:32:13,994 - INFO - Collection 'ner-labeled-output' ready to use.


In [None]:
import json
from pathlib import Path
from typing import List, Tuple, Dict, Any


def convert_corpus_to_jsonl(
    corpus: List[List[Tuple[str, str]]],
    jsonl_path: str,
    labels_path: str | None = None,
    drop_empty: bool = True,
) -> None:
    """
    Convert a corpus dạng:
        corpus = [
            [("Token1", "LABEL1"), ("Token2", "LABEL2"), ...],
            [("TokenA", "LABELX"), ("TokenB", "LABELY"), ...],
            ...
        ]

    → ghi ra JSONL:
        {"tokens": [...], "labels": [...]}

    Option: lưu luôn labels vào file labels_path (vd: labels.json)
    """

    jsonl_file = Path(jsonl_path)
    jsonl_file.parent.mkdir(parents=True, exist_ok=True)

    all_labels: set[str] = set()
    num_samples = 0

    with jsonl_file.open("w", encoding="utf-8") as f_out:
        for sent_idx, sentence in enumerate(corpus):

            if not sentence:
                if drop_empty:
                    continue
                else:
                    # nếu không muốn drop, vẫn ghi sample rỗng
                    entry = {"tokens": [], "labels": []}
                    f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                    num_samples += 1
                    continue

            tokens = []
            labels = []

            for token, label in sentence:
                tokens.append(token)
                labels.append(label)
                all_labels.add(label)

            # sanity check: độ dài phải bằng nhau
            if len(tokens) != len(labels):
                raise ValueError(
                    f"Sentence {sent_idx} có số token != số label "
                    f"({len(tokens)} vs {len(labels)})"
                )

            entry: Dict[str, Any] = {
                "tokens": tokens,
                "labels": labels,
            }

            f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            num_samples += 1

    print(f"✅ Đã ghi {num_samples} câu vào: {jsonl_file}")

    # nếu muốn lưu mapping label
    if labels_path is not None:
        labels_file = Path(labels_path)
        labels_file.parent.mkdir(parents=True, exist_ok=True)

        sorted_labels = sorted(all_labels)
        label2id = {label: idx for idx, label in enumerate(sorted_labels)}

        with labels_file.open("w", encoding="utf-8") as f_lb:
            json.dump(
                {
                    "labels": sorted_labels,
                    "label2id": label2id,
                },
                f_lb,
                ensure_ascii=False,
                indent=2,
            )

        print(f"✅ Đã lưu labels vào: {labels_file}")
        print("   Nhãn:", sorted_labels)


# ==========================
# VÍ DỤ SỬ DỤNG
# ==========================
if __name__ == "__main__":
    # Ví dụ: bạn có 1 corpus như này (thay bằng dữ liệu thật của bạn)
    corpus = [
        [
            ("Sau", "O"),
            ("1", "O"),
            ("thời", "O"),
            ("gian", "O"),
            ("sử", "O"),
            ("dụng", "O"),
            ("mờ", "B-BENEFITS"),
            ("thâm", "I-BENEFITS"),
        ],
        [
            ("Vitamin", "B-NAME"),
            ("C", "I-NAME"),
            ("có", "O"),
            ("tác", "B-BENEFITS"),
            ("dụng", "I-BENEFITS"),
        ],
        # ... thêm nhiều câu nữa
    ]

    # Ghi ra file JSONL + labels
    convert_corpus_to_jsonl(
        corpus=corpus,
        jsonl_path="output/ner_dataset.jsonl",
        labels_path="output/labels.json",
        drop_empty=True,
    )


In [8]:
for item in train_set:
    print(item)

[('Sau', 'O'), ('1', 'O'), ('thời', 'O'), ('gian', 'O'), ('sử', 'O'), ('dụng', 'O'), ('(từ', 'O'), ('8', 'O'), ('–', 'O'), ('12', 'O'), ('tuần)', 'O'), ('làn', 'O'), ('da', 'O'), ('sẽ', 'O'), ('nhận', 'O'), ('được', 'O'), ('các', 'O'), ('thay', 'O'), ('đổi', 'O'), ('tích', 'O'), ('cực', 'O'), ('như', 'O'), ('mờ', 'B-BENEFITS'), ('thâm', 'I-BENEFITS'), ('tàn', 'O'), ('nhang', 'O'), ('tăng', 'B-BENEFITS'), ('độ', 'I-BENEFITS'), ('đàn', 'I-BENEFITS'), ('hồi', 'I-BENEFITS'), ('mờ', 'B-BENEFITS'), ('nếp', 'I-BENEFITS'), ('nhăn', 'I-BENEFITS'), ('cũng', 'O'), ('như', 'O'), ('cải', 'B-BENEFITS'), ('thiện', 'I-BENEFITS'), ('kết', 'I-BENEFITS'), ('cấu', 'I-BENEFITS'), ('da', 'I-BENEFITS'), ('tổng', 'I-BENEFITS'), ('thể', 'I-BENEFITS'), ('làn', 'O'), ('da', 'O'), ('ngày', 'O'), ('càng', 'O'), ('trở', 'O'), ('nên', 'O'), ('rạng', 'O'), ('rỡ', 'O'), ('đều', 'O'), ('màu', 'O'), ('và', 'O'), ('trẻ', 'O'), ('trung', 'O'), ('hơn', 'O')]
[('Trong', 'O'), ('mỹ', 'O'), ('phẩm', 'O'), ('Trehalose', 'B-NAM