In [1]:
from datasets import load_dataset
data = load_dataset("phuong123/icd_icf_en_vi")

In [2]:
import sqlite3

DB_PATH = "dictionary.db"

conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

# === T·∫°o b·∫£ng ===
cur.execute("""
CREATE TABLE IF NOT EXISTS dict (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    english TEXT,
    vietnamese TEXT
)
""")

cur.execute("CREATE INDEX IF NOT EXISTS idx_en ON dict(english)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_vi ON dict(vietnamese)")

# === N·∫°p d·ªØ li·ªáu an to√†n ===
insert_query = "INSERT INTO dict (english, vietnamese) VALUES (?, ?)"
unique_entries = set()

for source_name, dataset in data.items():
    if source_name != "adapt":  # b·ªè qua n·∫øu kh√¥ng c·∫ßn
        count_before = len(unique_entries)
        entries_to_insert = []
        
        for item in dataset:
            en = item.get("en")
            vi = item.get("vi")
            
            # B·ªè qua n·∫øu thi·∫øu
            if not en or not vi:
                continue
            
            # Chu·∫©n h√≥a
            en = en.strip().lower() if isinstance(en, str) else ""
            vi = vi.strip().lower() if isinstance(vi, str) else ""
            
            key = (en, vi)
            if key in unique_entries:
                continue
            unique_entries.add(key)
            entries_to_insert.append((en, vi))
        
        # Insert nhi·ªÅu d√≤ng c√πng l√∫c
        cur.executemany(insert_query, entries_to_insert)
        count_after = len(unique_entries)
        print(f"‚Üí N·∫°p {count_after - count_before} d√≤ng t·ª´ ngu·ªìn {source_name}")

conn.commit()
conn.close()
print(f"‚úÖ Database created successfully at {DB_PATH}")
print(f"üìä T·ªïng s·ªë d√≤ng unique: {len(unique_entries)}")


‚Üí N·∫°p 21555 d√≤ng t·ª´ ngu·ªìn icd_icf
‚Üí N·∫°p 16808 d√≤ng t·ª´ ngu·ªìn dictionary
‚úÖ Database created successfully at dictionary.db
üìä T·ªïng s·ªë d√≤ng unique: 38363


In [25]:
import sqlite3
import ahocorasick

DB_PATH = "dictionary.db"

# === H√†m kh·ªüi t·∫°o k·∫øt n·ªëi ===
def get_connection():
    return sqlite3.connect(DB_PATH)

# === Tra 1 t·ª´ (Anh ho·∫∑c Vi·ªát) ===
def translate(word):
    conn = get_connection()
    word = word.strip().lower()
    rows = conn.execute("""
        SELECT english, vietnamese
        FROM dict
        WHERE english = ? OR vietnamese = ?
        LIMIT 20
    """, (word, word)).fetchall()
    conn.close()
    return [{"en": r[0], "vi": r[1]} for r in rows]

# === T·∫£i to√†n b·ªô t·ª´ ƒëi·ªÉn v√†o b·ªô nh·ªõ (ƒë·ªÉ tra nhanh trong c√¢u) ===
def load_all_terms():
    conn = get_connection()
    rows = conn.execute("SELECT english, vietnamese FROM dict").fetchall()
    conn.close()
    return [(en, vi) for en, vi in rows]

# === X√¢y b·ªô t√¨m ki·∫øm Aho‚ÄìCorasick ===
def build_automaton(term_pairs):
    A = ahocorasick.Automaton()
    for en, vi in term_pairs:
        if en:  # ch·ªâ th√™m n·∫øu kh√¥ng r·ªóng
            A.add_word(en, (en, vi))
    A.make_automaton()
    return A

# === T√¨m c√°c thu·∫≠t ng·ªØ trong c√¢u ===
def find_terms_in_sentence(sentence, automaton):
    sentence = sentence.lower()
    results = []
    seen = set()
    for end_idx, (en, vi) in automaton.iter(sentence):
        key = (en, vi)
        if key not in seen and key[0] in sentence:
            seen.add(key)
            results.append({"en": en, "vi": vi})
    return results


In [26]:

# 1Ô∏è‚É£ Tra 1 t·ª´
print(translate("buff"))
# ‚Üí [{'english': 'heart', 'vietnamese': 'tim', 'source': 'dictionary'}]

# 2Ô∏è‚É£ T√¨m thu·∫≠t ng·ªØ trong c√¢u
terms = load_all_terms()
A = build_automaton(terms)

sentence = "After precipating, diclofenac sodium was eluted by HPLC using a reverse-phase column (C8, 150 mm x 4.6 mm, 5 Œºm), mobile phase contains methanol / phosphat buffer (70: 30 v / v, pH 2.5), at a flow rate of 1.0 ml / min, and a wavelength detection at 275 nm."
print(find_terms_in_sentence(sentence, A))

[{'en': 'buff', 'vi': 'da tr√¢u, da b√≤'}]
[{'en': 'sodium', 'vi': 'ch·∫•t c∆° b·∫£n c√≥ trong mu·ªëi'}, {'en': 'buff', 'vi': 'da tr√¢u, da b√≤'}, {'en': 'buffer', 'vi': 'ch·∫•t c√¢n b·∫±ng t√¨nh tr·∫°ng toan-ki·ªÅm'}, {'en': 'ion', 'vi': 'ch·∫•t nguy√™n t·ª≠ ion'}]


# RAG_loop_test

In [19]:
from datasets import load_dataset, Dataset

ds = load_dataset("thviet79/MT_Medical")

def split_parallel(dataset):
    n = len(dataset) // 2
    en_texts = dataset["text"][:n]
    vi_texts = dataset["text"][n:]
    return Dataset.from_dict({"en": en_texts, "vi": vi_texts})

test_split = split_parallel(ds["test"])

In [20]:
from tqdm import tqdm

output = []
terms = load_all_terms()
A = build_automaton(terms)
for item in tqdm(test_split):
    output.append({"en": item["en"], 
                   "dictionary" : find_terms_in_sentence(item["en"], A), 
                   "vi": item["vi"]}
                   )
    

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3000/3000 [00:00<00:00, 30302.75it/s]


In [21]:
import json
with open("RAG_test_output.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

In [27]:
def build_prompt(batch_en, dictory):
    # Gh√©p t·∫•t c·∫£ dictionary trong batch th√†nh text
    term_lines = []
    seen = set()
    for term_list in dictory:
        if term_list:  
            for term in term_list:
                en = term.get("en", "").strip()
                vi = term.get("vi", "").strip()
                if en and vi and (en, vi) not in seen:
                    term_lines.append(f"- {en} ‚Üí {vi}")
                    seen.add((en, vi))
    term_text = "\n".join(term_lines[:100])
    
    # Gh√©p c√°c c√¢u c·∫ßn d·ªãch
    sentences_text = "\n".join([f"{s}" for s in batch_en])
    
    prompt = f"""
        B·∫°n l√† chuy√™n gia d·ªãch thu·∫≠t y h·ªçc Anh‚ÄìVi·ªát, c√≥ kinh nghi·ªám bi√™n t·∫≠p b√†i b√°o khoa h·ªçc v√† b√°o c√°o nghi√™n c·ª©u trong c√°c lƒ©nh v·ª±c Y h·ªçc, D∆∞·ª£c h·ªçc, v√† S·ª©c kh·ªèe c·ªông ƒë·ªìng.

        Nhi·ªám v·ª•:
        - D·ªãch c√°c c√¢u ti·∫øng Anh sang ti·∫øng Vi·ªát h·ªçc thu·∫≠t, ƒë·∫£m b·∫£o **ƒë·∫ßy ƒë·ªß th√¥ng tin**, kh√¥ng b·ªè s√≥t hay r√∫t g·ªçn.
        - M·ªói m·ªánh ƒë·ªÅ, c·ª•m danh t·ª´, c·ª•m ƒë·ªông t·ª´ trong ti·∫øng Anh ph·∫£i c√≥ ph·∫ßn t∆∞∆°ng ·ª©ng trong b·∫£n d·ªãch.
        - C√≥ th·ªÉ ƒëi·ªÅu ch·ªânh tr·∫≠t t·ª± c√¢u ƒë·ªÉ t·ª± nhi√™n h∆°n trong ti·∫øng Vi·ªát, **nh∆∞ng kh√¥ng l√†m thay ƒë·ªïi quan h·ªá ng·ªØ nghƒ©a**.
        - ∆Øu ti√™n s·ª≠ d·ª•ng ƒë√∫ng c√°c thu·∫≠t ng·ªØ y khoa trong danh s√°ch sau:
        {term_text}

        Y√™u c·∫ßu:
        - Gi·ªØ nguy√™n √Ω nghƒ©a, c·∫•u tr√∫c ng·ªØ ph√°p t∆∞∆°ng ·ª©ng gi·ªØa hai ng√¥n ng·ªØ.
        - Di·ªÖn ƒë·∫°t tr√¥i ch·∫£y, mang phong c√°ch h·ªçc thu·∫≠t ti·∫øng Vi·ªát.
        - Thu·∫≠t ng·ªØ y h·ªçc ph·∫£i ch√≠nh x√°c v√† th·ªëng nh·∫•t.
        - Ch·ªâ tr·∫£ v·ªÅ b·∫£n d·ªãch ti·∫øng Vi·ªát ho√†n ch·ªânh, kh√¥ng k√®m ch√∫ th√≠ch ho·∫∑c ph√¢n t√≠ch.

        V√≠ d·ª•:
        English: "Mice in each group were assessed for weight weekly and the levels of Total Cholesterol (CT), HDL-Cholesterol (HDL-C), LDL-Cholesterol (LDL-C) and Triglyceride (TG) were recorded at initial time (after obesity was induced for 8 weeks) and 1 hour after taking the extracted mixtures on the last day."
        Vietnamese: "Tr·ªçng l∆∞·ª£ng chu·ªôt ·ªü m·ªói nh√≥m ƒë∆∞·ª£c ƒë√°nh gi√° h√†ng tu·∫ßn v√† c√°c ch·ªâ s·ªë Cholesterol to√†n ph·∫ßn (CT), HDL-Cholesterol (HDL-C), LDL-Cholesterol (LDL-C) v√† Triglycerid (TG) ƒë∆∞·ª£c ghi nh·∫≠n t·∫°i th·ªùi ƒëi·ªÉm ban ƒë·∫ßu (sau 8 tu·∫ßn g√¢y b√©o ph√¨) v√† 1 gi·ªù sau khi u·ªëng h·ªón h·ª£p chi·∫øt xu·∫•t v√†o ng√†y cu·ªëi c√πng."

        B√¢y gi·ªù, h√£y d·ªãch c√°c c√¢u sau:
        {sentences_text}
    """

    return prompt.strip()

In [29]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import google.generativeai as genai
import time
import json


with open("RAG_test_output_dict.json", "r", encoding="utf-8") as f:
    data_test = json.load(f)

genai.configure(api_key="AIzaSyB5zDpFEqzEQmBGK3axkLSqUKbNiUxzUWQ")
MODEL_NAME = "gemini-2.5-flash-lite"
model = genai.GenerativeModel(MODEL_NAME)

output = []
batch_size = 8


for i in tqdm(range(0, len(data_test), batch_size)):
    translations = data_test[i : min(i + batch_size, len(data_test))]
    
    batch_en = [t["en"] for t in translations]
    batch_vi_label = [t["vi"] for t in translations]
    dictory = [t["dictionary"] for t in translations]
    
    prompt = build_prompt(batch_en, dictory)
    #print(f"--- Prompt for batch {i} ---")
    #print(prompt)
    
    # G·ªçi model
    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "temperature": 0,
                "max_output_tokens": 2048
            }
        )
        
        if response and response.text:
            vi_texts = response.text.strip()
            # T√°ch v√† l√†m s·∫°ch k·∫øt qu·∫£
            translations = []
            for line in vi_texts.split("\n"):
                line = line.strip()
                # Lo·∫°i b·ªè s·ªë th·ª© t·ª± n·∫øu c√≥
                if line and line[0].isdigit() and '.' in line[:3]:
                    line = line.split('.', 1)[1].strip()
                if line:
                    translations.append(line)
            
            # Ki·ªÉm tra v√† c√¢n b·∫±ng s·ªë l∆∞·ª£ng
            if len(translations) < len(batch_en):
                translations.extend(["[MISSING]"] * (len(batch_en) - len(translations)))
            elif len(translations) > len(batch_en):
                translations = translations[:len(batch_en)]
                
        else:
            translations = ["[EMPTY_RESPONSE]"] * len(batch_en)
            
    except Exception as e:
        print(f"‚ö†Ô∏è L·ªói khi d·ªãch batch {i}: {e}")
        translations = ["[ERROR]"] * len(batch_en)
    
    # L∆∞u k·∫øt qu·∫£
    for k in range(len(batch_en)):
        output.append({
            "en": batch_en[k],
            "vi_pred": translations[k],
            "vi_label": batch_vi_label[k]
        })
    
    # Ngh·ªâ gi·ªØa c√°c batch
    time.sleep(3)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 375/375 [35:06<00:00,  5.62s/it]


In [30]:
import json
with open("RAG_test_output.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)