In [None]:
import json

# Load the merged kanji data; later cells call .get() and .keys() on the result,
# so the JSON root is expected to be an object keyed by kanji.
# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs on the workstation that owns this directory.
with open(r"C:\Users\ragur\Japanese_Learning_Application\Mnemonics\merged_kanji.json", "r", encoding="utf-8") as f:
    kanji_data = json.load(f)

# print(len(kanji_data))


2230


In [2]:
import pandas as pd

# Radical table read from CSV; the "Radical" and "Meaning" columns (and, when
# present, "Visual_Form") are read further down.
# NOTE(review): absolute, machine-specific path and a timestamped file name —
# consider a configurable data directory.
radicals_df = pd.read_csv(r"C:\Users\ragur\Japanese_Learning_Application\Mnemonics\radicals_with_visual_form_20251029_104855.csv")
print(f"✅ Loaded {len(radicals_df)} radicals.")


✅ Loaded 322 radicals.


In [3]:
import re

radical_docs = []

# Helper to keep only ASCII context (avoid Chinese/Japanese leaking into prompts)
def _to_ascii(text: str) -> str:
    return re.sub(r"[^\x00-\x7F]+", " ", str(text)).strip()

# Turn every CSV row into a compact, English-only text snippet for the embedder.
for _row_idx, entry in radicals_df.iterrows():
    name_ascii = _to_ascii(entry["Radical"])
    meaning_ascii = _to_ascii(entry["Meaning"])
    # The visual form is sanitised the same way but deliberately left out of the
    # snippet; append it (when non-empty) as an extra "Visual: ..." line if the
    # shape description should be embedded as well.
    visual_ascii = _to_ascii(entry.get("Visual_Form", ""))
    radical_docs.append(f"Radical: {name_ascii}\nMeaning: {meaning_ascii}")

print(f"✅ Prepared {len(radical_docs)} radical documents for embeddings.")

✅ Prepared 322 radical documents for embeddings.


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Embed every radical document with the multilingual MiniLM sentence-transformer.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
radical_embeddings = embedder.encode(radical_docs, show_progress_bar=True)

# Flat L2 index over the embeddings. Vectors are added in radical_docs order, so
# index position i maps back to radical_docs[i] (relied on by the retrieval helper).
dimension = radical_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(radical_embeddings))

print("✅ Radical FAISS index built!")


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

✅ Radical FAISS index built!


In [5]:
import faiss
import numpy as np

# NOTE(review): these three statements repeat the index construction from the
# preceding cell verbatim; running them simply replaces `index` with an identical
# one, so this cell looks safe to delete.
dimension = radical_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(radical_embeddings))

print("✅ FAISS index built!")

✅ FAISS index built!


In [6]:
def retrieve_relevant_radicals(query_text, top_k=3):
    """Return the radical documents closest to ``query_text`` in embedding space.

    Args:
        query_text: Free text to look up (e.g. space-joined radical names).
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` entries of ``radical_docs``, nearest first. An empty list
        is returned for a blank query, a non-positive ``top_k`` or an empty corpus.
    """
    # A blank query has no meaningful neighbours; searching anyway would return
    # arbitrary radicals and feed unrelated context into the prompt.
    if not (query_text or "").strip() or top_k <= 0:
        return []

    k = min(top_k, len(radical_docs))
    if k == 0:
        return []

    # FAISS operates on float32 matrices.
    query_emb = np.asarray(embedder.encode([query_text]), dtype="float32")
    _distances, neighbours = index.search(query_emb, k)

    # FAISS pads missing results with -1, which Python would silently resolve to
    # the *last* document, so out-of-range labels are dropped explicitly.
    return [radical_docs[i] for i in neighbours[0] if 0 <= i < len(radical_docs)]

In [None]:
# from transformers import pipeline

# pipe = pipeline(
#     "text-generation",
#     model="Qwen/Qwen2.5-1.5B-Instruct",
#     dtype="auto",
#     device_map="auto"
# )

# def generate_mnemonic(kanji):
#     import re
#     k = kanji.strip()
#     details = kanji_data.get(k)
#     if not details:
#         return f"Kanji {k} not found."

#     meanings = ", ".join(details.get("meanings", []))
#     radicals_en = details.get("wk_radicals", [])
#     lhs = " + ".join(radicals_en) if radicals_en else (meanings or "")

#     # Retrieve context from radicals DB to guide the model
#     radical_context = "\n\n".join(retrieve_relevant_radicals(" ".join(radicals_en)))

#     # Strict, one-line arrow-style output; no tables, no headers
#     prompt = f"""
# You are a Japanese Kanji mnemonic writer.
# Using the radical info below as context, write exactly ONE short mnemonic sentence in this strict format:
# {k} = {lhs} → <concise mnemonic sentence>.

# Rules:
# - Output ONE line only. No markdown, no tables, no labels, no extra text.
# - Keep it realistic and logical (e.g., 買 = Net + Shell → Buying involves covering valuable shells).
# - Max ~120 characters.

# Radical Database (context):
# {radical_context}
# """

#     resp = pipe(
#         prompt,
#         max_new_tokens=80,
#         do_sample=False,
#         temperature=0.2,
#         return_full_text=False,
#     )
#     text = resp[0]["generated_text"] if resp and isinstance(resp, list) else str(resp)

#     # Post-process to ensure a single clean line in the desired arrow style
#     line = (text or "").strip().splitlines()[0] if (text or "").strip() else ""
#     line = line.strip("` ").replace("|", " ").strip()
#     line = re.sub(r"\s+", " ", line)

#     if "→" not in line:
#         # Build a fallback arrow-style line from the response core
#         core = re.sub(r'^(Mnemonic\s*:\s*)', '', line, flags=re.I)
#         core = re.split(r'(?<=[.!?])\s+', core)[0].strip()
#         if not core:
#             core = (meanings or "remember this") + "."
#         elif not core.endswith(('.', '!', '?', '…')):
#             core += '.'
#         line = f"{k} = {lhs} → {core}"

#     return line

Device set to use cpu


In [12]:
from transformers import pipeline
import re

# Text-generation pipeline around the Qwen2.5 1.5B instruct checkpoint; dtype and
# device placement are left to the library ("auto").
# NOTE(review): the captured output below reports CPU execution with weights
# partly offloaded to disk — expect slow generation.
pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)

# Helper to strip CJK (Chinese/Japanese/Korean) characters but preserve everything else
_CJK_RE = re.compile(r"[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]+")

def _strip_cjk(s: str) -> str:
    return _CJK_RE.sub("", s or "")

def _one_sentence(text: str, max_chars: int = 120) -> str:
    t = (text or "").strip()
    # Remove any angle-bracketed prompts/templates
    t = re.sub(r"<[^>]*>", "", t)
    # If the model echoed another arrow, keep only the first segment
    t = t.split("→", 1)[0]
    # Take only the first sentence
    parts = re.split(r"(?<=[.!?])\s+", t)
    one = parts[0] if parts else t
    one = re.sub(r"\s+", " ", one).strip()
    if len(one) > max_chars:
        one = one[:max_chars].rstrip(" ,;:") + "…"
    return one

def generate_mnemonic(kanji):
    """Generate a one-line, arrow-style English mnemonic for ``kanji``.

    The result has the form ``<kanji> = <radicals> → <sentence>``. The right-hand
    side is stripped of CJK characters and reduced to one sentence of at most 120
    characters.

    Args:
        kanji: The kanji to look up in ``kanji_data`` (surrounding whitespace is
            ignored).

    Returns:
        The mnemonic line, or ``"Kanji <k> not found."`` when ``kanji_data`` has no
        (non-empty) entry for it. Generation samples from the model, so repeated
        calls may return different sentences.
    """
    k = kanji.strip()
    details = kanji_data.get(k)
    if not details:
        return f"Kanji {k} not found."

    # `or []` also covers keys that are present but null in the JSON.
    meanings = ", ".join(details.get("meanings") or [])
    radicals_en = details.get("wk_radicals") or []
    lhs = " + ".join(radicals_en) if radicals_en else meanings
    # Used only in the canned fallback sentences; avoids "Represents  through ...".
    meaning_phrase = meanings.lower() or "this kanji"

    # Retrieve strictly ASCII-only context. Without radical names there is nothing
    # to search for, and an empty query would only return arbitrary radicals.
    radical_context = (
        "\n\n".join(retrieve_relevant_radicals(" ".join(radicals_en)))
        if radicals_en
        else ""
    )

    prompt = f"""
You are a Kanji mnemonic generator.
Combine the radicals’ meanings to create a short, logical English mnemonic.

Follow this exact one-line format:
{k} = {lhs} → <short, clear mnemonic>

Rules:
- Use ONLY English words and ASCII punctuation. Never include Japanese/Chinese (kanji, kana, hanzi) in the mnemonic.
- Ignore any non-English visual descriptions; translate their ideas into simple English or omit them.
- Keep it under 120 characters.
- Output exactly one line. No extra text.

Example:
買 = Net + Shell → Buying involves catching valuable shells in a net.

Context about the radicals (English-only):
{radical_context}

Now generate for {k} ("{meanings}").
"""

    resp = pipe(
        prompt,
        max_new_tokens=80,
        temperature=0.4,   # slightly higher to allow creativity
        top_p=0.8,
        do_sample=True,
        return_full_text=False
    )

    raw = resp[0]["generated_text"] if resp and isinstance(resp, list) else str(resp)

    def _flatten(fragment):
        # Table pipes become spaces and whitespace runs collapse to one space.
        return re.sub(r"\s+", " ", fragment.replace("|", " ")).strip()

    def _clean_rhs(rhs):
        # English-only, single sentence, without an echoed "=" / LHS / meanings prefix.
        rhs = _one_sentence(_strip_cjk(rhs), max_chars=120)
        rhs = re.sub(r"^[=\s]+", "", rhs)
        if lhs:
            lhs_pat = re.escape(lhs.strip())
            rhs = re.sub(rf"^(?:{lhs_pat}\b\s*:?\s*)+", "", rhs, flags=re.I)
        if meanings:
            meanings_pat = re.escape(meanings.strip())
            rhs = re.sub(rf"^(?:{meanings_pat}\b\s*:?\s*)+", "", rhs, flags=re.I)
        return rhs.strip()

    # Inspect the output one line at a time so neither the arrow match nor the
    # first-sentence cut can run into text from a following line. The fully
    # flattened output is kept as a last candidate for answers that were wrapped
    # across lines.
    candidates = [_flatten(part) for part in raw.splitlines()]
    candidates.append(_flatten(raw))

    arrow_re = re.compile(rf"{re.escape(k)}\s*=\s*.*?→.*")
    first_left = None
    for candidate in candidates:
        match = arrow_re.search(candidate)
        if not match:
            continue
        cand_left, cand_right = match.group(0).split("→", 1)
        if first_left is None:
            first_left = cand_left.strip()
        right = _clean_rhs(cand_right)
        if right:
            # First arrow line that still has content after sanitising wins; this
            # skips an echoed template line such as "... → <short, clear mnemonic>".
            left = cand_left.strip()
            break
    else:
        if first_left is None:
            # No arrow-style line at all: build the whole line manually.
            left = f"{k} = {lhs}"
            right = _one_sentence(f"Represents {meaning_phrase} through its parts.", max_chars=120)
        else:
            # Arrow line(s) found, but nothing usable remained on the right-hand side.
            left = first_left
            right = _one_sentence(f"Represents {meaning_phrase}.", max_chars=120)

    if right[-1] not in ".!?…":
        right += "."
    return f"{left} → {right}"

Some parameters are on the meta device because they were offloaded to the disk and cpu.
Device set to use cpu
Device set to use cpu


In [8]:
# Smoke test: generate and show the mnemonic for a single kanji.
print(generate_mnemonic("人"))

人 = Person → is like a dot on top of two sticks forming a cross, symbolizing someone standing upright with arms outstretched.


_________________________________________________________________________________________________________________________________________________________________

In [13]:
# Batch v3 (rewritten): use generate_mnemonic directly, enforce English-only and one-line outputs
import os, json, time, re
from datetime import datetime
from typing import Dict, Any, List

# Uses helpers defined earlier: _strip_cjk, _one_sentence, and the function generate_mnemonic
try:
    from tqdm import tqdm as _tqdm
except Exception:
    def _tqdm(x, **kwargs):
        return x

def _reminder_from_line(line: str, meaning: str) -> str:
    """Derive a short reminder (up to 40 characters before the ellipsis) from a mnemonic line.

    The text after the first ``→`` is used when the line has one, otherwise
    ``meaning``. CJK characters are removed and the text is cut to one sentence.
    If nothing is left, ``meaning`` (or ``"core idea"``) is shortened with a
    24-character limit instead.
    """
    if "→" in line:
        tail = line.split("→", 1)[1]
    else:
        tail = meaning or ""

    short = _one_sentence(_strip_cjk(tail), max_chars=40)
    if short:
        return short
    return _one_sentence(meaning or "core idea", max_chars=24)

def run_batch_with_resume_v3(output_path: str = "generated_mnemonics_v3.jsonl", batch_size: int = 10, limit: int = 0):
    """Generate mnemonics for all kanji missing from ``output_path`` and append them as JSONL.

    Each record holds ``kanji``, ``mnemonic``, ``meaning``, ``reminder`` and a UTC
    ``created_at`` timestamp. Kanji that already have a record in the file are
    skipped, so an interrupted run can be resumed by calling the function again.

    Args:
        output_path: JSONL file to read for resume information and to append to.
        batch_size: Number of records written between flushes to disk (values
            below 1 are treated as 1). Bounds how much work a crash can lose.
        limit: Process at most this many pending kanji; ``0`` means no limit.

    Returns:
        None. Progress and a summary are printed.
    """
    from datetime import timezone  # local import: the cell only imports `datetime` itself

    # Resume support: collect the kanji that already have a record on disk.
    done = set()
    needs_newline = False
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f:
            for raw in f:
                # Only the final line can lack a newline (e.g. a write cut short).
                needs_newline = not raw.endswith("\n")
                try:
                    obj = json.loads(raw)
                except ValueError:
                    # Blank or truncated line: ignore it, the kanji will be redone.
                    continue
                k = obj.get('kanji') if isinstance(obj, dict) else None
                if k:
                    done.add(k)

    todo_keys = [k for k in kanji_data.keys() if k not in done]
    if limit and limit > 0:
        todo_keys = todo_keys[:limit]

    print(f"Starting v3 batch (generate_mnemonic): {len(todo_keys)} kanji — output: {output_path}")
    if not todo_keys:
        print("Nothing to do.")
        return

    flush_every = max(int(batch_size), 1)
    written = 0
    start = time.time()
    with open(output_path, 'a', encoding='utf-8') as out:
        if needs_newline:
            # Terminate a partial last line so the next record starts on its own line.
            out.write("\n")
        for k in _tqdm(todo_keys, desc="Generating mnemonics"):
            try:
                line = generate_mnemonic(k)  # one-line arrow-style, English-only sanitized inside
            except Exception as e:
                # Best effort: report and move on; the kanji is retried on the next run.
                print(f"❌ Error for {k}: {e}")
                continue
            if "→" not in line:
                # generate_mnemonic returned a message (e.g. "... not found."), not a mnemonic.
                print(f"❌ Error for {k}: {line}")
                continue
            # Extract meaning
            info = kanji_data.get(k, {}) or {}
            mlist = info.get('meanings') or []
            meaning = (mlist[0] if isinstance(mlist, list) and mlist else info.get('meaning') or '')
            reminder = _reminder_from_line(line, meaning)
            # Same "....Z" format as before, without the deprecated datetime.utcnow().
            created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
            out.write(json.dumps({
                'kanji': k,
                'mnemonic': line,
                'meaning': meaning,
                'reminder': reminder,
                'created_at': created_at
            }, ensure_ascii=False) + "\n")
            written += 1
            if written % flush_every == 0:
                # Each item can take over a minute; do not leave finished work in the buffer.
                out.flush()
    dur = time.time() - start
    print(f"✅ v3 done — wrote {written} records in {dur:.1f}s")

In [15]:
def _retrieve_context_for_kanji(query):
    """Thin alias for ``retrieve_relevant_radicals`` using its default ``top_k``."""
    context_docs = retrieve_relevant_radicals(query)
    return context_docs

# Small v3 validation run (limit=9) — should produce short arrow-style lines
run_batch_with_resume_v3(output_path="generated_mnemonics_v3.jsonl", batch_size=3, limit=9)

Starting v3 batch (generate_mnemonic): 9 kanji — output: generated_mnemonics_v3.jsonl


Generating mnemonics: 100%|██████████| 9/9 [12:58<00:00, 86.45s/it]

✅ v3 done — wrote 9 records in 778.1s



