# 📦 Build Custom Japanese IME Dataset (10M Target)

Create a high-quality 10M-item conversational dataset for keyboard IME training.

**3-Layer Strategy:**

| Layer | Source | Purpose | Target |
|:---:|---|---|---:|
| 🧱 Base | CulturaX Japanese web text | Grammar foundations | ~5M pairs |
| 💬 Conversation | Shōsetsuka ni Narō (web novels) | Casual dialogue | ~3M pairs |
| 🔥 Freshness | RSS feeds (2026 news/tech) | Modern vocabulary | ~2M pairs |

**Pipeline:** Raw text → Extract sentences → Kill filter → SudachiPy kana → Augment slices → JSONL

In [None]:
import os

# Work out which runtime we are on and where artifacts should be stored.
_running_on_colab = 'COLAB_RELEASE_TAG' in os.environ

if _running_on_colab:
    # Colab: mount Google Drive so outputs survive the session.
    from google.colab import drive
    drive.mount('/content/drive')
    PLATFORM, DRIVE_DIR = 'Colab', '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
elif os.path.exists('/kaggle/working'):
    PLATFORM, DRIVE_DIR = 'Kaggle', '/kaggle/working'
else:
    PLATFORM, DRIVE_DIR = 'Local', './output'

# Every layer writes its JSONL files below this directory.
OUTPUT_DIR = DRIVE_DIR + "/datasets"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"‚úÖ Platform: {PLATFORM}", f"üìÅ Output: {OUTPUT_DIR}", sep="\n")

In [None]:
!pip install -q datasets tqdm sudachipy sudachidict_full feedparser requests beautifulsoup4

In [None]:
# ===========================================================
# CONFIGURATION
# ===========================================================

# Overall number of training pairs we are aiming for
TOTAL_TARGET = 10_000_000

# Pair budget per layer
LAYER_1_TARGET = 5_000_000   # layer 1: base web text (grammar)
LAYER_2_TARGET = 3_000_000   # layer 2: web novels (conversation)
LAYER_3_TARGET = 2_000_000   # layer 3: RSS feeds (freshness)

# Caps on the raw input consumed per layer
LAYER_1_RAW_LINES = 1_500_000   # streamed web-text records
LAYER_2_MAX_NOVELS = 5          # novels to scrape
LAYER_3_MAX_ARTICLES = 5_000    # RSS articles to fetch

# Sentence length bounds (in characters) accepted by the filters
MIN_SENTENCE_LEN = 5
MAX_SENTENCE_LEN = 120

# Slice augmentation switches
ENABLE_SLICING = True
MIN_SLICE_WORDS = 2

# Per-layer on/off switches (False skips the layer)
RUN_LAYER_1 = True
RUN_LAYER_2 = True
RUN_LAYER_3 = True

# Print a one-line summary per layer with its enabled/skipped marker
print(f"üéØ Target: {TOTAL_TARGET:,} pairs")
for _label, _target, _enabled in (
    ("Layer 1 (OSCAR):  ", LAYER_1_TARGET, RUN_LAYER_1),
    ("Layer 2 (Nar≈ç):   ", LAYER_2_TARGET, RUN_LAYER_2),
    ("Layer 3 (RSS):    ", LAYER_3_TARGET, RUN_LAYER_3),
):
    _mark = '‚úÖ' if _enabled else '‚è≠Ô∏è'
    print(f"   {_label}{_target:,} {_mark}")

## 1. Core Pipeline: Reverse-Kana + Extraction + Filters

In [None]:
from sudachipy import tokenizer as sudachi_tokenizer, dictionary as sudachi_dictionary
import re
import json
from collections import Counter

# ==========================================================
# Tokenizer setup: SudachiPy with the "full" dictionary
# ==========================================================
# Split mode C is used for every tokenize() call in this notebook.
SPLIT_MODE = sudachi_tokenizer.Tokenizer.SplitMode.C

# Build the dictionary once and reuse a single tokenizer instance.
sudachi_dict = sudachi_dictionary.Dictionary(dict="full")
sudachi = sudachi_dict.create()

print("‚úÖ SudachiPy (full dict, Mode C) ready")


# ==========================================================
# Reverse-Kana Pipeline
# ==========================================================

def text_to_morphemes(text):
    """Tokenize *text* and return one ``(surface, reading, pos)`` tuple per morpheme.

    ``reading`` falls back to the surface form when the tokenizer reports an
    empty reading; ``pos`` is the first element of the part-of-speech tuple.
    """
    return [
        (m.surface(), m.reading_form() or m.surface(), m.part_of_speech()[0])
        for m in sudachi.tokenize(text, SPLIT_MODE)
    ]


def text_to_kana(text):
    """Return the readings of all morphemes in *text*, concatenated in order."""
    readings = [reading for _surface, reading, _pos in text_to_morphemes(text)]
    return ''.join(readings)


# Slices consisting only of punctuation/whitespace carry no conversion signal.
# Compiled once instead of on every slice.
_PUNCT_ONLY_RE = re.compile(r'^[„ÄÅ„ÄÇÔºüÔºÅ\s]+$')


def generate_pairs(text, left_context='', enable_slicing=True):
    """Generate reading -> surface training pairs (full sentence + slices).

    Args:
        text: Sentence to convert.
        left_context: Text preceding the sentence; stored with the
            full-sentence pair and prepended to every slice's context.
        enable_slicing: When False only the full-sentence pair is produced.

    Returns:
        A list of dicts with keys ``left_context``, ``input`` (reading) and
        ``output`` (surface text). Empty when the sentence has fewer than
        ``MIN_SLICE_WORDS`` morphemes.
    """
    morphemes = text_to_morphemes(text)
    total = len(morphemes)
    if total < MIN_SLICE_WORDS:
        return []

    surfaces = [s for s, _, _ in morphemes]
    readings = [r for _, r, _ in morphemes]

    pairs = []

    # Full-sentence pair; skipped when the reading equals the surface
    # (nothing to convert).
    full_kana = ''.join(readings)
    full_text = ''.join(surfaces)
    if full_kana != full_text:
        pairs.append({
            'left_context': left_context,
            'input': full_kana,
            'output': full_text
        })

    if not enable_slicing:
        return pairs

    # Sliced pairs at morpheme boundaries (1-, 2- and 3-morpheme windows).
    for slice_size in (1, 2, 3):
        for i in range(total - slice_size + 1):
            # A window spanning the whole sentence would either duplicate the
            # full-sentence pair above or be dropped by the reading == surface
            # check below, so it never adds a new example.
            if i == 0 and slice_size == total:
                continue

            s_kana = ''.join(readings[i:i + slice_size])
            s_text = ''.join(surfaces[i:i + slice_size])

            if s_kana == s_text or not s_text:
                continue
            if _PUNCT_ONLY_RE.match(s_text):
                continue

            # Context = caller-supplied context + sentence text before the
            # slice, limited to the last 60 characters.
            ctx = left_context + ''.join(surfaces[:i])
            pairs.append({
                'left_context': ctx[-60:],
                'input': s_kana,
                'output': s_text
            })

    return pairs


# ==========================================================
# Quality Filters (Kill List)
# ==========================================================

# Text containing any of these markers is rejected by extract_sentences:
# "ID:"-style headers, hh:mm:ss timestamps, URL fragments (http / https /
# www.) and ISBN/ISSN references. Matching is case-insensitive.
# NOTE(review): the non-ASCII alternatives in the patterns of this section
# look mis-encoded in this copy of the file (UTF-8 read through a legacy code
# page) — confirm the file encoding before relying on them.
KILL_PATTERNS = re.compile(
    r'(ID:|„Ç¢„Ç§„Éá„Ç£:|ÊäïÁ®øÊó•|ÂêçÂâç:|ÂêçÁÑ°„Åó|'
    r'\d{2}:\d{2}:\d{2}|'
    r'(http|www\.|https)|'
    r'ISBN|ISSN|'
    r'„Ç®„Ç§„ÉÅ„ÉÜ„Ç£„Éº„ÉÜ„Ç£„Éº„Éî„Éº|'
    r'„ÉÄ„Éñ„É™„É•„ÉÄ„Éñ„É™„É•)',
    re.IGNORECASE
)
# Any character in U+0400-U+04FF (Cyrillic) or U+0100-U+024F (Latin Extended-A/B).
GARBAGE_RE = re.compile(r'[\u0400-\u04FF\u0100-\u024F]')
# Phrases that disqualify a non-dialogue sentence; several alternatives require
# a trailing digit. Presumably targets encyclopedia-style statements
# (coordinates, elevations, ordinals) — verify once the encoding is confirmed.
ENCYCLOPEDIA_RE = re.compile(
    r'(„Å´‰ΩçÁΩÆ„Åô„Çã|ÂåóÁ∑Ø\d|ÂçóÁ∑Ø\d|Êù±Áµå\d|Ë•øÁµå\d|Ê®ôÈ´ò\d|Â≠¶Âêç|ÂàÜÈ°ûÂ≠¶|Á¨¨\d+‰ª£|Á¥ÄÂÖÉÂâç)'
)
# One Japanese character: hiragana (U+3040-U+309F), katakana (U+30A0-U+30FF)
# or a CJK unified ideograph (U+4E00-U+9FFF).
JAPANESE_RE = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')
# Captures 2-80 characters enclosed between an opening and a closing delimiter
# (the body may not contain the closing delimiter). Presumably the delimiters
# are Japanese corner brackets — see the encoding note above.
DIALOGUE_RE = re.compile(r'„Äå([^„Äç]{2,80})„Äç')


def extract_sentences(text):
    """Pull filtered sentences out of *text*.

    Returns a list of ``(sentence, is_dialogue)`` tuples: first every quoted
    dialogue span that passes the length, kill-list and garbage filters, then
    every delimiter-split sentence that additionally passes the encyclopedia,
    Japanese-ratio and digit-group filters. Text without any Japanese
    character yields an empty list.
    """
    if not JAPANESE_RE.search(text):
        return []

    def _length_ok(candidate):
        return MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN

    # Quoted dialogue comes first in the result.
    quoted = (q.strip() for q in DIALOGUE_RE.findall(text))
    results = [
        (q, True)
        for q in quoted
        if _length_ok(q) and not KILL_PATTERNS.search(q) and not GARBAGE_RE.search(q)
    ]

    # Then ordinary sentences, split on sentence terminators and newlines.
    for chunk in re.split(r'[„ÄÇÔºÅÔºü\n]+', text):
        candidate = chunk.strip()
        if not candidate or not _length_ok(candidate):
            continue
        if any(rx.search(candidate) for rx in (KILL_PATTERNS, GARBAGE_RE, ENCYCLOPEDIA_RE)):
            continue
        jp_ratio = len(JAPANESE_RE.findall(candidate)) / max(len(candidate), 1)
        digit_groups = len(re.findall(r'\d+', candidate))
        if jp_ratio < 0.3 or digit_groups >= 3:
            continue
        results.append((candidate, False))

    return results


print("‚úÖ Pipeline + filters ready")

# Smoke test: run one sample through extraction and show each sentence
# next to its reading.
test = 'Áî∞‰∏≠„Åï„Çì„ÅØ„Äå„Éû„Ç∏„ÅßÔºü„Äç„Å®Ë®Ä„Å£„Åü„ÄÇ‰ªäÊó•„ÅØÂ§©Ê∞ó„Åå„ÅÑ„ÅÑ„Åß„Åô„Å≠„ÄÇ'
sents = extract_sentences(test)
for s, d in sents:
    print(f"  {'üí¨' if d else 'üìù'} {s} ‚Üí {text_to_kana(s)}")

In [None]:
# ==========================================================
# Shared processing function for all layers
# ==========================================================

from tqdm.auto import tqdm

def process_text_lines(lines_iter, output_file, max_pairs, layer_name,
                       max_lines=None, resume_from=0):
    """Run raw text through sentence extraction and pair generation into JSONL.

    Each item from ``lines_iter`` is split with ``extract_sentences``; every
    sentence is expanded with ``generate_pairs`` (the previous sentence of the
    same item is passed as left context) and each pair is written as one JSON
    line. A sidecar ``<output_file>.progress`` file records the last line
    index and the running stats; it is written every 20,000 processed lines
    and whenever the loop ends, including on interruption or error.

    Args:
        lines_iter: Iterator of text strings.
        output_file: Path of the JSONL file to write (appended to when
            ``resume_from`` > 0, otherwise overwritten).
        max_pairs: Stop once at least this many pairs have been written.
        layer_name: Label used for the progress bar and log lines.
        max_lines: Maximum number of input lines to process (None = no cap).
        resume_from: Number of leading input lines to skip.

    Returns:
        Stats dict with keys ``lines_processed``, ``sentences``,
        ``dialogues``, ``pairs`` and ``errors``.
    """
    progress_file = output_file + '.progress'

    stats = {
        'lines_processed': 0,
        'sentences': 0,
        'dialogues': 0,
        'pairs': 0,
        'errors': 0,
    }

    def _save_progress(current_idx):
        # Persist the position and counters so a later run can resume.
        with open(progress_file, 'w', encoding='utf-8') as pf:
            json.dump({'line_idx': current_idx, 'stats': stats}, pf)

    # Restore counters from an earlier run when resuming.
    if resume_from > 0 and os.path.exists(progress_file):
        with open(progress_file, 'r', encoding='utf-8') as f:
            saved = json.load(f)
        stats.update(saved.get('stats', {}))
        print(f"üìÇ Resuming {layer_name} from line {resume_from:,} ({stats['pairs']:,} pairs so far)")

    mode = 'a' if resume_from > 0 else 'w'
    line_idx = 0

    with open(output_file, mode, encoding='utf-8') as out_f:
        pbar = tqdm(desc=f"{layer_name}", total=max_lines)
        try:
            for text in lines_iter:
                line_idx += 1
                pbar.update(1)
                if line_idx <= resume_from:
                    continue

                stats['lines_processed'] += 1

                # Empty/None items have nothing to extract; they still count
                # as processed lines so the stop conditions keep working.
                sentences = extract_sentences(text) if text else []
                # Left context never crosses input-line boundaries.
                prev_sentence = ''

                for sentence, is_dialogue in sentences:
                    stats['sentences'] += 1
                    if is_dialogue:
                        stats['dialogues'] += 1

                    try:
                        pairs = generate_pairs(sentence, prev_sentence, ENABLE_SLICING)
                        for p in pairs:
                            out_f.write(json.dumps(p, ensure_ascii=False) + '\n')
                        stats['pairs'] += len(pairs)
                        prev_sentence = sentence
                    except Exception as e:
                        # Best effort: count the failure, show the first few.
                        stats['errors'] += 1
                        if stats['errors'] <= 3:
                            print(f"  ‚ö† {e}: {sentence[:30]}")

                # Checkpoint every 20K lines; flush first so the progress
                # file never runs ahead of the data on disk.
                if stats['lines_processed'] % 20_000 == 0:
                    out_f.flush()
                    _save_progress(line_idx)
                    pbar.set_postfix(pairs=f"{stats['pairs']:,}")

                # Stop conditions
                if stats['pairs'] >= max_pairs:
                    print(f"\n  ‚úì Reached {max_pairs:,} pairs target")
                    break
                if max_lines and stats['lines_processed'] >= max_lines:
                    break
        finally:
            # Runs on normal completion, break, Ctrl-C and errors alike.
            pbar.close()
            out_f.flush()
            _save_progress(line_idx)

    print(f"  ‚úÖ {layer_name}: {stats['pairs']:,} pairs from {stats['sentences']:,} sentences")
    print(f"     Dialogues: {stats['dialogues']:,} | Errors: {stats['errors']}")

    return stats

---

## 🧱 Layer 1: CulturaX — Base Grammar (~5M pairs)

CulturaX is a cleaned, deduplicated web text corpus (6.3T tokens, 167 languages).

**Why CulturaX?** `cc100`, `mc4`, and `OSCAR` all rely on deprecated dataset loading scripts.
CulturaX is distributed in Parquet format, so it loads with current versions of the HuggingFace `datasets` library.

In [None]:
if RUN_LAYER_1:
    from datasets import load_dataset

    print("üì• Layer 1: Loading Japanese web text (streaming)...")
    print("   This streams data ‚Äî no full download needed.")

    # Candidate corpora, tried in order until one loads.
    # (repo id, config name, "trying" label, "loaded" label, failure prefix)
    _L1_SOURCES = (
        ("uonlp/CulturaX", "ja",
         "CulturaX (ja)", "CulturaX loaded", "‚ö† CulturaX failed"),
        ("HuggingFaceFW/fineweb-2", "jpn_Jpan",
         "FineWeb-2 (jpn_Jpan)", "FineWeb-2 loaded", "‚ö† FineWeb-2 failed"),
        ("wikimedia/wikipedia", "20231101.ja",
         "Wikipedia Japanese (fallback)", "Wikipedia-ja loaded (fallback)",
         "‚ùå All sources failed"),
    )

    base_ds = None
    text_key = "text"  # every candidate is read through this field

    for _repo, _config, _trying, _loaded, _failed in _L1_SOURCES:
        try:
            print(f"   Trying {_trying}...")
            base_ds = load_dataset(_repo, _config, split="train", streaming=True)
            print(f"   ‚úì {_loaded}")
            break
        except Exception as e:
            print(f"   {_failed}: {e}")
            base_ds = None

    if base_ds is not None:
        def base_lines():
            """Yield the text field of each streamed record ('' when missing or None)."""
            for item in base_ds:
                yield item.get(text_key) or ""

        OUTPUT_L1 = f"{OUTPUT_DIR}/layer1_base.jsonl"

        stats_l1 = process_text_lines(
            lines_iter=base_lines(),
            output_file=OUTPUT_L1,
            max_pairs=LAYER_1_TARGET,
            max_lines=LAYER_1_RAW_LINES,
            layer_name="üß± Base Grammar"
        )

        # The original literal was split across a physical line break
        # (a SyntaxError); the newline is now an escape sequence.
        print(f"\nüíæ Layer 1 saved: {OUTPUT_L1}")
        print(f"   Size: {os.path.getsize(OUTPUT_L1) / (1024**2):.1f} MB")
    else:
        stats_l1 = {"pairs": 0}
        print("‚ùå Layer 1 skipped (no data source available)")
else:
    print("‚è≠Ô∏è Layer 1 skipped")
    stats_l1 = {"pairs": 0}

---

## 💬 Layer 2: Shōsetsuka ni Narō — Conversation (~3M pairs)

Web novels are the **#1 source** for modern Japanese dialogue.
- Massive amounts of dialogue in `「...」` brackets
- Modern grammar, slang, natural speech
- Genres: Contemporary Drama (現代ドラマ), Romance (恋愛), Daily Life (日常)

API: `https://api.syosetu.com/novelapi/api/`

In [None]:
if RUN_LAYER_2:
    import requests
    from bs4 import BeautifulSoup
    import time

    # ==========================================================
    # Web-novel scraper (syosetu.com)
    # ==========================================================

    NARO_API = 'https://api.syosetu.com/novelapi/api/'
    NARO_BASE = 'https://ncode.syosetu.com'

    # Genre codes queried for conversational text. The original author's
    # intent was realistic / romance / daily-life / other genres.
    # NOTE(review): confirm each code's meaning against the API's genre table.
    NARO_GENRES = [102, 301, 302, 401, 9901, 9902]

    def get_novel_list(genre, limit=50, order='hyoka'):
        """Query the novel API for one genre and return the novel records.

        Args:
            genre: Genre code sent as the ``genre`` parameter.
            limit: Maximum number of results (``lim`` parameter).
            order: Sort key sent as ``order`` (default ``'hyoka'``).

        Returns:
            List of dicts (fields requested via ``of='n-t-ga'``); an empty
            list when the request fails or returns no novels.
        """
        params = {
            'out': 'json',
            'genre': genre,
            'order': order,
            'lim': limit,
            'of': 'n-t-ga',
        }
        try:
            resp = requests.get(NARO_API, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            # The first element is skipped (treated as metadata, not a novel).
            return data[1:] if len(data) > 1 else []
        except Exception as e:
            print(f"  ‚ö† API error: {e}")
            return []


    def scrape_chapter(ncode, chapter_no):
        """Download one chapter page and return its paragraph text.

        Returns the non-empty ``<p>`` texts of ``<div id="novel_honbun">``
        joined with newlines, or ``''`` when the page cannot be fetched or
        has no such element.
        """
        url = f"{NARO_BASE}/{ncode}/{chapter_no}/"
        try:
            resp = requests.get(url, timeout=30, headers={
                'User-Agent': 'Mozilla/5.0 (IME Dataset Research)'
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')

            body = soup.find('div', id='novel_honbun')
            if not body:
                return ''

            paragraphs = (p.get_text(strip=True) for p in body.find_all('p'))
            return '\n'.join(text for text in paragraphs if text)
        except Exception as e:
            # Still best effort, but no longer silent: a run of these
            # warnings points at a network or site-layout problem.
            print(f"  WARNING: chapter fetch failed ({url}): {e}")
            return ''


    # Collect candidate novels for every genre
    print("üì• Layer 2: Fetching Nar≈ç novel list...")
    all_novels = []
    for genre in NARO_GENRES:
        novels = get_novel_list(genre, limit=50, order='hyoka')
        all_novels.extend(novels)
        print(f"  Genre {genre}: {len(novels)} novels")
        time.sleep(1)  # pause between API calls

    # Drop repeated novels (same ncode listed under several genres)
    seen = set()
    unique_novels = []
    for n in all_novels:
        ncode = n.get('ncode', '')
        if ncode and ncode not in seen:
            seen.add(ncode)
            unique_novels.append(n)

    # Keep only the configured number of novels
    unique_novels = unique_novels[:LAYER_2_MAX_NOVELS]
    total_chapters = sum(n.get('general_all_no', 0) for n in unique_novels)

    print(f"  ‚úì {len(unique_novels)} unique novels, ~{total_chapters:,} total chapters")
    print(f"  Scraping chapters (1 req/sec, be polite)...")
else:
    print("‚è≠Ô∏è Layer 2 skipped")
In [None]:
if RUN_LAYER_2:
    OUTPUT_L2 = f"{OUTPUT_DIR}/layer2_narou.jsonl"
    PROGRESS_L2 = f"{OUTPUT_DIR}/layer2_progress.json"

    # Resume support: novels already finished in an earlier run are skipped.
    processed_novels = set()
    l2_pairs = 0
    if os.path.exists(PROGRESS_L2):
        with open(PROGRESS_L2, 'r') as f:
            progress = json.load(f)
        processed_novels = set(progress.get('processed_ncodes', []))
        l2_pairs = progress.get('pairs', 0)
        print(f"üìÇ Resuming: {len(processed_novels)} novels done, {l2_pairs:,} pairs")

    l2_stats = {
        'novels': len(processed_novels),
        'chapters': 0,
        'sentences': 0,
        'dialogues': 0,
        'pairs': l2_pairs,
        'errors': 0,
    }

    # Append only when an earlier run already produced pairs.
    mode = 'a' if l2_pairs > 0 else 'w'

    with open(OUTPUT_L2, mode, encoding='utf-8') as out_f:
        pbar = tqdm(unique_novels, desc="üí¨ Nar≈ç Novels")

        for novel in pbar:
            ncode = novel.get('ncode', '')
            if not ncode or ncode in processed_novels:
                continue

            total_ch = novel.get('general_all_no', 0)
            max_ch = min(total_ch, 200)  # cap at 200 chapters per novel

            for ch in range(1, max_ch + 1):
                chapter_text = scrape_chapter(ncode, ch)
                # Rate limit: 1 request per second. Applied after every
                # request, including failed ones, so a streak of errors does
                # not turn into a burst of back-to-back requests.
                time.sleep(1)
                if not chapter_text:
                    continue

                l2_stats['chapters'] += 1
                sentences = extract_sentences(chapter_text)
                prev = ''  # left context restarts with each chapter

                for sentence, is_dialogue in sentences:
                    l2_stats['sentences'] += 1
                    if is_dialogue:
                        l2_stats['dialogues'] += 1

                    try:
                        pairs = generate_pairs(sentence, prev, ENABLE_SLICING)
                        for p in pairs:
                            p['source'] = 'narou'  # tag the origin layer
                            out_f.write(json.dumps(p, ensure_ascii=False) + '\n')
                        l2_stats['pairs'] += len(pairs)
                        prev = sentence
                    except Exception:
                        # `except Exception` (not bare) so Ctrl-C still stops the run.
                        l2_stats['errors'] += 1

            # Mark novel as done
            processed_novels.add(ncode)
            l2_stats['novels'] += 1
            pbar.set_postfix(pairs=f"{l2_stats['pairs']:,}")

            # Save progress per novel
            with open(PROGRESS_L2, 'w') as pf:
                json.dump({
                    'processed_ncodes': list(processed_novels),
                    'pairs': l2_stats['pairs'],
                }, pf)
            out_f.flush()

            # Stop if target reached
            if l2_stats['pairs'] >= LAYER_2_TARGET:
                print(f"\n  ‚úì Reached {LAYER_2_TARGET:,} target")
                break

    print(f"\n  ‚úÖ Layer 2: {l2_stats['pairs']:,} pairs")
    print(f"     Novels: {l2_stats['novels']} | Chapters: {l2_stats['chapters']:,}")
    print(f"     Dialogues: {l2_stats['dialogues']:,}")
    print(f"  üíæ Saved: {OUTPUT_L2}")
    print(f"     Size: {os.path.getsize(OUTPUT_L2) / (1024**2):.1f} MB")
else:
    l2_stats = {'pairs': 0}

---

## 🔥 Layer 3: NHK News RSS — Freshness 2026 (~2M pairs)

Scrape NHK News RSS feeds for modern, natural Japanese.

Focus: **Daily conversation & grammar** (not technical jargon).
NHK uses clean, standard Japanese perfect for IME training.

Sources:
- NHK News (general/main)
- NHK Society (social/daily life)
- NHK Science (science/education)
- NHK Life (lifestyle/culture)
- NHK Entertainment (culture/sports)
- NHK Business (economy/daily)

In [None]:
if RUN_LAYER_3:
    import feedparser
    import requests
    from bs4 import BeautifulSoup
    import time

    # ==========================================================
    # RSS feed sources (NHK news categories)
    # ==========================================================
    RSS_FEEDS = [
        ("NHK Main",          "https://www3.nhk.or.jp/rss/news/cat0.xml"),
        ("NHK Society",       "https://www3.nhk.or.jp/rss/news/cat1.xml"),
        ("NHK Science",       "https://www3.nhk.or.jp/rss/news/cat3.xml"),
        ("NHK Life",          "https://www3.nhk.or.jp/rss/news/cat6.xml"),
        ("NHK Entertainment", "https://www3.nhk.or.jp/rss/news/cat4.xml"),
        ("NHK Business",      "https://www3.nhk.or.jp/rss/news/cat5.xml"),
        ("NHK Sports",        "https://www3.nhk.or.jp/rss/news/cat2.xml"),
        ("NHK Local",         "https://www3.nhk.or.jp/rss/news/cat7.xml"),
    ]

    def fetch_article_text(url):
        """Fetch an article page and return its body text.

        Returns the newline-joined paragraph text of the first element that
        looks like an article body (falling back to the whole page's text),
        or ``""`` when the request fails or the page holds too little
        Japanese text (at most 50 characters or at most 20 Japanese ones).
        """
        try:
            resp = requests.get(url, timeout=15, headers={
                "User-Agent": "Mozilla/5.0 (IME Dataset Research)"
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Strip non-content elements before extracting text
            for tag in soup.find_all(["script", "style", "nav", "header", "footer", "aside"]):
                tag.decompose()

            # Try common article body selectors
            body = (
                soup.find("article") or
                soup.find("div", class_=re.compile(r"article|entry|content|body", re.I)) or
                soup.find("div", id=re.compile(r"article|entry|content|body", re.I))
            )

            # The original "\n" literals were split across physical line
            # breaks (a SyntaxError); they are now escape sequences.
            if body:
                chunks = (p.get_text(strip=True) for p in body.find_all("p"))
                text = "\n".join(chunk for chunk in chunks if chunk)
            else:
                text = soup.get_text(separator="\n", strip=True)

            # Only return if there is substantial Japanese text
            if len(text) > 50 and len(JAPANESE_RE.findall(text)) > 20:
                return text
            return ""
        except Exception:
            # Best effort: any fetch/parse failure counts as "no article
            # text". `except Exception` (not bare) keeps Ctrl-C working.
            return ""


    # Collect article URLs from all feeds
    print("üì• Layer 3: Fetching NHK RSS feeds...")
    all_articles = []

    for name, url in RSS_FEEDS:
        try:
            feed = feedparser.parse(url)
            entries = feed.entries[:100]  # at most 100 entries per feed
            for entry in entries:
                link = entry.get("link", "")
                title = entry.get("title", "")
                summary = entry.get("summary", "")
                if link:
                    all_articles.append({
                        "url": link,
                        "title": title,
                        "summary": summary,
                        "source": name
                    })
            print(f"  ‚úì {name}: {len(entries)} articles")
        except Exception as e:
            print(f"  ‚ö† {name}: {e}")

    # Deduplicate by URL (first occurrence wins)
    seen_urls = set()
    unique_articles = []
    for a in all_articles:
        if a["url"] not in seen_urls:
            seen_urls.add(a["url"])
            unique_articles.append(a)

    unique_articles = unique_articles[:LAYER_3_MAX_ARTICLES]
    print(f"  ‚úì Total unique articles: {len(unique_articles):,}")
else:
    print("‚è≠Ô∏è Layer 3 skipped")

In [None]:
if RUN_LAYER_3:
    OUTPUT_L3 = f"{OUTPUT_DIR}/layer3_rss.jsonl"
    PROGRESS_L3 = f"{OUTPUT_DIR}/layer3_progress.json"

    # Resume support: articles handled in an earlier run are skipped.
    processed_urls = set()
    l3_pairs = 0
    if os.path.exists(PROGRESS_L3):
        with open(PROGRESS_L3, 'r') as f:
            progress = json.load(f)
        processed_urls = set(progress.get('processed_urls', []))
        l3_pairs = progress.get('pairs', 0)
        print(f"üìÇ Resuming: {len(processed_urls)} articles done, {l3_pairs:,} pairs")

    l3_stats = {
        'articles': len(processed_urls),
        'sentences': 0,
        'pairs': l3_pairs,
        'errors': 0,
        'empty_articles': 0,
    }

    # Append only when an earlier run already produced pairs.
    mode = 'a' if l3_pairs > 0 else 'w'

    with open(OUTPUT_L3, mode, encoding='utf-8') as out_f:
        pbar = tqdm(unique_articles, desc="üî• RSS Articles")

        for article in pbar:
            if article['url'] in processed_urls:
                continue

            # Title and summary come from the feed itself (no extra request)
            text_parts = []
            if article.get('title'):
                text_parts.append(article['title'])
            if article.get('summary'):
                # Strip HTML markup from the summary
                summary_soup = BeautifulSoup(article['summary'], 'html.parser')
                text_parts.append(summary_soup.get_text(strip=True))

            # Then try to fetch the full article body
            full_text = fetch_article_text(article['url'])
            if full_text:
                text_parts.append(full_text)
            else:
                l3_stats['empty_articles'] += 1

            combined = '\n'.join(text_parts)
            sentences = extract_sentences(combined)
            prev = ''  # left context restarts with each article

            for sentence, _is_dialogue in sentences:
                l3_stats['sentences'] += 1
                try:
                    pairs = generate_pairs(sentence, prev, ENABLE_SLICING)
                    for p in pairs:
                        p['source'] = 'rss'  # tag the origin layer
                        out_f.write(json.dumps(p, ensure_ascii=False) + '\n')
                    l3_stats['pairs'] += len(pairs)
                    prev = sentence
                except Exception:
                    # `except Exception` (not bare) so Ctrl-C still stops the run.
                    l3_stats['errors'] += 1

            processed_urls.add(article['url'])
            l3_stats['articles'] += 1
            pbar.set_postfix(pairs=f"{l3_stats['pairs']:,}")

            # Save progress every 50 articles
            if l3_stats['articles'] % 50 == 0:
                with open(PROGRESS_L3, 'w') as pf:
                    json.dump({
                        'processed_urls': list(processed_urls),
                        'pairs': l3_stats['pairs'],
                    }, pf)
                out_f.flush()

            # Rate limit: pause between article requests
            time.sleep(1.5)

            # Stop if target reached
            if l3_stats['pairs'] >= LAYER_3_TARGET:
                print(f"\n  ‚úì Reached {LAYER_3_TARGET:,} target")
                break

    # Final save
    with open(PROGRESS_L3, 'w') as pf:
        json.dump({
            'processed_urls': list(processed_urls),
            'pairs': l3_stats['pairs'],
        }, pf)

    print(f"\n  ‚úÖ Layer 3: {l3_stats['pairs']:,} pairs from {l3_stats['articles']} articles")
    print(f"     Sentences: {l3_stats['sentences']:,} | Empty articles: {l3_stats['empty_articles']}")
    print(f"  üíæ Saved: {OUTPUT_L3}")
    if os.path.exists(OUTPUT_L3):
        print(f"     Size: {os.path.getsize(OUTPUT_L3) / (1024**2):.1f} MB")
else:
    l3_stats = {'pairs': 0}

---

## 5. Merge All Layers → Final Dataset

In [None]:
import random

FINAL_FILE = f"{OUTPUT_DIR}/ime_dataset_10m.jsonl"

# A layer takes part in the merge when it is enabled and its file exists.
layer_files = [
    (label, path)
    for enabled, label, path in (
        (RUN_LAYER_1, 'üß± Base', f"{OUTPUT_DIR}/layer1_base.jsonl"),
        (RUN_LAYER_2, 'üí¨ Nar≈ç', f"{OUTPUT_DIR}/layer2_narou.jsonl"),
        (RUN_LAYER_3, 'üî• RSS', f"{OUTPUT_DIR}/layer3_rss.jsonl"),
    )
    if enabled and os.path.exists(path)
]

if layer_files:
    print("üì¶ Merging all layers...")

    # Concatenate the layer files in order, counting lines per layer.
    layer_counts = {}
    with open(FINAL_FILE, 'w', encoding='utf-8') as out_f:
        for name, filepath in layer_files:
            written = 0
            with open(filepath, 'r', encoding='utf-8') as in_f:
                for written, line in enumerate(in_f, start=1):
                    out_f.write(line)
            layer_counts[name] = written
            print(f"  {name}: {written:,} pairs")

    total_lines = sum(layer_counts.values())
    file_size = os.path.getsize(FINAL_FILE)
    rule = '=' * 60

    print(f"\n{rule}")
    print(f"üìä FINAL DATASET")
    print(f"{rule}")
    print(f"  Total pairs:  {total_lines:,}")
    print(f"  File size:    {file_size / (1024**2):.1f} MB")
    print(f"  File:         {FINAL_FILE}")
    print(f"")
    # Per-layer share with a simple bar (one block per 2%)
    for name, count in layer_counts.items():
        pct = count / max(total_lines, 1) * 100
        bar = '‚ñà' * int(pct / 2)
        print(f"  {name}: {count:>10,} ({pct:5.1f}%) {bar}")

    target_pct = total_lines / TOTAL_TARGET * 100
    print(f"\n  üéØ Target progress: {total_lines:,} / {TOTAL_TARGET:,} ({target_pct:.1f}%)")
    if total_lines < TOTAL_TARGET:
        print(f"     Need {TOTAL_TARGET - total_lines:,} more pairs.")
        print(f"     Options: increase LAYER_1_RAW_LINES or LAYER_2_MAX_NOVELS")
else:
    print("‚ùå No layer files found. Run at least one layer first.")

In [None]:
# Print a handful of example pairs for every merged layer
print("\nüìù Sample Pairs from Each Layer:")
print("=" * 60)

for name, filepath in layer_files:
    print(f"\n{name}:")
    # Only the first 100 records of each file are considered
    with open(filepath, 'r', encoding='utf-8') as f:
        samples = [json.loads(line) for _, line in zip(range(100), f)]

    # Fixed seed so the same 5 samples are shown on every run
    random.seed(42)
    for s in random.sample(samples, min(5, len(samples))):
        ctx = s.get('left_context', '')[:12]
        print(f"  {ctx}<SEP>{s['input'][:15]} ‚Üí {s['output'][:25]}")

In [None]:
# Re-derive the reading of a few merged pairs and compare with the stored input
print("\nüîç Quality Spot-Check (random 10):")
print("-" * 60)

# Candidates are drawn from the first 200 records of the merged file
with open(FINAL_FILE, 'r', encoding='utf-8') as f:
    spot_samples = [json.loads(line) for _, line in zip(range(200), f)]

random.seed(123)
for s in random.sample(spot_samples, min(10, len(spot_samples))):
    # A pair passes when converting its output again yields its input
    status = '‚úÖ' if text_to_kana(s['output']) == s['input'] else '‚ö†Ô∏è'
    print(f"  {status} {s['input'][:20]} ‚Üí {s['output'][:25]}")

In [None]:
# Usage notes for the training notebook
_rule = "=" * 60
print("\n" + _rule)
print("üìã HOW TO USE IN TRAINING")
print(_rule)
print(f"""
Option A: Use as standalone dataset
  DATASET_FILE = "{FINAL_FILE}"

Option B: Combine with filtered zenz
  files = [
      "{OUTPUT_DIR}/../filtered_data/filtered_high_quality.jsonl",
      "{FINAL_FILE}",
  ]

Option C: Layered fine-tuning (RECOMMENDED)
  Step 1: Train on OSCAR base (layer1) ‚Äî full epochs
  Step 2: Continue on Nar≈ç (layer2) ‚Äî 3-5 epochs  
  Step 3: Fine-tune on RSS (layer3) ‚Äî 2-3 epochs
  This gives grammar + conversation + 2026 freshness!

Individual layer files:
  - {OUTPUT_DIR}/layer1_base.jsonl  (grammar base)
  - {OUTPUT_DIR}/layer2_narou.jsonl  (dialogue/conversation)
  - {OUTPUT_DIR}/layer3_rss.jsonl    (2026 vocabulary)
""")

In [None]:
# Release memory held by unreachable objects
import gc
gc.collect()

print("\nüßπ Cleanup:")
print("  Delete progress files after verifying:")
# Only progress files that actually exist are listed
_progress_files = (
    f"{OUTPUT_DIR}/layer1_base.jsonl.progress",
    f"{OUTPUT_DIR}/layer2_progress.json",
    f"{OUTPUT_DIR}/layer3_progress.json",
)
for _path in filter(os.path.exists, _progress_files):
    print(f"    {_path}")

print("\n‚úÖ Done! Dataset ready for training.")