# 🔍 Dataset Quality Filter for Japanese Keyboard IME

Filter the zenz dataset to keep high-quality, daily conversation text.

**3 Filter Layers:**
1. **Kill List** — Remove toxic/useless patterns (forum artifacts, URLs in kana, Cyrillic, etc.)
2. **Fix List** — Clean/normalize data (bad left_context, cut-off sentences)
3. **Quality Score** — Score remaining items for daily conversation relevance

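**Output:** Filtered high-quality JSONL in a `filtered_data/` folder — on Google Drive when run in Colab, under `/kaggle/working` on Kaggle, or under `./output` locally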

In [None]:
import os

# Work out where the notebook is running and pick the base output folder.
# The local defaults are assigned first and overridden for hosted platforms.
PLATFORM, DRIVE_DIR = 'Local', './output'

if 'COLAB_RELEASE_TAG' in os.environ:
    # Colab: mount Google Drive and write below MyDrive.
    PLATFORM = 'Colab'
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_DIR = '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
elif os.path.exists('/kaggle/working'):
    PLATFORM = 'Kaggle'
    DRIVE_DIR = '/kaggle/working'

# Every file produced by this notebook goes below this folder.
OUTPUT_DIR = DRIVE_DIR + "/filtered_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"‚úÖ Platform: {PLATFORM}")
print(f"üìÅ Output: {OUTPUT_DIR}")

In [None]:
# IPython shell magic: quietly (-q) install the `datasets` and `tqdm` packages imported later in this notebook.
!pip install -q datasets tqdm

In [None]:
# -----------------------------------------------------------
# Run-time settings for the filter
# -----------------------------------------------------------

# Upper bound on the number of dataset rows to process.
# None means "process everything"; set e.g. 100_000 for a quick trial run.
MAX_ITEMS = None

# Minimum quality score (0-10 scale) an item needs in order to be kept.
MIN_QUALITY_SCORE = 4

print(f"Min quality score: {MIN_QUALITY_SCORE}")
if MAX_ITEMS is None:
    print("Items limit: ALL")
else:
    print(f"Items limit: {MAX_ITEMS:,}")

## 1. Kill List — Must Remove

These patterns destroy keyboard quality. Instant reject, no scoring needed.

In [None]:
import re

# ==========================================================
# KILL LIST: Patterns that MUST be removed
# ==========================================================
# Each constant below is a compiled regex consulted by is_killed(); a single
# match anywhere in the inspected text is enough to reject the item.

# A. Forum / Scraping Artifacts
#    re.MULTILINE lets the `^`-anchored header alternatives match at the start
#    of any line, not only at the start of the whole string.
FORUM_PATTERNS = re.compile(
    r'(ID:|„Ç¢„Ç§„Éá„Ç£:|ÊäïÁ®øÊó•|ÊäïÁ®ø:|ÂêçÂâç:|ÂêçÁÑ°„Åó|'
    r'\d{2}:\d{2}:\d{2}|'           # Timestamps such as 23:06:08
    r'\d{4}/\d{1,2}/\d{1,2}|'       # Slash-separated dates such as 2010/12/15
    r'\(\w\)\s*\d{2}:\d{2}|'        # One word character in parentheses followed by HH:MM
    r'„Éä„Éû„Ç®„Éä„Ç§„É®|'                  # Anonymous username
    r'„É¨„ÇπÊï∞|„Çπ„É¨„ÉÉ„Éâ|'               # Thread/reply terms
    r'^##\s|^‚ñ†|^‚óè|^‚óÜ)',             # Markdown/structured headers at line start
    re.MULTILINE
)

# B. URLs spelled in Kana (useless for IME), plus literal URLs.
#    Matching is case-insensitive (re.IGNORECASE).
KANA_URL_PATTERNS = re.compile(
    r'(„Ç®„Ç§„ÉÅ„ÉÜ„Ç£„Éº„ÉÜ„Ç£„Éº„Éî„Éº|'       # "http" spelled out
    r'„ÉÄ„Éñ„É™„É•„ÉÄ„Éñ„É™„É•|'              # "www" spelled out
    r'„Éâ„ÉÉ„Éà„Ç≥„É†|„Éâ„ÉÉ„Éà„Ç∏„Çß„Éº„Éî„Éº|'    # ".com", ".jp" spelled out
    r'„É¶„Éº„Ç¢„Éº„É´„Ç®„É´|'                # "URL" spelled out
    r'http[s]?://|'                   # Literal http:// or https://
    r'www\.)',
    re.IGNORECASE
)

# C. Foreign-script characters treated as garbage.
#    Basic ASCII is deliberately absent from this class, so English text passes.
#    NOTE: each range below covers its whole Unicode block; nothing inside a
#    block is exempted.
GARBAGE_CHARS = re.compile(
    r'[\u0400-\u04FF'     # Cyrillic
    r'\u0370-\u03FF'      # Greek and Coptic (entire block)
    r'\u0100-\u024F'      # Latin Extended-A and Latin Extended-B
    r'\u0250-\u02AF'      # IPA Extensions
    r'\u2600-\u26FF'      # Miscellaneous Symbols block
    r']'
)

# D. ASCII art / broken encoding patterns
ASCII_ART_RE = re.compile(
    r'([‚îÄ‚îÅ‚îÇ‚îÉ‚îå‚îê‚îî‚îò‚îú‚î§‚î¨‚î¥‚îº]{3,}|'   # Runs of 3+ box-drawing characters
    r'[=\-]{5,}|'                 # Separators: 5+ of '=' or '-'
    r'[\*]{3,}|'                  # 3+ asterisks
    r'[_]{5,})',                   # 5+ underscores
)

# E. Output made up solely of punctuation, whitespace and digits (zero learning value).
#    Anchored at both ends, so the whole string must consist of these characters.
PUNCT_ONLY_RE = re.compile(r'^[„ÄÅ„ÄÇÔºüÔºÅ?!\s\d.,;:()ÔºàÔºâ„Äå„Äç„Äé„Äè„Äê„Äë„Éª]+$')


def is_killed(item):
    """Decide whether an item must be dropped outright.

    Args:
        item: Mapping with optional 'input', 'output' and 'left_context'
            entries. Missing or None values are treated as empty strings.

    Returns:
        (True, reason) when a kill rule fires, otherwise (False, None).
        Possible reasons, checked in this order (first hit wins): 'empty',
        'forum_artifact', 'kana_url', 'garbage_chars', 'ascii_art',
        'punct_only'.
    """
    inp = item.get('input', '') or ''
    out = item.get('output', '') or ''
    ctx = item.get('left_context', '') or ''

    # Whitespace-only fields count as empty: fix_item() strips its fields
    # later, so letting them through would emit a record whose input or
    # output is blank.
    if not inp.strip() or not out.strip():
        return True, 'empty'

    # Forum artifacts are searched in left_context + output as one string.
    if FORUM_PATTERNS.search(ctx + out):
        return True, 'forum_artifact'

    # URLs (spelled in kana or literal) in either the reading or the output.
    if KANA_URL_PATTERNS.search(inp) or KANA_URL_PATTERNS.search(out):
        return True, 'kana_url'

    # The remaining rules inspect the output only. Basic ASCII is not part of
    # GARBAGE_CHARS, so English words are not rejected here.
    if GARBAGE_CHARS.search(out):
        return True, 'garbage_chars'

    if ASCII_ART_RE.search(out):
        return True, 'ascii_art'

    if PUNCT_ONLY_RE.match(out):
        return True, 'punct_only'

    return False, None


# Smoke test: run a few hand-written items through is_killed() and print each verdict.
print("üß™ Kill List test:")
kill_tests = [
    {'input': '„Éä„Éû„Ç®', 'output': 'ÂêçÂâç: „ÉÜ„Çπ„Éà ID:123', 'left_context': ''},
    {'input': '„Ç®„Ç§„ÉÅ„ÉÜ„Ç£„Éº„ÉÜ„Ç£„Éº„Éî„Éº„Ç®„Çπ', 'output': 'https://example.com', 'left_context': ''},
    {'input': '„ÉÜ„Çπ„Éà', 'output': '–Ö–ú–ê–Ø', 'left_context': ''},
    {'input': '„ÉÜ„É≥', 'output': '„ÄÅ', 'left_context': ''},
    {'input': '„Åç„Çá„ÅÜ', 'output': '‰ªäÊó•„ÅØ„ÅÑ„ÅÑÂ§©Ê∞ó„Åß„Åô„Å≠', 'left_context': ''},
    {'input': '„Ç¢„Éó„É™', 'output': 'app„Çí‰Ωø„Å£„Å¶', 'left_context': ''},  # English must be kept
]
for sample in kill_tests:
    verdict = is_killed(sample)
    if verdict[0]:
        status = f'‚ùå KILL ({verdict[1]})'
    else:
        status = '‚úÖ KEEP'
    print(f"  {status}: {sample['input']} ‚Üí {sample['output'][:30]}")

## 2. Fix List — Clean & Normalize

Don't delete these — fix them to make the IME smarter.

In [None]:
# ==========================================================
# FIX LIST: Clean/normalize data
# ==========================================================

# left_context values matching this pattern are blanked by fix_item().
# re.IGNORECASE is wanted for the ID:/http alternatives, but it is switched
# off locally with (?-i:...) for the all-caps rule; otherwise any context made
# of five or more plain lowercase letters would be treated as a "code" too.
BAD_CONTEXT_PATTERNS = re.compile(
    r'(^ID:|^„Ç¢„Ç§„Éá„Ç£:|^ÊäïÁ®øÊó•|^ÂêçÂâç:|'
    r'^\d{4}[/\-]\d{1,2}[/\-]\d{1,2}|'   # Context beginning with a date (2023/10/01)
    r'^\d{2}:\d{2}:\d{2}|'                  # Context beginning with a time
    r'http[s]?://|'                          # URL anywhere in the context
    r'^(?-i:[A-Z]{5,})$)',                   # Whole context is an all-caps code
    re.IGNORECASE
)

# Particles that can't start a sentence without context
STARTING_PARTICLES = {'„Å¶', '„Å´', '„Çí', '„ÅØ', '„Åå', '„Åß', '„Å®', '„ÇÇ',
                       '„Å∏', '„Åã„Çâ', '„Åæ„Åß', '„Çà„Çä', '„ÅÆ„Åß', '„ÅÆ„Å´',
                       '„Åë„Å©', '„Åë„Çå„Å©', '„Å™„Åå„Çâ'}


def fix_item(item):
    """Return a cleaned copy of *item*; the mapping passed in is not modified.

    Args:
        item: Mapping with optional 'input', 'output' and 'left_context'
            entries. Missing or None values are treated as empty strings.

    Returns:
        A new dict with stripped 'input', 'output' and 'left_context', plus:
          * '_fixed' = 'bad_context_cleared' when the context matched
            BAD_CONTEXT_PATTERNS and was replaced by ''.
          * '_cutoff' = True when there is no context and the output starts
            with one of STARTING_PARTICLES (a fragment cut mid-sentence).
    """
    # Strip first, so the checks below see exactly the text that is returned.
    # Otherwise leading spaces would hide both a bad context and a leading particle.
    result = {
        'input': (item.get('input', '') or '').strip(),
        'output': (item.get('output', '') or '').strip(),
        'left_context': (item.get('left_context', '') or '').strip(),
    }

    # Fix 1: a context that is only forum/date/URL noise is blanked.
    if result['left_context'] and BAD_CONTEXT_PATTERNS.search(result['left_context']):
        result['left_context'] = ''
        result['_fixed'] = 'bad_context_cleared'

    # Fix 2: flag outputs that begin with a particle while no context is left.
    # This deliberately runs after Fix 1, so a cleared context counts as missing.
    # The test is on the whole output, so particles of any length are recognised.
    if not result['left_context'] and result['output'].startswith(tuple(STARTING_PARTICLES)):
        result['_cutoff'] = True

    return result


# Smoke test: show the context before/after fix_item() and any flags it set.
print("üß™ Fix List test:")
fix_tests = [
    {'input': '„Ç¢„É°', 'output': 'Èõ®„Å†„Å£„Åü', 'left_context': '2023/10/01'},
    {'input': '„Ç±„Ç§„Çæ„ÇØ', 'output': '„Å¶Á∂ôÁ∂ö„Åó„Å¶', 'left_context': ''},
    {'input': '„Ç¢„É°', 'output': 'Èõ®„Å†„Å£„Åü', 'left_context': 'Êò®Êó•„ÅØ'},
]
for case in fix_tests:
    fixed = fix_item(case)
    ctx_before = case['left_context'] if case['left_context'] else '(none)'
    ctx_after = fixed['left_context'] if fixed['left_context'] else '(none)'
    flags = []
    if fixed.get('_fixed'):
        flags.append(f"üîß {fixed['_fixed']}")
    if fixed.get('_cutoff'):
        flags.append('‚ö†Ô∏è cutoff')
    if flags:
        flag_str = ' '.join(flags)
    else:
        flag_str = '‚úÖ OK'
    print(f"  {flag_str}: ctx [{ctx_before}] ‚Üí [{ctx_after}] | {fixed['output'][:20]}")

## 3. Quality Score — Daily Conversation Relevance

Score 0-10 based on how useful this item is for keyboard IME.

In [None]:
from collections import Counter

# ==========================================================
# Daily vocabulary (for positive scoring)
# ==========================================================
# score_quality() tests every entry of these sets with a plain substring check
# (`entry in text`) against left_context + output. Short entries therefore
# match inside longer, unrelated words as well.

# Common daily verbs, listed as dictionary forms and stems so that conjugated
# forms match too. The last two rows hold everyday adjectives/descriptive words.
DAILY_VERBS = {
    '„Åô„Çã', '„Åó', '„Åï', '„ÅÑ„Çã', '„ÅÑ', '„ÅÇ„Çã', '„ÅÇ„Å£', '„Å™„Çã', '„Å™„Å£',
    '„Åß„Åç„Çã', '„Åß„Åç', '„ÇÑ„Çã', '„ÇÑ„Å£',
    'Ë°å„Åè', 'Ë°å„Åç', 'Ë°å„Å£', 'Êù•„Çã', 'Êù•„Åü', 'Â∏∞„Çã', 'Â∏∞„Å£',
    'Ë¶ã„Çã', 'Ë¶ã„Åü', 'Ë¶ã„Åà', 'ËÅû„Åè', 'ËÅû„ÅÑ', 'Ë™≠„ÇÄ', 'Ë™≠„Çì',
    'È£ü„Åπ', 'È£≤„ÇÄ', 'È£≤„Çì', 'Ë≤∑„ÅÜ', 'Ë≤∑„Å£', '‰Ωú„Çã', '‰Ωú„Å£',
    '‰Ωø„ÅÜ', '‰Ωø„Å£', 'ÊåÅ„Å§', 'ÊåÅ„Å£', 'ÂÖ•„Çã', 'ÂÖ•„Å£', 'Âá∫„Çã', 'Âá∫„Åó',
    'ÊÄù„ÅÜ', 'ÊÄù„Å£', 'ËÄÉ„Åà', 'Áü•„Çã', 'Áü•„Å£', 'ÂàÜ„Åã„Çã', 'ÂàÜ„Åã„Å£',
    'Ë®Ä„ÅÜ', 'Ë®Ä„Å£', 'Ë©±„Åô', 'Ë©±„Åó', 'Êõ∏„Åè', 'Êõ∏„ÅÑ', 'Êïô„Åà',
    '‰Ωè„ÇÄ', '‰Ωè„Çì', 'ÂÉç„Åè', 'ÂÉç„ÅÑ', 'Âßã„ÇÅ', 'ÁµÇ„Çè',
    'ÂæÖ„Å§', 'ÂæÖ„Å£', 'ÈÄÅ„Çã', 'ÈÄÅ„Å£', 'Âèó„Åë', 'Âèñ„Çã', 'Âèñ„Å£',
    'ÈÅä„Å∂', 'ÈÅä„Çì', 'Ëµ∞„Çã', 'Ëµ∞„Å£', 'Ê≠©„Åè', 'Ê≠©„ÅÑ',
    'Ëµ∑„Åç', 'ÂØù„Çã', 'ÂØù„Åü', 'Èñã„Åè', 'Èñã„ÅÑ', 'Èñâ„ÇÅ',
    'Â§â„Çè„Çã', 'Â§â„Çè„Å£', 'Ê±∫„ÇÅ', 'ÈÅ∏„Å∂', 'ÈÅ∏„Çì',
    '‰ºö„ÅÜ', '‰ºö„Å£', 'Âëº„Å∂', 'Âëº„Çì', 'Êâï„ÅÜ', 'Êâï„Å£',
    'Á´ã„Å§', 'Á´ã„Å£', 'Â∫ß„Çã', 'Â∫ß„Å£', 'ÁΩÆ„Åè', 'ÁΩÆ„ÅÑ',
    'Â•Ω„Åç', 'Â´å„ÅÑ', 'Ê¨≤„Åó„ÅÑ', 'Ê•Ω„Åó„ÅÑ', 'Â¨â„Åó„ÅÑ',
    'ÂøÖË¶Å', 'Â§ßÂàá', 'Â§ß‰∫ã', 'Á∞°Âçò', 'Èõ£„Åó„ÅÑ',
}

# Common daily nouns. score_quality() grants its noun bonus only when at least
# two different entries occur in the text.
DAILY_NOUNS = {
    '‰∫∫', '‰∫ã', 'Áâ©', 'ÊâÄ', 'Êñπ', 'ÊôÇ', 'Êó•', 'Âπ¥', 'Êúà', 'Ââç',
    'Âæå', '‰∏≠', '‰∏ä', '‰∏ã', 'Èñì', 'Ê∞ó', 'ÁõÆ', 'Êâã', 'È†≠', 'ÂøÉ',
    'ÂêçÂâç', '‰ªï‰∫ã', 'Â≠¶Ê†°', '‰ºöÁ§æ', 'ÂÆ∂', 'ÈÉ®Â±ã', 'ÈõªËªä', 'ÈßÖ',
    'ÂèãÈÅî', 'ÂÆ∂Êóè', 'Â≠ê‰æõ', 'ÂÖàÁîü', 'Â≠¶Áîü', 'ÂΩº', 'ÂΩºÂ•≥', 'Ëá™ÂàÜ',
    'Êúù', 'Êòº', 'Â§ú', '‰ªäÊó•', 'ÊòéÊó•', 'Êò®Êó•', '‰ªä', 'ÈÄ±Êú´',
    'Â§©Ê∞ó', 'Èõ®', 'Èõ™', 'Êò•', 'Â§è', 'Áßã', 'ÂÜ¨',
    'È£ü‰∫ã', 'ÊñôÁêÜ', 'Ê∞¥', '„ÅäËå∂', '„Ç≥„Éº„Éí„Éº', '„ÅîÈ£Ø',
    'ÈõªË©±', '„É°„Éº„É´', 'ÂÜôÁúü', 'Êò†Áîª', 'Èü≥Ê•Ω', 'Êú¨', '„Ç≤„Éº„É†',
    'Â∫ó', 'ÁóÖÈô¢', 'ÂÖ¨Âúí', 'ÈÅì', 'Áî∫', 'ÂõΩ', 'Â†¥ÊâÄ',
    'ÂïèÈ°å', 'Ë≥™Âïè', 'Á≠î„Åà', 'ÊÑèÂë≥', 'ÁêÜÁî±', 'ÁµêÊûú',
    'Ë©±', 'Ë®ÄËëâ', 'Â£∞', '‰Ωì', 'È°î', 'Âè£',
    'Ëªä', 'Ëä±', 'Áå´', 'Áä¨', 'Á©∫', 'Êµ∑', 'Â±±',
    '„ÅäÈáë', 'ÊôÇÈñì', 'Ë∂£Âë≥', 'ÁµåÈ®ì', 'Ê∞óÊåÅ„Å°', 'ÁîüÊ¥ª',
    '„Çπ„Éû„Éõ', '„Ç¢„Éó„É™', '„Éó„É≠„Ç∞„É©„É†', '„Éá„Éº„Çø', '„Çµ„Ç§„Éà',
}

# Polite/conversational markers. One occurrence of any entry is enough for the
# bonus in score_quality().
POLITE_MARKERS = {
    '„Åß„Åô', '„Åæ„Åô', '„Åß„Åó„Åü', '„Åæ„Åó„Åü', '„Åæ„Åõ„Çì', '„Åè„Å†„Åï„ÅÑ',
    '„Åß„Åó„Çá„ÅÜ', '„Åæ„Åó„Çá„ÅÜ', '„Åß„Åô„Åã', '„Åæ„Åô„Åã',
    '„Å†', '„Å†„Å£„Åü', '„Å†„Çç„ÅÜ', '„Åã„Å™', '„Çà„Å≠', '„Å≠', '„Çà',
    '„Åë„Å©', '„Åë„Çå„Å©', '„ÅÆ„Åß', '„Åã„Çâ', '„Åü„Çâ', '„Å¶„ÇÇ',
    '„Å™„Çä„Åæ„Åó„Åü', '„ÅÇ„Çä„Åå„Å®„ÅÜ', '„Åô„Åø„Åæ„Åõ„Çì', '„ÅäÈ°ò„ÅÑ',
}

# Encyclopedia-style phrasing. A match anywhere in left_context + output
# lowers the score in score_quality(); it does not reject the item by itself.
ENCYCLOPEDIA_PATTERNS = re.compile(
    r'(„Å´‰ΩçÁΩÆ„Åô„Çã|„Å´ÊâÄÂú®„Åô„Çã|'          # Location descriptions
    r'ÂåóÁ∑Ø\d+|ÂçóÁ∑Ø\d+|Êù±Áµå\d+|Ë•øÁµå\d+|'  # Coordinates (term followed by digits)
    r'ISBN|ISSN|'                        # Book/serial codes (case-sensitive)
    r'Á¥ÄÂÖÉÂâç\d|'                         # BC dates
    r'‰∫∫Âè£„ÅØÁ¥Ñ?\d|Èù¢Á©ç„ÅØÁ¥Ñ?\d|'          # Population/area stats
    r'Ê®ôÈ´ò\d|Êµ∑Êäú\d|'                    # Elevation
    r'Â≠¶Âêç|ÂàÜÈ°ûÂ≠¶|'                      # Scientific taxonomy
    r'Á¨¨\d+‰ª£|Á¨¨\d+Âõû|'                  # Ordinal titles
    r'Êù°Á¥Ñ|ÂãÖ‰ª§|Ê≥ï‰ª§)',                   # Legal terms
)

# One match per maximal run of digits; score_quality() counts the runs to
# estimate number density.
NUMBER_RE = re.compile(r'\d+')


def score_quality(item):
    """Score an item from 0 to 10 for daily-conversation relevance.

    The score starts at 5, penalties and bonuses are applied, and the result
    is clamped to the 0-10 range. Vocabulary checks are plain substring tests
    against left_context + output.

    Args:
        item: Mapping with 'left_context' and 'output' (missing or None values
            are treated as ''), plus the optional '_cutoff' flag set by
            fix_item(). The item should already have gone through fix_item().

    Returns:
        (score, reasons): the clamped integer score and a list of labels, one
        for every rule that fired, in evaluation order.
    """
    ctx = item.get('left_context', '') or ''
    out = item.get('output', '') or ''
    full_text = ctx + out

    score = 5  # Start neutral
    reasons = []

    # --- NEGATIVE SIGNALS ---

    # Cut-off fragment (particle start without context)
    if item.get('_cutoff'):
        score -= 2
        reasons.append('cutoff_fragment')

    # Very long text (likely encyclopedia paragraphs)
    if len(full_text) > 140:
        score -= 1
        reasons.append('very_long')

    # Encyclopedia patterns
    if ENCYCLOPEDIA_PATTERNS.search(full_text):
        score -= 2
        reasons.append('encyclopedia')

    # Number density: 3+ digit runs cost 2 points, exactly 2 runs cost 1 point
    numbers = NUMBER_RE.findall(full_text)
    if len(numbers) >= 3:
        score -= 2
        reasons.append(f'many_numbers({len(numbers)})')
    elif len(numbers) >= 2:
        score -= 1
        reasons.append(f'numbers({len(numbers)})')

    # A run of 5+ consecutive digits (serial codes, etc.)
    if re.search(r'\d{5,}', full_text):
        score -= 2
        reasons.append('long_number')

    # Many opening brackets (technical notation)
    brackets = sum(full_text.count(c) for c in '(Ôºà[„Äê')
    if brackets >= 3:
        score -= 1
        reasons.append(f'brackets({brackets})')

    # --- POSITIVE SIGNALS ---

    # Daily verbs: +1 for one distinct entry, +2 for two or more
    daily_verb_count = sum(1 for v in DAILY_VERBS if v in full_text)
    if daily_verb_count >= 2:
        score += 2
        reasons.append(f'daily_verbs({daily_verb_count})')
    elif daily_verb_count >= 1:
        score += 1
        reasons.append('daily_verb')

    # Daily nouns: bonus only when two or more distinct entries occur
    daily_noun_count = sum(1 for n in DAILY_NOUNS if n in full_text)
    if daily_noun_count >= 2:
        score += 1
        reasons.append(f'daily_nouns({daily_noun_count})')

    # Polite/conversational markers
    polite_count = sum(1 for p in POLITE_MARKERS if p in full_text)
    if polite_count >= 1:
        score += 1
        reasons.append(f'polite({polite_count})')

    # Natural sentence ending (last character is one of the listed endings)
    if full_text and full_text[-1] in '„ÄÇÔºÅÔºü„Å≠„ÄÅ„Çà„Åã':
        score += 1
        reasons.append('natural_end')

    # Good output length (natural phrase)
    if 3 <= len(out) <= 50:
        score += 1
        reasons.append('good_out_len')

    # Has meaningful left_context (provides training signal)
    if len(ctx) >= 2:
        score += 1
        reasons.append('has_context')

    # Mixed script: hiragana together with kanji. Katakana and Latin text are
    # not part of this check.
    has_hiragana = bool(re.search(r'[\u3040-\u309F]', full_text))
    has_kanji = bool(re.search(r'[\u4E00-\u9FFF]', full_text))
    if has_hiragana and has_kanji:
        score += 1
        reasons.append('mixed_script')

    # Clamp to 0-10
    score = max(0, min(10, score))

    return score, reasons


# Smoke test: score a few hand-written items and show which rules fired.
print("üß™ Quality Score test:")
score_tests = [
    {'input': '„Åç„Çá„ÅÜ', 'output': '‰ªäÊó•„ÅØ„ÅÑ„ÅÑÂ§©Ê∞ó„Åß„Åô„Å≠', 'left_context': ''},
    {'input': '„Åª„Åè„ÅÑ', 'output': 'ÂåóÁ∑Ø35Â∫¶12ÂàÜ„Å´‰ΩçÁΩÆ„Åô„Çã', 'left_context': ''},
    {'input': '„ÅÑ„Å£„Åó„Çá', 'output': '‰∏ÄÁ∑í„Å´Êò†Áîª„ÇíË¶ã„Å´Ë°å„Åç„Åæ„Åó„Åü', 'left_context': 'ÂèãÈÅî„Å®'},
    {'input': '„Åó', 'output': '„Å¶Á∂ôÁ∂ö„Åó„Å¶', 'left_context': '', '_cutoff': True},
    {'input': '„Ç¢„Éó„É™', 'output': 'app„Çí‰Ωø„Å£„Å¶Ë¶ã„Çã', 'left_context': '„Çπ„Éû„Éõ„ÅÆ'},
    {'input': '„Å†„ÅÑ', 'output': 'Á¨¨35‰ª£Â§ßÁµ±È†ò„ÅÆ', 'left_context': ''},
    {'input': '„ÅÇ„Å§', 'output': 'Êöë„ÅÑ', 'left_context': '‰ªäÊó•„ÅØ„Å®„Å¶„ÇÇ'},  # A short output is fine
]
for case in score_tests:
    s, r = score_quality(case)
    if s >= MIN_QUALITY_SCORE:
        status = '‚úÖ'
    else:
        status = '‚ùå'
    # Only the first 10 characters of the context are shown
    ctx = case.get('left_context', '')[:10]
    print(f"  {status} Score {s:2d}/10 | {ctx}<SEP>{case['input']} ‚Üí {case['output'][:25]}")
    print(f"              Reasons: {', '.join(r)}")

## 4. Load Dataset & Process

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import json

# Load the train_wikipedia.jsonl file of the zenz v2.5 dataset as the "train" split
print("üì• Loading zenz dataset...")
dataset = load_dataset("Miwa-Keita/zenz-v2.5-dataset", data_files="train_wikipedia.jsonl", split="train")

# Row count, used later as the upper bound of the processing loop
total = len(dataset)
print(f"‚úì Loaded {total:,} items")

In [None]:
# ============================================================
# Full pipeline: Kill -> Fix -> Score -> Save
# ============================================================

FILTERED_FILE = f"{OUTPUT_DIR}/filtered_high_quality.jsonl"
KILLED_FILE = f"{OUTPUT_DIR}/killed_samples.jsonl"  # Sample of removed items
PROGRESS_FILE = f"{OUTPUT_DIR}/filter_progress.json"

# Running statistics. 'total' counts items handled in THIS session only; the
# other entries are restored from the checkpoint when resuming.
stats = {
    'total': 0,
    'killed': Counter(),       # kill reason -> count
    'score_dist': Counter(),   # score -> count
    'kept': 0,
    'fixed': Counter(),        # fix type -> count
}


def _save_progress(processed, output_bytes):
    """Write the resume checkpoint: items processed, stats, and output file size in bytes."""
    with open(PROGRESS_FILE, 'w') as pf:
        json.dump({
            'processed': processed,
            'kept': stats['kept'],
            'killed': dict(stats['killed']),
            'score_dist': dict(stats['score_dist']),
            'fixed': dict(stats['fixed']),
            'output_bytes': output_bytes,
        }, pf)


# Resume support
start_from = 0
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, 'r') as f:
        progress = json.load(f)
    start_from = progress.get('processed', 0)
    stats['kept'] = progress.get('kept', 0)
    stats['killed'] = Counter(progress.get('killed', {}))
    # JSON object keys are strings, so scores are converted back to int
    stats['score_dist'] = Counter({int(k): v for k, v in progress.get('score_dist', {}).items()})
    stats['fixed'] = Counter(progress.get('fixed', {}))
    print(f"üìÇ Resuming from item {start_from:,}")

    # Rows written after the last checkpoint would be processed and appended a
    # second time, so cut the output back to the size recorded in the
    # checkpoint. Checkpoints written without 'output_bytes' are left alone.
    saved_bytes = progress.get('output_bytes')
    if (start_from > 0 and saved_bytes is not None
            and os.path.exists(FILTERED_FILE)
            and os.path.getsize(FILTERED_FILE) > saved_bytes):
        with open(FILTERED_FILE, 'r+b') as tf:
            tf.truncate(saved_bytes)

limit = min(total, MAX_ITEMS) if MAX_ITEMS else total

# Keep some killed samples for inspection (current session only)
killed_samples = []
MAX_KILLED_SAMPLES = 100

# Process: append when resuming, otherwise start a fresh output file
mode = 'a' if start_from > 0 else 'w'
with open(FILTERED_FILE, mode, encoding='utf-8') as out_f:
    for idx in tqdm(range(start_from, limit), desc="Filtering", total=max(limit - start_from, 0)):
        item = dataset[idx]
        stats['total'] += 1

        # Step 1: KILL LIST - instant reject
        killed, kill_reason = is_killed(item)
        if killed:
            stats['killed'][kill_reason] += 1
            if len(killed_samples) < MAX_KILLED_SAMPLES:
                killed_samples.append({
                    'reason': kill_reason,
                    # `or ''` guards against None, which is exactly what the
                    # 'empty' kill reason can carry
                    'input': (item.get('input', '') or '')[:30],
                    'output': (item.get('output', '') or '')[:50],
                })
            continue

        # Step 2: FIX LIST - clean/normalize
        fixed = fix_item(item)
        if fixed.get('_fixed'):
            stats['fixed'][fixed['_fixed']] += 1
        if fixed.get('_cutoff'):
            stats['fixed']['cutoff'] += 1

        # Step 3: QUALITY SCORE
        score, reasons = score_quality(fixed)
        stats['score_dist'][score] += 1

        if score >= MIN_QUALITY_SCORE:
            out_item = {
                'left_context': fixed['left_context'],
                'input': fixed['input'],
                'output': fixed['output'],
                'score': score,
            }
            out_f.write(json.dumps(out_item, ensure_ascii=False) + '\n')
            stats['kept'] += 1

        # Checkpoint every 100K items. Flush BEFORE writing the checkpoint so
        # the checkpoint never claims rows that are still sitting in the buffer.
        if stats['total'] % 100_000 == 0:
            out_f.flush()
            _save_progress(start_from + stats['total'], os.path.getsize(FILTERED_FILE))

total_processed = start_from + stats['total']
total_killed = sum(stats['killed'].values())

# Save final progress (the output file is closed, so its size is final)
_save_progress(total_processed, os.path.getsize(FILTERED_FILE))

# Save killed samples
with open(KILLED_FILE, 'w', encoding='utf-8') as f:
    for s in killed_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')

# Avoid ZeroDivisionError when nothing was processed (e.g. an empty dataset)
kept_pct = stats['kept'] / total_processed * 100 if total_processed else 0.0

print(f"\n‚úì Processed: {total_processed:,}")
print(f"üóëÔ∏è Killed: {total_killed:,}")
print(f"‚úÖ Kept: {stats['kept']:,} ({kept_pct:.1f}%)")
print(f"üíæ Saved: {FILTERED_FILE}")

## 5. Results & Statistics

In [None]:
# Totals across all sessions: start_from covers earlier (resumed) runs,
# stats['total'] covers the current one.
total_processed = start_from + stats['total']
total_killed = sum(stats['killed'].values())
total_scored = total_processed - total_killed

# Denominators are floored at 1 so that an empty run prints 0.0% everywhere
# instead of raising ZeroDivisionError.
processed_denom = max(total_processed, 1)
scored_denom = max(total_scored, 1)

print("="*60)
print("üìä FILTERING REPORT")
print("="*60)

# Kill List breakdown (percentages are relative to all processed items)
print(f"\nüóëÔ∏è KILL LIST ({total_killed:,} removed):")
print("-" * 40)
for reason, count in stats['killed'].most_common():
    pct = count / processed_denom * 100
    print(f"  {reason:<20} {count:>8,} ({pct:.1f}%)")

# Fix List breakdown
total_fixed = sum(stats['fixed'].values())
print(f"\nüîß FIX LIST ({total_fixed:,} items cleaned):")
print("-" * 40)
for fix_type, count in stats['fixed'].most_common():
    print(f"  {fix_type:<20} {count:>8,}")

# Score distribution (percentages are relative to non-killed items)
print(f"\nüìà SCORE DISTRIBUTION (of {total_scored:,} non-killed items):")
print("-" * 55)
for score in sorted(stats['score_dist'].keys()):
    count = stats['score_dist'][score]
    pct = count / scored_denom * 100
    bar = '‚ñà' * int(pct / 2)  # one bar character per 2 percentage points
    status = '‚úÖ KEEP' if score >= MIN_QUALITY_SCORE else '‚ùå DROP'
    print(f"  Score {score:2d}: {count:>8,} ({pct:5.1f}%) {bar} {status}")

# Summary
print(f"\n{'='*60}")
print(f"üìã SUMMARY")
print(f"  Original:     {total_processed:>10,}")
print(f"  Killed:       {total_killed:>10,} ({total_killed/processed_denom*100:.1f}%)")
print(f"  Score < {MIN_QUALITY_SCORE}:    {total_scored - stats['kept']:>10,}")
print(f"  ‚úÖ Kept:      {stats['kept']:>10,} ({stats['kept']/processed_denom*100:.1f}%)")

file_size = os.path.getsize(FILTERED_FILE)
print(f"\n  File: {FILTERED_FILE}")
print(f"  Size: {file_size / (1024*1024):.1f} MB")

In [None]:
# Print up to three killed examples per kill reason so the rules can be spot-checked
print("\nüóëÔ∏è Sample KILLED items (verify these should be removed):")
print("-" * 60)

# reason -> first three samples seen with that reason
by_reason = {}
for sample in killed_samples:
    bucket = by_reason.setdefault(sample['reason'], [])
    if len(bucket) < 3:
        bucket.append(sample)

for reason in by_reason:
    print(f"\n  [{reason}]")
    for sample in by_reason[reason]:
        print(f"    {sample['input']} ‚Üí {sample['output'][:40]}")

In [None]:
# Display the first kept rows of the filtered file for a manual quality check
print("\n‚úÖ Sample KEPT items (verify these are good quality):")
print("-" * 60)

# Read at most the first 30 rows of the output file ...
kept_samples = []
with open(FILTERED_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        if len(kept_samples) >= 30:
            break
        kept_samples.append(json.loads(line))

# ... and print the first 15 of them
for row in kept_samples[:15]:
    ctx = row['left_context'][:12]
    print(f"  s={row['score']} | {ctx}<SEP>{row['input'][:10]} ‚Üí {row['output'][:30]}")

In [None]:
# Print a copy-paste snippet that loads the filtered file in a training notebook.
# Doubled braces in the f-string are literal braces in the printed snippet.
print()
print("=" * 60)
print("üìã HOW TO USE IN TRAINING NOTEBOOKS")
print("=" * 60)
print(f"""
Replace dataset loading with:

  import json
  
  FILTERED_FILE = "{FILTERED_FILE}"
  
  dataset = []
  with open(FILTERED_FILE, 'r', encoding='utf-8') as f:
      for line in f:
          dataset.append(json.loads(line))
  
  print(f"Loaded {{len(dataset):,}} filtered items")
  
  # Same fields as before:
  for item in dataset:
      left_ctx = item['left_context']
      kana = item['input']
      kanji = item['output']
""")