# 📦 Build Custom Japanese IME Dataset

Create a high-quality conversational dataset for keyboard IME training.

**Pipeline:**
1. Load text from free sources (CC-100, OSCAR, mC4, or custom text)
2. Extract conversational sentences (dialogue in `「」`, casual speech)
3. Reverse-kana pipeline: `kanji text → kana reading` via SudachiPy
4. Data augmentation: slice into NWP + KKC training pairs
5. Quality filter + save to Drive as JSONL

**Output format (same as zenz):**
```json
{"left_context": "前の文", "input": "カナ", "output": "漢字"}
```

In [None]:
import os

# Auto-detect platform
# Chooses the base directory for all outputs from where the notebook runs.
# The checks are ordered: Colab first, then Kaggle, then a local fallback.
if 'COLAB_RELEASE_TAG' in os.environ:
    # Presence of this environment variable is treated as "running on Colab".
    PLATFORM = 'Colab'
    # Imported lazily: google.colab is only needed (and imported) on this branch.
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_DIR = '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
elif os.path.exists('/kaggle/working'):
    # Existence of this directory is treated as "running on Kaggle".
    PLATFORM = 'Kaggle'
    DRIVE_DIR = '/kaggle/working'
else:
    PLATFORM = 'Local'
    DRIVE_DIR = './output'

# Every later cell writes below OUTPUT_DIR; create it up front.
OUTPUT_DIR = f"{DRIVE_DIR}/datasets"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"‚úÖ Platform: {PLATFORM}")
print(f"üìÅ Output: {OUTPUT_DIR}")

In [None]:
# SudachiPy with full dictionary (best for modern words + readings)
!pip install -q datasets tqdm sudachipy sudachidict_full

In [None]:
# ===========================================================
# CONFIGURATION
# ===========================================================
# All tunables for the notebook live in this cell; later cells read them
# as module-level globals.

# --- Data Source ---
# Options: 'cc100', 'oscar', 'mc4', 'custom'
# cc100: Cleaned web text (good grammar base)
# oscar: Web crawl (diverse, noisy)
# mc4:   Cleaned multilingual C4 (high quality)
# custom: Your own text lines (paste or load from file)
DATA_SOURCE = 'cc100'

# --- Processing limits ---
MAX_RAW_LINES = 500_000     # Max text blocks collected from the source (passed to load_data_source)
MAX_OUTPUT_PAIRS = 2_000_000 # Pair budget; checked once per raw block, so the final count can overshoot slightly

# --- Quality filters ---
MIN_SENTENCE_LEN = 5    # Min chars per sentence (also the min length of a raw block kept by the loader)
MAX_SENTENCE_LEN = 120  # Max chars (skip long paragraphs)
PREFER_DIALOGUE = True  # NOTE(review): not referenced by any other visible cell -- confirm whether it is still needed

# --- Augmentation ---
# Create sliced pairs for NWP (Next Word Prediction)
ENABLE_SLICING = True   # Slice sentences into sub-pairs
MIN_SLICE_WORDS = 2     # Sentences with fewer morphemes than this yield no pairs when slicing is enabled

print(f"Source: {DATA_SOURCE}")
print(f"Max lines: {MAX_RAW_LINES:,}")
print(f"Max output pairs: {MAX_OUTPUT_PAIRS:,}")

## 1. Reverse-Kana Pipeline (SudachiPy)

Converts kanji/mixed text → katakana readings.

`2026年の新作ゲームは最高だ。` → `ニセンニジュウロクネンノシンサクゲームハサイコウダ。`

In [None]:
from sudachipy import tokenizer as sudachi_tokenizer, dictionary as sudachi_dictionary
import re
import json

# Setup SudachiPy with FULL dictionary
# One dictionary and one tokenizer are built here and shared by every
# function below through the module-level names `sudachi` and `SPLIT_MODE`.
# Mode C = longest possible units (best for IME: treats compound words as single unit)
sudachi_dict = sudachi_dictionary.Dictionary(dict="full")
sudachi = sudachi_dict.create()
SPLIT_MODE = sudachi_tokenizer.Tokenizer.SplitMode.C


def text_to_morphemes(text):
    """Tokenize *text* and pair every token with its reading.

    Uses the module-level ``sudachi`` tokenizer with ``SPLIT_MODE``.

    Returns:
        A list of ``(surface, reading, pos)`` tuples in token order.
        ``reading`` is the token's reading form, or the surface string
        when the tokenizer reports an empty reading. ``pos`` is the first
        element of the token's part-of-speech tuple.
    """
    def describe(token):
        surface = token.surface()
        # With no reading available there is nothing to convert from, so
        # the surface text stands in for it.
        reading = token.reading_form() or surface
        return surface, reading, token.part_of_speech()[0]

    return [describe(token) for token in sudachi.tokenize(text, SPLIT_MODE)]


def text_to_kana(text):
    """Return the reading of *text*: all token readings joined in order."""
    readings = [morpheme[1] for morpheme in text_to_morphemes(text)]
    return ''.join(readings)


def generate_full_pair(text, left_context=''):
    """Build one whole-sentence training pair.

    Args:
        text: Sentence used verbatim as the pair's ``output``.
        left_context: Preceding text stored verbatim in the pair.

    Returns:
        A dict with keys ``left_context``, ``input`` (the reading of
        *text*) and ``output`` (*text*), or ``None`` when the reading is
        empty or identical to *text* (nothing to convert).
    """
    reading = text_to_kana(text)
    needs_conversion = bool(reading) and reading != text
    if needs_conversion:
        return dict(left_context=left_context, input=reading, output=text)
    return None


def generate_sliced_pairs(text, left_context=''):
    """Generate a full-sentence pair plus short sliced pairs from one sentence.

    The sentence is tokenized once. A pair for the whole sentence comes
    first, followed by pairs for every window of 1, 2 and 3 consecutive
    morphemes (all 1-morpheme windows, then all 2-morpheme windows, ...).
    For a window starting at morpheme ``i`` the context is ``left_context``
    plus the surface text of morphemes ``0..i-1``, trimmed to its last
    60 characters.

    Args:
        text: Sentence to slice.
        left_context: Text preceding the sentence; prepended to each
            slice's context and used as-is for the full-sentence pair.

    Returns:
        A list of dicts with keys ``left_context``, ``input`` (reading) and
        ``output`` (surface text). Empty when the sentence has fewer than
        ``MIN_SLICE_WORDS`` morphemes. Windows whose reading equals their
        surface, or that consist only of punctuation/whitespace, are skipped.
    """
    morphemes = text_to_morphemes(text)
    n_morphemes = len(morphemes)
    if n_morphemes < MIN_SLICE_WORDS:
        return []

    surfaces = [surface for surface, _, _ in morphemes]
    readings = [reading for _, reading, _ in morphemes]

    pairs = []

    # Full sentence pair
    full_kana = ''.join(readings)
    full_text = ''.join(surfaces)
    if full_kana != full_text:
        pairs.append({
            'left_context': left_context,
            'input': full_kana,
            'output': full_text
        })

    # contexts[i] is the text preceding morpheme i; built once instead of
    # re-joining the prefix for every window.
    contexts = [left_context]
    for surface in surfaces[:-1]:
        contexts.append(contexts[-1] + surface)

    # Sliced pairs at word boundaries
    for slice_size in (1, 2, 3):
        if slice_size >= n_morphemes:
            # A window spanning every morpheme would repeat the
            # full-sentence pair above and write a duplicate row.
            break
        for i in range(n_morphemes - slice_size + 1):
            slice_kana = ''.join(readings[i:i + slice_size])
            slice_text = ''.join(surfaces[i:i + slice_size])

            # Skip if kana == text (no conversion, e.g. pure katakana word)
            if slice_kana == slice_text:
                continue

            # Skip empty or punctuation-only windows
            if not slice_text:
                continue
            if re.match(r'^[„ÄÅ„ÄÇÔºüÔºÅ\s]+$', slice_text):
                continue

            pairs.append({
                'left_context': contexts[i][-60:],  # Keep only the last 60 chars
                'input': slice_kana,
                'output': slice_text
            })

    return pairs


# Quick test
# Smoke test for the reverse-kana helpers: prints the reading, the first
# few morphemes and (when slicing is enabled) the first three generated
# pairs for a handful of hard-coded sentences. Output is for eyeballing
# only; nothing is asserted.
print("üß™ Reverse-Kana Pipeline Test:")
print("=" * 50)

test_texts = [
    '2026Âπ¥„ÅÆÊñ∞‰Ωú„Ç≤„Éº„É†„ÅØÊúÄÈ´ò„Å†„ÄÇ',
    '„Éû„Ç∏„ÅßÔºü„Åù„Çå„Å£„Å¶„É§„Éê„Åè„Å™„ÅÑÔºü',
    'Êñ∞„Åó„ÅÑiPhone„ÅÆ„Ç´„É°„É©„ÄÅ„Åô„Åî„Åè„Å™„ÅÑÔºü',
    '‰ªäÊó•„ÅØ„Å®„Å¶„ÇÇÊöë„ÅÑ„Åß„Åô„Å≠„ÄÇ',
    'ÂèãÈÅî„Å®‰∏ÄÁ∑í„Å´Êò†Áîª„ÇíË¶ã„Å´Ë°å„Åç„Åæ„Åó„Åü„ÄÇ',
]

for text in test_texts:
    # Note: each sentence is tokenized twice here (once per helper).
    kana = text_to_kana(text)
    morphemes = text_to_morphemes(text)
    print(f"\n  Text: {text}")
    print(f"  Kana: {kana}")
    # Only the first 6 (surface, reading) pairs are shown; '...' marks truncation.
    print(f"  Morphemes: {[(s, r) for s, r, _ in morphemes[:6]]}{'...' if len(morphemes) > 6 else ''}")
    
    if ENABLE_SLICING:
        slices = generate_sliced_pairs(text)
        print(f"  Slices ({len(slices)}):")
        for p in slices[:3]:
            ctx = p['left_context'][:10] or ''
            print(f"    ctx={ctx} | {p['input'][:15]} ‚Üí {p['output'][:15]}")

## 2. Text Extraction & Filtering

Extract clean conversational sentences from raw text.

In [None]:
# ==========================================================
# Sentence extraction + quality filters
# ==========================================================
# Pre-compiled patterns used by extract_sentences(). All of them are
# applied with .search()/.findall(), so a match anywhere in the candidate
# counts.

# Kill patterns (same as filter_dataset_quality.ipynb)
# A candidate containing any alternative is discarded. Matching is
# case-insensitive. The `https` alternative is redundant because `http`
# already matches the same text.
# NOTE(review): the last two alternatives look like katakana spell-outs of
# URL fragments -- confirm against the original (un-garbled) notebook.
KILL_PATTERNS = re.compile(
    r'(ID:|„Ç¢„Ç§„Éá„Ç£:|ÊäïÁ®øÊó•|ÂêçÂâç:|ÂêçÁÑ°„Åó|'
    r'\d{2}:\d{2}:\d{2}|'              # Timestamps
    r'(http|www\.|https)|'              # URLs
    r'ISBN|ISSN|'
    r'„Ç®„Ç§„ÉÅ„ÉÜ„Ç£„Éº„ÉÜ„Ç£„Éº„Éî„Éº|'
    r'„ÉÄ„Éñ„É™„É•„ÉÄ„Éñ„É™„É•)',
    re.IGNORECASE
)

# Garbage characters (Cyrillic, extended Latin, etc.)
# U+0400-U+04FF is the Cyrillic block; U+0100-U+024F is Latin Extended-A/B.
GARBAGE_RE = re.compile(r'[\u0400-\u04FF\u0100-\u024F]')

# Encyclopedia patterns
# Used only for non-dialogue sentences in extract_sentences().
ENCYCLOPEDIA_RE = re.compile(
    r'(„Å´‰ΩçÁΩÆ„Åô„Çã|„Å´ÊâÄÂú®„Åô„Çã|ÂåóÁ∑Ø\d|ÂçóÁ∑Ø\d|Êù±Áµå\d|Ë•øÁµå\d|'
    r'Ê®ôÈ´ò\d|Êµ∑Êäú\d|Â≠¶Âêç|ÂàÜÈ°ûÂ≠¶|Á¨¨\d+‰ª£|Á¥ÄÂÖÉÂâç)'
)

# Japanese text detection
# Matches one character from hiragana (U+3040-309F), katakana
# (U+30A0-30FF) or CJK unified ideographs (U+4E00-9FFF).
JAPANESE_RE = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')

# Dialogue brackets
# Captures 2-80 characters between an opening and closing bracket; the
# single group means findall() returns the inner text only.
DIALOGUE_RE = re.compile(r'„Äå([^„Äç]{2,80})„Äç')


def extract_sentences(text):
    """Pull usable sentences out of one raw text block.

    Two passes run over the same block and their results are concatenated:
    first the contents of every dialogue-bracket match, then the pieces
    obtained by splitting the block on sentence-ending punctuation and
    newlines.

    Returns:
        A list of ``(sentence, is_dialogue)`` tuples, dialogue entries
        first. Empty when the block contains no Japanese character.

    NOTE(review): bracketed text is not removed before the second pass, so
    dialogue can appear again (possibly as fragments) among the
    non-dialogue sentences -- confirm this is intended.
    """
    # Quick reject: no Japanese at all
    if JAPANESE_RE.search(text) is None:
        return []

    def length_ok(candidate):
        return MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN

    found = []

    # Pass 1: bracketed dialogue (only the kill and garbage filters apply).
    for quoted in DIALOGUE_RE.findall(text):
        quoted = quoted.strip()
        if not length_ok(quoted):
            continue
        if KILL_PATTERNS.search(quoted) or GARBAGE_RE.search(quoted):
            continue
        found.append((quoted, True))

    # Pass 2: plain sentences, subject to every filter.
    for piece in re.split(r'[„ÄÇÔºÅÔºü\n]+', text):
        candidate = piece.strip()
        if not candidate or not length_ok(candidate):
            continue

        rejected = (
            KILL_PATTERNS.search(candidate)
            or GARBAGE_RE.search(candidate)
            or ENCYCLOPEDIA_RE.search(candidate)
        )
        if rejected:
            continue

        # At least 30% of the characters must be Japanese.
        japanese_chars = len(JAPANESE_RE.findall(candidate))
        if japanese_chars / max(len(candidate), 1) < 0.3:
            continue

        # Three or more separate digit runs looks like stats / table data.
        if len(re.findall(r'\d+', candidate)) >= 3:
            continue

        found.append((candidate, False))

    return found


# Quick test
# Smoke test for extract_sentences(): runs it on a small hard-coded block
# and prints each result tagged by whether it came from the dialogue pass.
# Output is for eyeballing only; nothing is asserted.
print("üß™ Sentence Extraction Test:")
test_block = """Áî∞‰∏≠„Åï„Çì„ÅØ„Äå„Éû„Ç∏„ÅßÔºü„Åù„Çå„Å£„Å¶„É§„Éê„Åè„Å™„ÅÑÔºü„Äç„Å®Ë®Ä„Å£„Åü„ÄÇ
ÂåóÁ∑Ø35Â∫¶„Å´‰ΩçÁΩÆ„Åô„ÇãÈÉΩÂ∏Ç„Åß„ÄÅ‰∫∫Âè£„ÅØÁ¥Ñ3500‰∏á‰∫∫„Åß„ÅÇ„Çã„ÄÇ
Êñ∞„Åó„ÅÑiPhone„ÅÆ„Ç´„É°„É©„Åå„Åô„Åî„ÅÑ„ÄÇÂèãÈÅî„ÇÇË≤∑„Å£„Åü„Çâ„Åó„ÅÑ„ÄÇ
„Äå‰ªäÊó•„ÅØÂ§©Ê∞ó„Åå„ÅÑ„ÅÑ„Åã„ÇâÊï£Ê≠©„Å´Ë°å„Åì„ÅÜ„Äç„Å®ÊØç„ÅåË®Ä„Å£„Åü„ÄÇ
2023Âπ¥3Êúà15Êó•„ÅÆhttp://example.com„Å´„Çà„Çã„Å®„ÄÇ
„Éá„Éê„ÉÉ„Ç∞„ÅÆ„Åü„ÇÅ„Å´app„Çí„Ç¢„ÉÉ„Éó„Éá„Éº„Éà„Åó„Åæ„Åó„Åü„ÄÇ"""

extracted = extract_sentences(test_block)
for sent, is_dialog in extracted:
    # Different marker for dialogue vs. plain sentences; text cut to 50 chars.
    tag = 'üí¨' if is_dialog else 'üìù'
    print(f"  {tag} {sent[:50]}")

## 3. Load Data Source

Free sources available:
- `cc100`: Japanese web text (grammar base, ~70GB streamed)
- `oscar`: Web crawl (diverse)
- `mc4`: Cleaned C4 (high quality)
- `custom`: Your own text lines

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm

def load_data_source(source, max_lines):
    """Load raw text blocks from one of the supported sources.

    The hosted corpora are opened in streaming mode so the full dataset is
    never downloaded.

    Args:
        source: One of ``'cc100'``, ``'oscar'``, ``'mc4'`` or ``'custom'``.
        max_lines: Maximum number of text blocks to return. Zero or a
            negative value yields an empty list.

    Returns:
        A list of text blocks. For streamed sources only blocks of at
        least ``MIN_SENTENCE_LEN`` characters are kept. For ``'custom'``
        the module-level ``CUSTOM_TEXTS`` list is returned (capped at
        ``max_lines``), or ``[]`` when it is not defined.

    Raises:
        ValueError: If *source* is not a recognised name.
    """
    text_key = 'text'  # Every streamed dataset below exposes its text here

    if source == 'cc100':
        print("üì• Loading CC-100 Japanese (streaming)...")
        ds = load_dataset('cc100', lang='ja', split='train', streaming=True)

    elif source == 'oscar':
        print("üì• Loading OSCAR Japanese (streaming)...")
        ds = load_dataset('oscar-corpus/OSCAR-2301', 'ja',
                          split='train', streaming=True,
                          trust_remote_code=True)

    elif source == 'mc4':
        print("üì• Loading mC4 Japanese (streaming)...")
        ds = load_dataset('mc4', 'ja', split='train', streaming=True)

    elif source == 'custom':
        print("üìù Using custom text lines.")
        print("   Set CUSTOM_TEXTS list or load from file.")
        # CUSTOM_TEXTS is an optional module-level variable. dir() inside a
        # function only lists local names, so it must be looked up in
        # globals() to be found at all.
        custom_texts = globals().get('CUSTOM_TEXTS', [])
        return list(custom_texts)[:max(max_lines, 0)]

    else:
        raise ValueError(f"Unknown source: {source}")

    # Stream and collect
    lines = []
    if max_lines > 0:
        # The bar is advanced only for kept blocks so it tracks progress
        # towards max_lines rather than the number of items streamed.
        with tqdm(total=max_lines, desc=f"Loading {source}") as progress_bar:
            for item in ds:
                text = item.get(text_key, '')
                if text and len(text) >= MIN_SENTENCE_LEN:
                    lines.append(text)
                    progress_bar.update(1)
                    if len(lines) >= max_lines:
                        break

    print(f"‚úì Loaded {len(lines):,} text blocks from {source}")
    return lines


# --- Optional: Custom text for testing ---
# Uncomment and add your own lines here:
# CUSTOM_TEXTS = [
#     'Êñ∞„Åó„ÅÑiPhone„ÅÆ„Ç´„É°„É©„Åå„Åô„Åî„ÅÑ„ÄÇÂèãÈÅî„ÇÇË≤∑„Å£„Åü„Çâ„Åó„ÅÑ„ÄÇ',
#     '„Äå„Éû„Ç∏„ÅßÔºü„Åù„Çå„Å£„Å¶„É§„Éê„Åè„Å™„ÅÑÔºü„Äç„Å®Ë®Ä„Å£„Åü„ÄÇ',
#     '‰ªäÊó•„ÅØÂ§©Ê∞ó„Åå„ÅÑ„ÅÑ„Åã„ÇâÊï£Ê≠©„Å´Ë°å„Åì„ÅÜ„Åã„Å™„ÄÇ',
# ]
# DATA_SOURCE = 'custom'

# Collect the raw text blocks once; the processing cell indexes into this
# list, and the final cleanup cell deletes it.
raw_lines = load_data_source(DATA_SOURCE, MAX_RAW_LINES)

## 4. Process: Extract → Kana → Pairs

In [None]:
# ============================================================
# Main processing pipeline
# ============================================================

# Main loop: for every raw block, extract sentences, turn each into one or
# more training pairs and append them to the JSONL output. Progress is
# checkpointed to PROGRESS_FILE so an interrupted run can resume.
OUTPUT_FILE = f"{OUTPUT_DIR}/custom_ime_dataset.jsonl"
PROGRESS_FILE = f"{OUTPUT_DIR}/build_progress.json"

# Stats
stats = {
    'raw_lines': len(raw_lines),
    'sentences_extracted': 0,
    'dialogue_count': 0,
    'pairs_generated': 0,
    'slices_generated': 0,
    'errors': 0,
}

# Resume support: 'processed_lines' is the index of the first block that
# has NOT been processed yet; saved stats replace the fresh counters.
start_from = 0
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, 'r') as f:
        progress = json.load(f)
    start_from = progress.get('processed_lines', 0)
    stats.update(progress.get('stats', {}))
    print(f"üìÇ Resuming from line {start_from:,}")

# Previous sentence for left_context (consecutive sentences in same block)
prev_sentence = ''

# Index of the next unprocessed block. Tracked separately from the loop
# variable so it is defined even when there is nothing left to process.
processed_lines = start_from

# Append when resuming so pairs written by the earlier run are kept.
# NOTE: pairs written after the last checkpoint of an interrupted run are
# still in the file, so a resume can duplicate up to one checkpoint
# interval of output.
mode = 'a' if start_from > 0 else 'w'
with open(OUTPUT_FILE, mode, encoding='utf-8') as out_f:
    for line_idx in tqdm(range(start_from, len(raw_lines)), desc="Processing"):
        text_block = raw_lines[line_idx]

        # Extract sentences from this block
        sentences = extract_sentences(text_block)
        prev_sentence = ''  # Reset context between blocks

        for sentence, is_dialogue in sentences:
            stats['sentences_extracted'] += 1
            if is_dialogue:
                stats['dialogue_count'] += 1

            # Best effort: one bad sentence is counted and skipped rather
            # than aborting a long build.
            try:
                if ENABLE_SLICING:
                    # Generate multiple sliced pairs
                    pairs = generate_sliced_pairs(sentence, left_context=prev_sentence)
                    for p in pairs:
                        out_f.write(json.dumps(p, ensure_ascii=False) + '\n')
                    stats['slices_generated'] += len(pairs)
                    stats['pairs_generated'] += len(pairs)
                else:
                    # Generate single full pair
                    pair = generate_full_pair(sentence, left_context=prev_sentence)
                    if pair:
                        out_f.write(json.dumps(pair, ensure_ascii=False) + '\n')
                        stats['pairs_generated'] += 1

                # Use this sentence as context for next one
                prev_sentence = sentence

            except Exception as e:
                stats['errors'] += 1
                # Only the first few errors are printed to keep the log readable.
                if stats['errors'] <= 5:
                    print(f"  ‚ö† Error: {e} | text: {sentence[:30]}")

        # This block is done; a resume must start at the NEXT index.
        processed_lines = line_idx + 1

        # Save progress every 10K lines. Flush the output first so the
        # checkpoint never claims more than what has reached the file.
        if (processed_lines - start_from) % 10_000 == 0:
            out_f.flush()
            with open(PROGRESS_FILE, 'w') as pf:
                json.dump({'processed_lines': processed_lines, 'stats': stats}, pf)

        # Stop if we have enough pairs
        if stats['pairs_generated'] >= MAX_OUTPUT_PAIRS:
            print(f"\n‚úì Reached {MAX_OUTPUT_PAIRS:,} pairs limit.")
            break

# Final save (output file is closed, so everything written is on disk)
with open(PROGRESS_FILE, 'w') as pf:
    json.dump({'processed_lines': processed_lines, 'stats': stats}, pf)

print(f"\n‚úì Done!")
print(f"  Raw lines processed: {processed_lines - start_from:,}")
print(f"  Sentences extracted: {stats['sentences_extracted']:,}")
print(f"  Dialogues („Äå„Äç):    {stats['dialogue_count']:,}")
print(f"  Training pairs:     {stats['pairs_generated']:,}")
print(f"  Errors:             {stats['errors']:,}")
print(f"üíæ Saved: {OUTPUT_FILE}")

## 5. Results & Verification

In [None]:
# Count final output
# Re-reads the JSONL from disk so the reported pair count reflects what is
# actually in the file (including pairs appended by earlier, resumed runs).
# Only the first 50 rows are parsed and kept in `samples`; later cells
# reuse that list.
total_pairs = 0
samples = []
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        total_pairs += 1
        if len(samples) < 50:
            samples.append(json.loads(line))

file_size = os.path.getsize(OUTPUT_FILE)

print("="*60)
print("üìä DATASET BUILD REPORT")
print("="*60)
print(f"  Source:           {DATA_SOURCE}")
print(f"  Raw lines:        {stats['raw_lines']:,}")
print(f"  Sentences found:  {stats['sentences_extracted']:,}")
print(f"  Dialogues („Äå„Äç): {stats['dialogue_count']:,}")
print(f"  Training pairs:   {total_pairs:,}")
print(f"  File size:        {file_size / (1024*1024):.1f} MB")
print(f"  File:             {OUTPUT_FILE}")

# Pair type breakdown
# Computed over `samples` only (at most the first 50 rows), not the whole
# file. max(..., 1) guards the divisions when the file is empty.
has_context = sum(1 for s in samples if s.get('left_context'))
avg_input_len = sum(len(s['input']) for s in samples) / max(len(samples), 1)
avg_output_len = sum(len(s['output']) for s in samples) / max(len(samples), 1)

print(f"\nüìà Sample Stats (first {len(samples)} pairs):")
print(f"  With context:     {has_context}/{len(samples)} ({has_context/max(len(samples),1)*100:.0f}%)")
print(f"  Avg input len:    {avg_input_len:.1f} chars (kana)")
print(f"  Avg output len:   {avg_output_len:.1f} chars (kanji)")

In [None]:
# Show sample pairs
# Prints up to five rows from `samples` (the first 50 rows of the output
# file) for each of three views. Display only; nothing is modified.
print("\nüìù Sample Training Pairs:")
print("-" * 60)

# The JSONL rows do not record whether a pair came from dialogue, so this
# view can only show the first rows as written; the heading says so.
print("\nüí¨ First pairs in file:")
for s in samples[:5]:
    ctx = s['left_context'][:15]
    print(f"  {ctx}<SEP>{s['input'][:20]} ‚Üí {s['output'][:25]}")

# Show pairs with context
print("\nüìù Pairs with context:")
for s in [s for s in samples if s['left_context']][:5]:
    ctx = s['left_context'][:15]
    print(f"  ctx={ctx} | {s['input'][:15]} ‚Üí {s['output'][:20]}")

# Show short pairs (output of at most 5 characters)
print("\nüî§ Single-word conversions:")
for s in [s for s in samples if len(s['output']) <= 5][:5]:
    ctx = s['left_context'][:15]
    print(f"  {ctx}<SEP>{s['input']} ‚Üí {s['output']}")

In [None]:
# Quality spot-check: verify kana‚Üíkanji conversions are correct
# Draws up to 10 rows from `samples` (which holds only the first 50 rows of
# the file, so this is not a sample of the whole dataset) and checks that
# re-reading the stored output reproduces the stored input.
import random

print("\nüîç Quality Spot-Check (random 10 pairs):")
print("-" * 60)

# Fixed seed so the same rows are picked on every run.
random.seed(42)
check_samples = random.sample(samples, min(10, len(samples)))

for s in check_samples:
    # Verify: re-convert output to kana and compare with input
    # The output is re-tokenized on its own, without the surrounding
    # sentence, so a mismatch is a warning to inspect rather than proof of
    # a bad pair.
    expected_kana = text_to_kana(s['output'])
    match = expected_kana == s['input']
    status = '‚úÖ' if match else '‚ö†Ô∏è'
    
    print(f"  {status} {s['input'][:20]} ‚Üí {s['output'][:25]}")
    if not match:
        print(f"       Re-check: {expected_kana[:20]}")

In [None]:
# How to use in training notebooks
# Prints copy-paste instructions only; the snippet inside the f-string is
# text, not executed here. OUTPUT_FILE and OUTPUT_DIR are interpolated so
# the printed paths match this run.
print("\n" + "="*60)
print("üìã HOW TO USE IN TRAINING")
print("="*60)
print(f"""
Option A: Use ALONE (fresh data only)
  dataset_file = "{OUTPUT_FILE}"

Option B: COMBINE with filtered zenz (recommended)
  files = [
      "{OUTPUT_DIR}/filtered_high_quality.jsonl",  # Filtered zenz
      "{OUTPUT_FILE}",  # Fresh custom data
  ]
  dataset = []
  for f in files:
      with open(f, 'r') as fh:
          for line in fh:
              dataset.append(json.loads(line))

Option C: FINE-TUNE strategy
  1. Train on filtered zenz (base grammar) ‚Äî full epochs
  2. Fine-tune last 2-3 epochs on fresh data (modern vocab)
  This makes "fresh" vocabulary more likely in predictions.
""")

In [None]:
# Cleanup
# Nothing on disk is removed here: the progress file path and the command
# to delete it are only printed. Leaving the progress file in place makes
# the processing cell resume from its recorded position on the next run.
print("\nüßπ Cleanup options:")
print(f"  Progress file: {PROGRESS_FILE}")
print(f"  Delete after verifying: os.remove('{PROGRESS_FILE}')")

# Memory cleanup
# After this, cells that read raw_lines must re-run the loading cell first.
del raw_lines
import gc; gc.collect()
print("‚úì Released raw_lines from memory")