# Phase 2 - Data Preparation (Offline Version)

**Objective**: Load the FIM `.jsonl` dataset and split it by repository into train/val/test sets, so that no repository contributes samples to more than one split.

**Environment**: Offline GPU machine with Jupyter Lab

In [None]:
import os
import json
import glob
import random
import numpy as np
from collections import defaultdict

# === CONFIG: Update paths as needed ===
# Input: the FIM dataset in JSON Lines format (one JSON record per line).
# Relative paths are resolved against the notebook's working directory.
FIM_DATA_PATH = './fim_dataset.jsonl'
# Output: directory that receives train.jsonl, val.jsonl and test.jsonl.
OUTPUT_DIR = './split_data'

# Echo the effective paths so a misconfiguration is visible before loading.
print(f"FIM Dataset: {FIM_DATA_PATH}")
print(f"Output Dir: {OUTPUT_DIR}")

In [None]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its parsed records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with one parsed value per successfully decoded line, in file
        order.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped silently. Lines that fail to decode are reported with their
    1-based line number and skipped, so one bad record does not abort the
    whole load.
    """
    samples = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # An empty line is not a corrupt record; do not warn about it.
                continue
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON at line {line_num}: {e}")
    return samples

def validate_fim_format(sample):
    """Return True if *sample* carries all three FIM markers in its text.

    Args:
        sample: One parsed JSONL record. Any JSON value is accepted.

    Returns:
        True when ``sample`` is a dict whose ``'text'`` value is a string
        containing ``<PRE>``, ``<SUF>`` and ``<MID>``; False otherwise.
    """
    # A JSONL line may decode to a list, string or number rather than an
    # object; such records can never be valid FIM samples.
    if not isinstance(sample, dict):
        return False
    text = sample.get('text', '')
    # "text": null (or a number) would make the `in` checks raise TypeError.
    if not isinstance(text, str):
        return False
    return all(marker in text for marker in ('<PRE>', '<SUF>', '<MID>'))

# Read every record, then keep only those that pass the FIM marker check.
print(f"Loading {FIM_DATA_PATH}...")
samples = load_jsonl(FIM_DATA_PATH)
valid_samples = list(filter(validate_fim_format, samples))

# Report how many records were read versus how many survived the filter.
print(f"Total samples: {len(samples):,}")
print(f"Valid FIM samples: {len(valid_samples):,}")

In [None]:
import re

def extract_repository(sample):
    """Derive the repository key used to group *sample* for splitting.

    Lookup order:
      1. ``sample['repository']`` when that key is present.
      2. The second and third ``/``-separated components of
         ``sample['file_path']`` joined with ``/``, when the path has at
         least three components.
      3. A fallback bucket ``unknown_<n>`` where ``n`` (0-9999) is derived
         from the first 100 characters of ``sample['text']``.

    Args:
        sample: A dict-like record.

    Returns:
        The repository key for the record.
    """
    # Local import: zlib is not among the notebook's top-level imports.
    import zlib

    if 'repository' in sample:
        return sample['repository']
    if 'file_path' in sample:
        parts = sample['file_path'].split('/')
        if len(parts) >= 3:
            return f"{parts[1]}/{parts[2]}"
    text_sample = sample.get('text', '')[:100]
    # The builtin hash() of a str is salted per interpreter process, so it
    # would assign different buckets on every run and make the seeded split
    # non-reproducible. CRC32 is stable across runs and machines.
    # errors='replace' keeps lone surrogates (possible after JSON decoding)
    # from raising during encoding.
    digest = zlib.crc32(text_sample.encode('utf-8', 'replace'))
    return f"unknown_{digest % 10000}"

# Bucket each valid sample under the repository key derived for it.
repo_to_samples = defaultdict(list)
for sample in valid_samples:
    repo_to_samples[extract_repository(sample)].append(sample)

print(f"Total repositories: {len(repo_to_samples):,}")

In [None]:
# Seed both RNGs before shuffling so the shuffle starts from a fixed state.
random.seed(42)
np.random.seed(42)

# Shuffle the repository keys; the split is made over repositories, not samples.
all_repos = list(repo_to_samples)
random.shuffle(all_repos)

# Cut points: the first 80% of repositories go to train, the next 5% to
# validation, and the remainder to test.
n_repos = len(all_repos)
train_end = int(n_repos * 0.80)
val_end = int(n_repos * 0.85)

train_repos, val_repos, test_repos = (
    all_repos[:train_end],
    all_repos[train_end:val_end],
    all_repos[val_end:],
)

# Flatten each repository group into the sample list of its split.
train_samples, val_samples, test_samples = [], [], []
for bucket, repos in (
    (train_samples, train_repos),
    (val_samples, val_repos),
    (test_samples, test_repos),
):
    for repo in repos:
        bucket.extend(repo_to_samples[repo])

print(f"Train: {len(train_samples):,} samples ({len(train_repos)} repos)")
print(f"Val: {len(val_samples):,} samples ({len(val_repos)} repos)")
print(f"Test: {len(test_samples):,} samples ({len(test_repos)} repos)")

In [None]:
# Verify no data leakage: no repository may appear in two different splits.
train_set, val_set, test_set = set(train_repos), set(val_repos), set(test_repos)

assert train_set.isdisjoint(val_set), "Leakage: Train/Val"
assert train_set.isdisjoint(test_set), "Leakage: Train/Test"
assert val_set.isdisjoint(test_set), "Leakage: Val/Test"
print("✅ No data leakage detected!")

In [None]:
def save_jsonl(samples, file_path):
    """Write *samples* to *file_path* as UTF-8 JSON Lines, one record per line.

    Args:
        samples: A sized collection of JSON-serializable records.
        file_path: Destination path; an existing file is overwritten.

    Non-ASCII characters are written as-is rather than escaped. A summary
    line with the record count is printed after the file is closed.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in samples
        )
    print(f"✓ Saved {len(samples):,} samples to {file_path}")

# Create the output directory if needed, then write one file per split.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for split_samples, split_path in (
    (train_samples, f'{OUTPUT_DIR}/train.jsonl'),
    (val_samples, f'{OUTPUT_DIR}/val.jsonl'),
    (test_samples, f'{OUTPUT_DIR}/test.jsonl'),
):
    save_jsonl(split_samples, split_path)

print("\n✅ Data preparation complete!")
print("Next: Run 02_training_offline.ipynb")