# 🔍 CodeGuardian Tokenization Validation Notebook

**Author:** Urva Gandhi  
**Purpose:** Validate that the tokenized outputs match the original dataset splits

This notebook verifies:
1. ✅ Row counts match exactly between JSONL and tokenized .pt files
2. ✅ Label distributions are preserved
3. ✅ No data loss during tokenization
4. ✅ Both CodeBERT and GraphCodeBERT outputs are consistent

In [None]:
import json
import torch
import os
from collections import Counter
from pathlib import Path

# Notebook banner: title line followed by an 80-character rule.
print("üöÄ CodeGuardian Tokenization Validation", "=" * 80, sep="\n")

## 📁 Configuration

In [None]:
# Where the original JSONL splits live and where the tokenized .pt files are read from.
JSONL_DIR = "/kaggle/input/codeguardian-dataset-for-model-fine-tuning/random_splitted"
TOKENIZED_BASE = "/kaggle/working/tokenized"

# Dataset splits to validate.
SPLITS = ["train", "val", "test"]

# Models whose tokenized outputs are covered by this notebook.
MODELS = ["codebert", "graphcodebert"]

# Echo the configuration so it is recorded in the cell output.
print(
    f"JSONL Directory: {JSONL_DIR}",
    f"Tokenized Base: {TOKENIZED_BASE}",
    f"Splits: {SPLITS}",
    f"Models: {MODELS}",
    sep="\n",
)

## 📊 Step 1: Count Original JSONL Rows

In [None]:
def count_jsonl_samples(file_path):
    """Count lines, valid samples, and the label distribution of a JSONL file.

    A line is a valid sample when it parses as a JSON object whose ``"code"``
    value is a non-blank string and whose ``"is_vulnerable"`` value converts
    through ``int()`` to 0 or 1. Every other line (blank, malformed, wrong
    shape, out-of-range label) is counted in the total but skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to scan.

    Returns:
        A ``(total, valid, label_counts)`` tuple: the number of lines read,
        the number of valid samples, and a ``Counter`` mapping each label
        (0 or 1) to the number of valid samples carrying it.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    total = 0
    valid = 0
    label_counts = Counter()

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            total += 1

            # Malformed JSON (including blank lines) is skipped, not fatal.
            try:
                item = json.loads(line.strip())
            except (ValueError, RecursionError):
                continue

            # Only JSON objects can carry the "code" / "is_vulnerable" fields.
            if not isinstance(item, dict):
                continue

            code = item.get("code", "")
            label = item.get("is_vulnerable", None)

            # Apply same validation logic as tokenizer
            if not isinstance(code, str) or not code.strip() or label is None:
                continue

            # Labels that cannot be converted to an integer are skipped;
            # only conversion errors are caught so that interrupts and
            # genuine bugs are no longer silently swallowed.
            try:
                label_int = int(label)
            except (ValueError, TypeError, OverflowError):
                continue

            if label_int in (0, 1):
                label_counts[label_int] += 1
                valid += 1

    return total, valid, label_counts

print("\n" + "=" * 80)
print("ORIGINAL JSONL STATISTICS")
print("=" * 80)

# Per-split statistics keyed by split name; each entry holds the raw line
# count ("total"), the number of valid samples ("valid") and the label
# Counter ("labels").
jsonl_stats = {}
for split in SPLITS:
    file_path = os.path.join(JSONL_DIR, f"{split}.jsonl")
    total, valid, label_counts = count_jsonl_samples(file_path)
    jsonl_stats[split] = {
        "total": total,
        "valid": valid,
        "labels": label_counts
    }

    # A split without any valid sample would make the share computation
    # divide by zero; report 0.00% instead of aborting the cell.
    secure_pct = 100 * label_counts[0] / valid if valid else 0.0
    vulnerable_pct = 100 * label_counts[1] / valid if valid else 0.0

    print(f"\n{split.upper()}:")
    print(f"  Total lines: {total:,}")
    print(f"  Valid samples: {valid:,}")
    print(f"  Skipped: {total - valid:,}")
    print(f"  Label 0 (Secure): {label_counts[0]:,} ({secure_pct:.2f}%)")
    print(f"  Label 1 (Vulnerable): {label_counts[1]:,} ({vulnerable_pct:.2f}%)")

total_valid = sum(s["valid"] for s in jsonl_stats.values())
print(f"\n{'='*80}")
print(f"TOTAL VALID SAMPLES: {total_valid:,}")
print(f"{'='*80}")

## 📦 Step 2: Validate CodeBERT Tokenized Files

In [None]:
def validate_tokenized_file(file_path, expected_count, expected_labels, max_length=512):
    """Validate a tokenized .pt file against the expected sample statistics.

    The checks run in this order and stop at the first failure: file exists,
    required keys present, sample count, label distribution, tensor rank,
    sequence length, and row-count agreement between the tensors.

    Args:
        file_path: Path of the ``.pt`` file to load with ``torch.load``.
        expected_count: Number of samples the file must contain.
        expected_labels: ``Counter`` of the expected label distribution.
        max_length: Required size of the second dimension of ``input_ids``
            and ``attention_mask``. Defaults to 512.

    Returns:
        A ``(is_valid, message, label_counts)`` tuple. ``label_counts`` is
        the ``Counter`` of labels found in the file once the label check has
        been reached, and an empty dict for earlier failures.
    """
    if not os.path.exists(file_path):
        return False, "File not found", {}

    try:
        # NOTE: torch.load unpickles the file; only point this at trusted files.
        data = torch.load(file_path, map_location="cpu")

        # Check keys
        required_keys = ("input_ids", "attention_mask", "labels")
        if not all(k in data for k in required_keys):
            return False, "Missing required keys", {}

        # Check counts
        actual_count = len(data["labels"])
        if actual_count != expected_count:
            return False, f"Count mismatch: expected {expected_count}, got {actual_count}", {}

        # Check label distribution
        label_counts = Counter(data["labels"].tolist())
        if label_counts != expected_labels:
            return False, "Label mismatch", label_counts

        # Check shapes
        input_shape = tuple(data["input_ids"].shape)
        mask_shape = tuple(data["attention_mask"].shape)

        # Anything that is not 2-D cannot be indexed as (rows, sequence);
        # report it as a shape problem rather than as a load error.
        if len(input_shape) != 2 or len(mask_shape) != 2:
            return False, "Incorrect tensor shape", label_counts

        if input_shape[1] != max_length or mask_shape[1] != max_length:
            return False, "Incorrect sequence length", label_counts

        # Every label needs a matching input row and mask row; otherwise
        # samples were lost or duplicated in one of the tensors.
        if input_shape[0] != actual_count or mask_shape[0] != actual_count:
            return (
                False,
                f"Row mismatch: labels={actual_count}, "
                f"input_ids={input_shape[0]}, attention_mask={mask_shape[0]}",
                label_counts,
            )

        return True, "Valid", label_counts

    except Exception as e:
        # Reporting boundary: any failure while loading or inspecting the
        # file becomes a failed validation with the error text attached.
        return False, f"Load error: {e}", {}

# Section header for the CodeBERT pass.
print("\n" + "=" * 80, "CODEBERT TOKENIZED VALIDATION", "=" * 80, sep="\n")

# Stays True only while every split validates against the JSONL statistics.
codebert_valid = True
for split in SPLITS:
    file_path = os.path.join(TOKENIZED_BASE, "codebert", f"{split}_tokenized.pt")
    split_stats = jsonl_stats[split]
    expected_count = split_stats["valid"]
    expected_labels = split_stats["labels"]

    valid, message, actual_labels = validate_tokenized_file(
        file_path, expected_count, expected_labels
    )

    status_icon = "‚úÖ" if valid else "‚ùå"
    print(f"\n{split.upper()}: {status_icon} {message}")

    if not valid:
        # Record the failure and show the distributions when they were read.
        codebert_valid = False
        if actual_labels:
            print(f"  Expected: {expected_labels}")
            print(f"  Actual: {actual_labels}")
        continue

    print(f"  Samples: {expected_count:,}")
    print(f"  Label 0: {actual_labels[0]:,}")
    print(f"  Label 1: {actual_labels[1]:,}")

print(
    "\n‚úÖ CodeBERT tokenization: PASSED"
    if codebert_valid
    else "\n‚ùå CodeBERT tokenization: FAILED"
)

## 📦 Step 3: Validate GraphCodeBERT Tokenized Files

In [None]:
# Section header for the GraphCodeBERT pass.
print("\n" + "=" * 80, "GRAPHCODEBERT TOKENIZED VALIDATION", "=" * 80, sep="\n")

# Stays True only while every split validates against the JSONL statistics.
graphcodebert_valid = True
for split in SPLITS:
    file_path = os.path.join(TOKENIZED_BASE, "graphcodebert", f"{split}_tokenized.pt")
    split_stats = jsonl_stats[split]
    expected_count = split_stats["valid"]
    expected_labels = split_stats["labels"]

    valid, message, actual_labels = validate_tokenized_file(
        file_path, expected_count, expected_labels
    )

    status_icon = "‚úÖ" if valid else "‚ùå"
    print(f"\n{split.upper()}: {status_icon} {message}")

    if not valid:
        # Record the failure and show the distributions when they were read.
        graphcodebert_valid = False
        if actual_labels:
            print(f"  Expected: {expected_labels}")
            print(f"  Actual: {actual_labels}")
        continue

    print(f"  Samples: {expected_count:,}")
    print(f"  Label 0: {actual_labels[0]:,}")
    print(f"  Label 1: {actual_labels[1]:,}")

print(
    "\n‚úÖ GraphCodeBERT tokenization: PASSED"
    if graphcodebert_valid
    else "\n‚ùå GraphCodeBERT tokenization: FAILED"
)

## 🔍 Step 4: Cross-Model Consistency Check

In [None]:
# Section header for the cross-model comparison.
print("\n" + "=" * 80, "CROSS-MODEL CONSISTENCY CHECK", "=" * 80, sep="\n")

# Cleared as soon as one split differs between the models or fails to load.
consistency_check = True

for split in SPLITS:
    tokenized_name = f"{split}_tokenized.pt"
    codebert_path = os.path.join(TOKENIZED_BASE, "codebert", tokenized_name)
    graphcodebert_path = os.path.join(TOKENIZED_BASE, "graphcodebert", tokenized_name)

    try:
        codebert_data = torch.load(codebert_path, map_location="cpu")
        graphcodebert_data = torch.load(graphcodebert_path, map_location="cpu")

        # Sample counts per model
        cb_count = len(codebert_data["labels"])
        gcb_count = len(graphcodebert_data["labels"])

        # Label distributions per model
        cb_labels = Counter(codebert_data["labels"].tolist())
        gcb_labels = Counter(graphcodebert_data["labels"].tolist())

        if cb_count != gcb_count or cb_labels != gcb_labels:
            print(f"\n{split.upper()}: ‚ùå Inconsistent")
            print(f"  CodeBERT: {cb_count:,} samples, {cb_labels}")
            print(f"  GraphCodeBERT: {gcb_count:,} samples, {gcb_labels}")
            consistency_check = False
            continue

        print(f"\n{split.upper()}: ‚úÖ Consistent")
        print(f"  Both models: {cb_count:,} samples")
        print(f"  Both models: Label 0={cb_labels[0]:,}, Label 1={cb_labels[1]:,}")

    except Exception as e:
        # Any load or inspection failure counts as an inconsistency.
        print(f"\n{split.upper()}: ‚ùå Error - {e!s}")
        consistency_check = False

print(
    "\n‚úÖ Cross-model consistency: PASSED"
    if consistency_check
    else "\n‚ùå Cross-model consistency: FAILED"
)

## 📈 Step 5: Final Summary

In [None]:
# Section header for the overall verdict.
print("\n" + "=" * 80, "FINAL VALIDATION SUMMARY", "=" * 80, sep="\n")

# (display name, outcome) for each check performed by the earlier cells.
all_checks = [
    ("CodeBERT Tokenization", codebert_valid),
    ("GraphCodeBERT Tokenization", graphcodebert_valid),
    ("Cross-Model Consistency", consistency_check)
]

print("\nüìä Validation Results:")
for check_name, passed in all_checks:
    if passed:
        status = "‚úÖ PASSED"
    else:
        status = "‚ùå FAILED"
    print(f"  {check_name}: {status}")

all_passed = all(outcome for _, outcome in all_checks)

if not all_passed:
    # At least one check failed: point the reader at the troubleshooting steps.
    print(
        "\n" + "=" * 80,
        "‚ö†Ô∏è VALIDATION FAILED",
        "=" * 80,
        "\n‚ùå Please review the errors above and re-run tokenization",
        "\nüí° Troubleshooting:",
        "  1. Check if tokenization scripts completed successfully",
        "  2. Verify JSONL files are not corrupted",
        "  3. Ensure sufficient disk space",
        "  4. Re-run tokenization scripts if needed",
        sep="\n",
    )
else:
    # Everything passed: confirm readiness and list the follow-up steps.
    print(
        "\n" + "=" * 80,
        "üéâ ALL VALIDATION CHECKS PASSED!",
        "=" * 80,
        "\n‚úÖ Tokenized datasets are ready for training",
        "‚úÖ No data loss detected",
        "‚úÖ Label distributions preserved",
        "‚úÖ Both models have consistent outputs",
        "\nüí° Next Steps:",
        "  1. Proceed with LoRA fine-tuning",
        "  2. Run train_codebert_lora.py",
        "  3. Run train_graphcodebert_lora.py",
        "  4. Create hybrid ensemble model",
        sep="\n",
    )

print("\n" + "=" * 80)

## 📊 Bonus: Sample Inspection

In [None]:
# Load the tokenized CodeBERT train split and show up to its first five samples.
print("\n" + "=" * 80, "SAMPLE INSPECTION (First 5 train samples)", "=" * 80, sep="\n")

train_path = os.path.join(TOKENIZED_BASE, "codebert", "train_tokenized.pt")
train_codebert = torch.load(train_path, map_location="cpu")

print("\nCodeBERT Samples:")
preview_count = min(5, len(train_codebert["labels"]))
for i in range(preview_count):
    label = train_codebert["labels"][i].item()
    # Summing the attention mask gives the "Active Tokens" figure shown below.
    num_tokens = train_codebert["attention_mask"][i].sum().item()
    print(f"  Sample {i+1}: Label={label}, Active Tokens={num_tokens}/512")

print("\n‚úÖ Validation notebook completed!")