In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and quote the requirement so the shell cannot interpret the
# "[finetune]" extra as a glob pattern.
%pip install -U "FlagEmbedding[finetune]"

In [None]:
# Install a prebuilt flash-attn 2.7.3 wheel. Per its filename the wheel targets
# CUDA 12, torch 2.6, CPython 3.11 and linux x86_64.
# NOTE(review): confirm the runtime (and the torch version left in place by the
# previous install cell) matches these tags.
!pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl

In [None]:
# Delete one specific checkpoint directory (checkpoint-6945) from the training
# output folder; with `-f` the command is a no-op when the path does not exist.
!rm -rf /kaggle/working/test_encoder_only_base_multilingual-e5-base/checkpoint-6945

In [None]:
from pathlib import Path
import os
import json

In [None]:
# DeepSpeed configuration (ZeRO stage 0) that the training command reads from
# /kaggle/working/ds_stage0.json. Values given as "auto" are written out as the
# literal string "auto" (presumably resolved by the trainer integration from the
# command-line arguments -- TODO confirm).
ds_stage = {
    "zero_optimization": {"stage": 0},
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 12,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
    "bf16": {"enabled": "auto"},
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto",
        },
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto",
        },
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": False,
}

# Serialise the config once, then write the resulting text to disk.
config_path = Path('/kaggle/working/ds_stage0.json')
with config_path.open('w') as handle:
    handle.write(json.dumps(ds_stage))

In [None]:
import os
# Export WANDB_DISABLED=true in this kernel's environment (presumably to switch
# off Weights & Biases logging in the training subprocess launched below).
os.environ.update({'WANDB_DISABLED': 'true'})

In [None]:
# Fine-tune intfloat/multilingual-e5-base with FlagEmbedding on 2 processes.
# --output_dir is given as an absolute path so it always matches the
# /kaggle/working/... location the later cells load from, regardless of the
# kernel's current working directory.
!torchrun --nproc_per_node 2 \
    -m FlagEmbedding.finetune.embedder.encoder_only.base \
    --model_name_or_path intfloat/multilingual-e5-base \
    --cache_dir /kaggle/working/cache/model \
    --train_data /kaggle/input/course-esco-skill-retrieval/train_dataset.jsonl \
    --cache_path /kaggle/working/cache/data \
    --train_group_size 8 \
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'query: ' \
    --query_instruction_format '{}{}' \
    --passage_instruction_for_retrieval 'passage: ' \
    --passage_instruction_format '{}{}' \
    --knowledge_distillation False \
    --output_dir /kaggle/working/test_encoder_only_base_multilingual-e5-base \
    --overwrite_output_dir \
    --learning_rate 2e-5 \
    --fp16 \
    --num_train_epochs 5 \
    --per_device_train_batch_size 2 \
    --dataloader_drop_last True \
    --warmup_ratio 0.2 \
    --gradient_checkpointing \
    --deepspeed /kaggle/working/ds_stage0.json \
    --logging_steps 50 \
    --save_steps 2000 \
    --save_total_limit 1 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method cls \
    --normalize_embeddings True \
    --kd_loss_type kl_div

In [None]:
from sentence_transformers import SentenceTransformer
from pathlib import Path
import shutil

# Load the fine-tuned encoder (FlagEmbedding output directory) and export a
# clean SentenceTransformers copy for deployment.
from sentence_transformers import models

raw_output_dir = Path('/kaggle/working/test_encoder_only_base_multilingual-e5-base')
print('Raw output dir exists:', raw_output_dir.exists())
if not raw_output_dir.is_dir():
    # Fail early with a clear message instead of letting the loader treat the
    # path as a model name to download.
    raise FileNotFoundError(f'Training output directory not found: {raw_output_dir}')

# Assemble the pipeline explicitly. Training used `--sentence_pooling_method cls`
# and `--normalize_embeddings True`; when a directory carries no
# sentence-transformers module config, SentenceTransformer(path) falls back to
# mean pooling, which would not match how the model was trained.
word_embedding_model = models.Transformer(str(raw_output_dir))
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='cls',
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, models.Normalize()])
print('Loaded model OK. Saving a clean deployable copy...')

# Create a clean deploy folder and save using SentenceTransformer.save()
deploy_dir = Path(str(raw_output_dir) + '_deployed')
if deploy_dir.exists():
    print('Deploy dir already exists. Removing and recreating to ensure clean layout.')
    shutil.rmtree(deploy_dir)

# Save a clean SentenceTransformers-style model suitable for deployment
model.save(str(deploy_dir))
print('Saved deployed model to:', deploy_dir)

# Optionally remove intermediate checkpoint directories in raw output to save space.
# Best effort: a failure to delete one checkpoint is reported and does not stop the cell.
for cp in raw_output_dir.glob('checkpoint-*'):
    try:
        print('Removing checkpoint:', cp.name)
        shutil.rmtree(cp)
    except Exception as e:
        print('Failed to remove checkpoint', cp, e)

# Update a variable used by packaging cell
deployed_model_dir = deploy_dir
print('Deployed model directory set to:', deployed_model_dir)


In [None]:
# 🧪 Comprehensive Model Testing and Analysis

# 📂 JSONL loading helper (used below to read the evaluation set)
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Location of the ``.jsonl`` file, given either as a
            ``pathlib.Path`` or as a plain string.

    Returns:
        list: One parsed JSON value per non-blank, well-formed line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).

    Blank lines are ignored. Lines that are not valid JSON are skipped and the
    number skipped is printed after loading. A short summary is printed for
    dict records found on the first three lines of the file.
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    data = []
    skipped = 0

    print(f"üìÇ Loading: {file_path.name}")

    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue

            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            data.append(sample)

            # Show examples for the first 3 lines; only dict records have the
            # 'query' / 'pos' / 'neg' fields that the summary reads.
            if i < 3 and isinstance(sample, dict):
                query_len = len(sample.get('query') or '')
                pos_count = len(sample.get('pos') or [])
                neg_count = len(sample.get('neg') or [])
                print(f"   Example {i+1}: {query_len} chars, {pos_count} pos, {neg_count} neg")

    if skipped:
        print(f"   Skipped {skipped} malformed line(s)")

    print(f"‚úÖ Loaded {len(data)} samples")
    return data

# Load evaluation data if available. When the file is missing, `eval_data`
# stays an empty list so the evaluation step can report that no test data
# exists instead of this cell failing with FileNotFoundError.
eval_path = Path("/kaggle/input/course-esco-skill-retrieval/eval_dataset.jsonl")
if eval_path.exists():
    eval_data = load_jsonl(eval_path)
    print(f"‚úÖ Loaded {len(eval_data)} evaluation samples")
else:
    eval_data = []
    print(f"Evaluation file not found: {eval_path}")

# Spot-check the fine-tuned model on the first evaluation samples.
# Uses `model` (the SentenceTransformer loaded earlier) and `eval_data`
# (records expected to carry 'query', 'pos' and 'neg' fields).
if eval_data:
    print("üß™ Comprehensive Model Testing & Analysis")
    print("=" * 50)

    # Test with multiple samples for robust evaluation
    test_samples = eval_data[:10]  # first 10 samples

    # Similarities pooled over all samples, used for the mean/std summary.
    all_pos_similarities = []
    all_neg_similarities = []

    # Pairwise ranking tallies. A positive is only compared with negatives of
    # the SAME query: scores belonging to different queries are not comparable
    # for ranking purposes.
    correct_rankings = 0
    total_comparisons = 0

    for idx, test_sample in enumerate(test_samples):
        test_query = test_sample.get('query', '').strip()
        # Keep at most 3 non-empty skills per side
        pos_test_skills = [skill.strip() for skill in test_sample.get('pos', []) if skill.strip()][:3]
        neg_test_skills = [skill.strip() for skill in test_sample.get('neg', []) if skill.strip()][:3]

        # Validate sample has required data
        if not test_query:
            print(f"\n‚ö†Ô∏è Test Sample {idx+1}: Empty query, skipping...")
            continue

        if not pos_test_skills:
            print(f"\n‚ö†Ô∏è Test Sample {idx+1}: No valid positive skills, skipping...")
            continue

        if not neg_test_skills:
            print(f"\n‚ö†Ô∏è Test Sample {idx+1}: No valid negative skills, skipping...")
            continue

        print(f"\nüìù Test Sample {idx+1}:")
        print(f"   Query: {test_query[:100]}{'...' if len(test_query) > 100 else ''}")
        print(f"   Positive skills: {len(pos_test_skills)}, Negative skills: {len(neg_test_skills)}")

        try:
            # Encode with E5 prefixes (the same prefixes passed to the training command)
            query_text = f"query: {test_query}"
            pos_skill_texts = [f"passage: {skill}" for skill in pos_test_skills]
            neg_skill_texts = [f"passage: {skill}" for skill in neg_test_skills]

            # Encode and validate embeddings
            query_embedding = model.encode([query_text])
            pos_skill_embeddings = model.encode(pos_skill_texts)
            neg_skill_embeddings = model.encode(neg_skill_texts)

            # Validate embedding shapes
            if query_embedding.shape[0] == 0:
                print("   ‚ùå Empty query embedding, skipping...")
                continue
            if pos_skill_embeddings.shape[0] == 0:
                print("   ‚ùå Empty positive skill embeddings, skipping...")
                continue
            if neg_skill_embeddings.shape[0] == 0:
                print("   ‚ùå Empty negative skill embeddings, skipping...")
                continue

            print(f"   üìê Embedding shapes: query{query_embedding.shape}, pos{pos_skill_embeddings.shape}, neg{neg_skill_embeddings.shape}")

            # Compute similarities (row 0 = the single query)
            pos_similarities = model.similarity(query_embedding, pos_skill_embeddings)[0]
            neg_similarities = model.similarity(query_embedding, neg_skill_embeddings)[0]

            pos_scores = pos_similarities.tolist()
            neg_scores = neg_similarities.tolist()

            # Store for overall analysis
            all_pos_similarities.extend(pos_scores)
            all_neg_similarities.extend(neg_scores)

            # Per-query ranking: every positive against every negative of this query
            for pos_score in pos_scores:
                for neg_score in neg_scores:
                    if pos_score > neg_score:
                        correct_rankings += 1
                    total_comparisons += 1

            print("   üìä Positive skills (should be high similarity):")
            for skill, score in zip(pos_test_skills, pos_similarities):
                print(f"      {score:.4f} - {skill[:80]}{'...' if len(skill) > 80 else ''}")

            print("   üìä Negative skills (should be low similarity):")
            for skill, score in zip(neg_test_skills, neg_similarities):
                print(f"      {score:.4f} - {skill[:80]}{'...' if len(skill) > 80 else ''}")

        except Exception as e:
            # Best effort: report the failing sample and carry on with the next one
            print(f"   ‚ùå Error processing sample {idx+1}: {str(e)}")
            print(f"   üîç Query length: {len(test_query)}")
            print(f"   üîç Pos skills: {len(pos_test_skills)} items")
            print(f"   üîç Neg skills: {len(neg_test_skills)} items")
            continue

    # Overall performance analysis
    print("\nüìà OVERALL PERFORMANCE ANALYSIS:")
    print("=" * 40)

    if not all_pos_similarities or not all_neg_similarities:
        print("‚ùå No valid similarities computed!")
        print(f"   Positive similarities: {len(all_pos_similarities)}")
        print(f"   Negative similarities: {len(all_neg_similarities)}")
        print("üí° This suggests:")
        print("   - Training data may have empty or invalid entries")
        print("   - Model encoding may be failing")
        print("   - Check data format and model loading")
    else:
        import numpy as np

        pos_mean = np.mean(all_pos_similarities)
        pos_std = np.std(all_pos_similarities)
        neg_mean = np.mean(all_neg_similarities)
        neg_std = np.std(all_neg_similarities)

        # Gap between the average positive and the average negative similarity
        separation = pos_mean - neg_mean

        print(f"‚úÖ Positive similarities: {pos_mean:.4f} ¬± {pos_std:.4f}")
        print(f"‚ùå Negative similarities: {neg_mean:.4f} ¬± {neg_std:.4f}")
        print(f"üìè Separation (higher is better): {separation:.4f}")

        # Quality assessment
        if separation > 0.2:
            quality = "üåü Excellent"
        elif separation > 0.1:
            quality = "‚úÖ Good"
        elif separation > 0.05:
            quality = "‚ö†Ô∏è Fair"
        else:
            quality = "‚ùå Poor"

        print(f"üéØ Model Quality: {quality}")

        # Ranking test (tallies were accumulated per query in the loop above)
        print("\nüèÜ RANKING TEST:")
        print("Testing if positive skills rank higher than negative skills...")

        ranking_accuracy = correct_rankings / total_comparisons if total_comparisons > 0 else 0
        print(f"üéØ Ranking Accuracy: {ranking_accuracy:.3f} ({correct_rankings}/{total_comparisons})")

        if ranking_accuracy > 0.8:
            ranking_quality = "üåü Excellent"
        elif ranking_accuracy > 0.7:
            ranking_quality = "‚úÖ Good"
        elif ranking_accuracy > 0.6:
            ranking_quality = "‚ö†Ô∏è Fair"
        else:
            ranking_quality = "‚ùå Poor"

        print(f"üèÜ Ranking Quality: {ranking_quality}")

        # Recommendations
        print("\nüí° RECOMMENDATIONS:")
        if separation < 0.1:
            print("   üìà Consider increasing training epochs or learning rate")
            print("   üéØ Add more negative examples or harder negatives")
        if ranking_accuracy < 0.7:
            print("   üîÑ Try different loss functions (CoSENT, Triplet)")
            print("   üìä Increase batch size for better negative sampling")
        if pos_std > 0.3:
            print("   üìè High variance in positive similarities - check data quality")

else:
    print("‚ùå No test data available")
    print("üí° Suggestions:")
    print("   - Check if training data was loaded properly")
    print("   - Verify data format: should have 'query', 'pos', 'neg' fields")
    print("   - Ensure positive and negative skills are not empty lists")

In [None]:
import zipfile
from pathlib import Path
import tempfile
import shutil
from sentence_transformers import SentenceTransformer

# Package the deployed model as a zip archive, then verify the archive by
# extracting it to a temporary directory and loading it with SentenceTransformer.
# The directory published by the deployment cell is preferred; otherwise the
# default deployed-model location is used.
default_deploy_dir = Path('/kaggle/working/test_encoder_only_base_multilingual-e5-base_deployed')
output_dir = Path(globals().get('deployed_model_dir', default_deploy_dir))

# File names that are always packaged when present (reported as "required").
required_files = [
    'config.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'special_tokens_map.json',
    'sentencepiece.bpe.model',
    'model.safetensors',
    'pytorch_model.bin',
    'modules.json',
    'config_sentence_transformers.json',
    'sentence_bert_config.json'
]

if not output_dir.exists():
    print("‚ùå No deployed model found to package - run the cell that creates 'deployed_model_dir' first")
else:
    print("üì¶ Creating download package (including required files from deployed model)...")

    # Archive name (relative) and its full destination path
    filename = "finetuned_esco_model.zip"
    zip_filename = Path(f"/kaggle/working/{filename}")

    # README text stored inside the archive
    usage_info = f"""# Fine-tuned ESCO Skill Retrieval Model

This archive contains the files required to load the model with SentenceTransformers/HuggingFace.

Required files included (when available): {', '.join(required_files)}

Loading example:

```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('path/to/extracted/model')
```

Note: If you see errors about missing tokenizer or config, ensure the extracted folder contains `config.json` and tokenizer files.
"""

    skipped_dirs = ('logs', 'runs')
    model_suffixes = ('.safetensors', '.bin', '.json')
    asset_markers = ('tokenizer', 'vocab', 'module')

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(output_dir):
            # Prune in place so os.walk never descends into checkpoints or log folders
            dirs[:] = [d for d in dirs if not (d.startswith('checkpoint-') or d in skipped_dirs)]

            for file in files:
                file_path = Path(root) / file
                rel = file_path.relative_to(output_dir)

                is_required = file in required_files
                is_model_file = file.endswith(model_suffixes) or file.startswith('pytorch_model')
                lowered = file.lower()
                is_tokenizer_asset = any(marker in lowered for marker in asset_markers)

                if not (is_required or is_model_file or is_tokenizer_asset):
                    continue

                zipf.write(file_path, rel)
                if is_required:
                    print(f"   + {rel} (required)")
                else:
                    print(f"   + {rel}")

        zipf.writestr('README.md', usage_info)

    # Report the archive size in MB
    file_size = zip_filename.stat().st_size / 1e6

    print(f"\n‚úÖ Download package created: {zip_filename}")
    print(f"üìä Size: {file_size:.1f} MB")

    # Verification: unpack into a scratch directory and try to load the model from it
    print('\nüîé Verifying archive by extracting and loading model...')
    tmpdir = Path(tempfile.mkdtemp())
    try:
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall(tmpdir)

        try:
            test_model = SentenceTransformer(str(tmpdir))
            print('‚úÖ Verification load successful: SentenceTransformer can load the extracted archive')
        except Exception as e:
            # Show what was extracted, then re-raise so the cell fails visibly
            print('‚ùå Verification load failed: ', e)
            print('Contents extracted to:', tmpdir)
            print('List of extracted files:')
            for extracted in tmpdir.rglob('*'):
                print('  ', extracted.relative_to(tmpdir))
            raise
    finally:
        # Best-effort removal of the scratch directory
        shutil.rmtree(tmpdir, ignore_errors=True)


In [None]:
# Create download link
# `filename` is the relative archive name ("finetuned_esco_model.zip") assigned by
# the packaging cell, and only when a deployed model directory was found.
# NOTE(review): the link presumably resolves against the notebook's working
# directory -- confirm that is /kaggle/working, where the zip is written.
from IPython.display import FileLink
FileLink(filename)