In [1]:
import os
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import json
from typing import Optional, List

def download_embedding_models(
    model_count: int = 1,
    save_directory: str = r"D:\Code AI Before Sleep\Model",
    force_download: bool = False,
    use_auth_token: Optional[str] = None,
    custom_models: Optional[List[str]] = None
) -> List[str]:
    """
    Download the first ``model_count`` models of a list from Hugging Face.

    Each model is stored in its own sub-directory of ``save_directory``; the
    sub-directory name is the repo id with path separators replaced by ``_``.
    A per-model failure is reported on stdout and does not abort the batch.

    Args:
        model_count (int): Number of models to download. Values below 1 are
            raised to 1; values above the number of available models are
            lowered to that number (a warning is printed).
        save_directory (str): Directory to save the models (created if missing).
        force_download (bool): Whether to force re-download even if the
            model directory already exists.
        use_auth_token (str, optional): Hugging Face authentication token.
        custom_models (List[str], optional): Custom list of models to choose
            from. When ``None`` or empty, the built-in default list is used.

    Returns:
        List[str]: Paths of the models that are available locally after the
        call (freshly downloaded or already present), in processing order.
    """
    # Function-scope import: only needed to clean up after a failed download.
    import shutil

    # Default popular embedding models (ordered by popularity/usefulness)
    default_models = [
        "sentence-transformers/all-MiniLM-L6-v2",           # 1. Fast and efficient
        "sentence-transformers/all-mpnet-base-v2",          # 2. High quality
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # 3. Multilingual
        "sentence-transformers/all-distilroberta-v1",       # 4. Good balance
        "sentence-transformers/all-MiniLM-L12-v2",          # 5. Better than L6
        "sentence-transformers/multi-qa-mpnet-base-dot-v1", # 6. Q&A optimized
        "sentence-transformers/paraphrase-MiniLM-L6-v2",    # 7. Paraphrase detection
        "sentence-transformers/msmarco-distilbert-base-v4", # 8. Search optimized
        "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",  # 9. Multi-domain Q&A
        "sentence-transformers/all-roberta-large-v1"        # 10. Large model
    ]

    # Use custom models if provided (and non-empty), otherwise use default
    available_models = custom_models if custom_models else default_models

    # Validate model count. Keep the caller's value so the warning reports
    # what was actually requested rather than the clamped number.
    requested_count = model_count
    if model_count < 1:
        model_count = 1
    elif model_count > len(available_models):
        model_count = len(available_models)
        print(f"⚠️  Requested {requested_count} models, but only {len(available_models)} available. Downloading all.")

    # Select models to download
    models_to_download = available_models[:model_count]

    print(f"📥 Downloading {model_count} embedding model(s):")
    for i, model in enumerate(models_to_download, 1):
        print(f"  {i}. {model}")
    print()

    # Create save directory
    save_path = Path(save_directory)
    save_path.mkdir(parents=True, exist_ok=True)

    downloaded_paths = []
    # Repo ids kept alongside the paths: the directory name cannot be mapped
    # back to the repo id reliably because ids may themselves contain "_".
    downloaded_names = []

    for i, model_name in enumerate(models_to_download, 1):
        # Reset per iteration so the error handler never sees (and deletes)
        # the directory of a previously processed model.
        model_save_path = None
        try:
            print(f"[{i}/{model_count}] Processing: {model_name}")

            # Create model-specific directory
            model_dir_name = model_name.replace("/", "_").replace("\\", "_")
            model_save_path = save_path / model_dir_name

            # Check if model already exists
            if model_save_path.exists() and not force_download:
                print(f"  ✓ Model already exists, skipping download")
                downloaded_paths.append(str(model_save_path))
                downloaded_names.append(model_name)
                continue

            model_save_path.mkdir(parents=True, exist_ok=True)

            # NOTE(security): trust_remote_code=True lets the hub repository
            # execute its own Python code locally; only use trusted repos.
            # force_download is forwarded so files already cached under
            # cache_dir are fetched again when a re-download was requested.

            # Download tokenizer
            print(f"  📄 Downloading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=use_auth_token,
                trust_remote_code=True,
                cache_dir=str(model_save_path),
                force_download=force_download
            )
            tokenizer.save_pretrained(model_save_path)

            # Download model
            print(f"  🤖 Downloading model...")
            model = AutoModel.from_pretrained(
                model_name,
                token=use_auth_token,
                trust_remote_code=True,
                cache_dir=str(model_save_path),
                force_download=force_download
            )
            model.save_pretrained(model_save_path)

            # Save basic model info
            model_info = {
                "model_name": model_name,
                "model_path": str(model_save_path),
                "tokenizer_vocab_size": getattr(tokenizer, "vocab_size", None)
            }

            with open(model_save_path / "model_info.json", "w", encoding="utf-8") as f:
                json.dump(model_info, f, indent=2)

            print(f"  ✅ Successfully downloaded to: {model_save_path}")
            downloaded_paths.append(str(model_save_path))
            downloaded_names.append(model_name)

        except Exception as e:
            # Deliberately broad: one failing model must not stop the batch.
            print(f"  ❌ Error downloading {model_name}: {str(e)}")
            # Clean up partial download. ignore_errors keeps a cleanup
            # problem (e.g. a locked file) from aborting the remaining models.
            if model_save_path is not None and model_save_path.exists():
                shutil.rmtree(model_save_path, ignore_errors=True)
            continue

    successful_downloads = len(downloaded_paths)

    print(f"\n🎉 Download Summary:")
    print(f"  ✅ Successfully downloaded: {successful_downloads}/{model_count} models")
    print(f"  📁 Saved to: {save_directory}")

    if downloaded_names:
        print(f"  📋 Downloaded models:")
        for name in downloaded_names:
            print(f"    - {name}")

    return downloaded_paths

# Example usage
if __name__ == "__main__":
    # Examples using the built-in default list (uncomment one to run it):
    #   download_embedding_models(model_count=1)   # download 1 model
    #   download_embedding_models(model_count=2)   # download 2 models
    #   download_embedding_models(model_count=5)   # download 5 models

    # You can choose what gets downloaded: to fetch two or more models, list
    # their names in the array below and change model_count to the number of
    # models you want to download.

    # Download custom models
    custom_list = [
        "cross-encoder/ms-marco-TinyBERT-L6",
        # "cross-encoder/ms-marco-MiniLM-L12-v2",
        # "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        # "sentence-transformers/all-mpnet-base-v2"
    ]

    # The token is read from the "HuggingFaceToken" environment variable and
    # is None when that variable is not set.
    hf_token = os.environ.get("HuggingFaceToken")

    download_embedding_models(
        custom_models=custom_list,
        model_count=1,
        use_auth_token=hf_token,
    )

📥 Downloading 1 embedding model(s):
  1. cross-encoder/ms-marco-TinyBERT-L6

[1/1] Processing: cross-encoder/ms-marco-TinyBERT-L6
  📄 Downloading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

  🤖 Downloading model...


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  ✅ Successfully downloaded to: D:\Code AI Before Sleep\Model\cross-encoder_ms-marco-TinyBERT-L6

🎉 Download Summary:
  ✅ Successfully downloaded: 1/1 models
  📁 Saved to: D:\Code AI Before Sleep\Model
  📋 Downloaded models:
    - cross-encoder/ms-marco-TinyBERT-L6
