In [3]:
from pydantic import BaseModel
from typing import Any, Literal, TYPE_CHECKING, TypedDict

if TYPE_CHECKING:
    from trl import GRPOConfig
    from vllm.engine.arg_utils import AsyncEngineArgs

    from art import types


def get_model_config(
    base_model: "types.BaseModel", config: "ModelConfig | None"
) -> "ModelConfig":
    if config is None:
        config = ModelConfig()
    config.init_args = InitArgs(
        model_name=base_model,
        max_seq_length=8192,
        load_in_4bit=True,  # False for LoRA 16bit
        fast_inference=True,  # Enable vLLM fast inference
        # vLLM args
        disable_log_requests=True,
        disable_log_stats=False,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.62,  # Reduce if out of memory
        max_lora_rank=32,
        num_scheduler_steps=16,
        use_async=True,
        **(config.init_args or {}),
    )
    return config


class ModelConfig(BaseModel):
    """
    Model configuration.

    Args:
        init: Arguments for initializing an Unsloth FastLanguageModel.
        peft: Arguments for creating an Unsloth PEFT model wrapper.
        train: Arguments for training the model.
    """

    init_args: "InitArgs | None" = None
    peft_args: "PeftArgs | None" = None
    # train_args: "GRPOConfig | None" = None


class OpenAIServerConfig(BaseModel):
    """
    Server configuration.

    Args:
        server_args: Arguments for the vLLM OpenAI-compatible server.
        engine_args: Additional vLLM engine arguments for the OpenAI-compatible server.
                     Note that since the vLLM engine is initialized with Unsloth,
                     these additional arguments will only have an effect if the
                     OpenAI-compatible server uses them elsewhere.
    """

    server_args: "ServerArgs | None" = None
    engine_args: "AsyncEngineArgs | None" = None


class InitArgs(TypedDict, total=False):
    model_name: str
    max_seq_length: int
    dtype: str | None
    load_in_4bit: bool
    load_in_8bit: bool
    full_finetuning: bool
    token: str | None
    device_map: str
    rope_scaling: dict | None
    fix_tokenizer: bool
    trust_remote_code: bool
    use_gradient_checkpointing: str
    resize_model_vocab: int | None
    revision: str | None
    use_exact_model_name: bool
    fast_inference: bool
    gpu_memory_utilization: float
    float8_kv_cache: bool
    random_state: int
    max_lora_rank: int
    disable_log_requests: bool
    disable_log_stats: bool
    enable_prefix_caching: bool
    num_scheduler_steps: int
    use_async: bool


class PeftArgs(TypedDict, total=False):
    r: int
    target_modules: list[str]
    lora_alpha: int
    lora_dropout: float
    bias: str
    layers_to_transform: list[int] | None
    layers_pattern: str | None
    use_gradient_checkpointing: bool | str
    random_state: int
    max_seq_length: int  # not used anymore
    use_rslora: bool
    modules_to_save: list[str] | None
    init_lora_weights: bool
    loftq_config: dict
    temporary_location: str

get_model_config("Qwen/Qwen2.5-14B-Instruct", ModelConfig(init_args=InitArgs(model_name="Qwen/Qwen2.5-14B-Instruct")))

TypeError: __main__.InitArgs() got multiple values for keyword argument 'model_name'

In [3]:
from dataclasses import dataclass
from pydantic import BaseModel


@dataclass
class EngineArgs:
    params: list[str]

class ServerConfig(BaseModel):
    engine_args: EngineArgs


ServerConfig.model_validate(ServerConfig(engine_args=EngineArgs(params=["--host", "0.0.0.0", "--port", "8000"])).model_dump())

ServerConfig(engine_args=EngineArgs(params=['--host', '0.0.0.0', '--port', '8000']))

In [None]:
from mp_actors import move_to_child_process
import asyncio


class Service:
    async def load_unsloth(self) -> None:
        import unsloth

    async def greet(self, name: str, sleep: float) -> str:
        await asyncio.sleep(sleep)
        return f"Hello, {name}!"

    def raise_error(self) -> None:
        raise ValueError("This is a test error")


service = Service()
service = move_to_child_process(service)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-29 18:20:23 __init__.py:207] Automatically detected platform cuda.


In [2]:
await service.load_unsloth()
await service.greet("World", 1.0)

'Hello, World!'