# BGE-M3 Korean Embedding Fine-tuning (Azure ML Studio)

이 노트북은 합성 데이터 생성 → 데이터 자산 등록 → 실험/실행 제출 → 모델 등록 → 엔드포인트 배포를 단계적으로 수행합니다.

필수 환경 변수:
- AZURE_SUBSCRIPTION_ID
- AZURE_RESOURCE_GROUP
- AZUREML_WORKSPACE_NAME
- AZUREML_COMPUTE_NAME (선택 사항, 설정하지 않으면 기본값 cpu-cluster 사용)

In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# so that the `src` package can be imported from this notebook.
notebook_dir = Path().resolve()
project_root = notebook_dir.parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

import os

from azure.ai.ml import MLClient, command, Input
from azure.ai.ml.entities import Data, AmlCompute, ManagedOnlineEndpoint, ManagedOnlineDeployment, Model
from azure.identity import DefaultAzureCredential

from src.synth_data import read_terms, synthesize_examples, save_jsonl

from dotenv import load_dotenv

# Load settings through python-dotenv before reading the environment.
load_dotenv()

subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID')
resource_group = os.environ.get('AZURE_RESOURCE_GROUP')
workspace_name = os.environ.get('AZUREML_WORKSPACE_NAME')
# The compute name is optional and falls back to 'cpu-cluster'.
compute_name = os.environ.get('AZUREML_COMPUTE_NAME', 'cpu-cluster')

# Validate with an explicit exception rather than `assert` (asserts are removed
# under `python -O`) and report exactly which variables are missing or empty.
missing_vars = [
    var_name
    for var_name, value in (
        ('AZURE_SUBSCRIPTION_ID', subscription_id),
        ('AZURE_RESOURCE_GROUP', resource_group),
        ('AZUREML_WORKSPACE_NAME', workspace_name),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(f"환경 변수를 확인하세요. (누락: {', '.join(missing_vars)})")


## 1) 합성 데이터 생성

In [3]:
# Input and output locations, relative to the notebook's working directory.
terms_path = Path('../data/terms_electronics_ko.txt')
output_path = Path('../data/synthetic_electronics.jsonl')

# Read the term list, synthesize 6 pairs per term with a fixed seed, and
# write the resulting records as JSONL.
terms = read_terms(terms_path)
records = synthesize_examples(terms, n_pairs_per_term=6, seed=42)
save_jsonl(records, output_path)

# Show how many records were generated.
len(records)

966

## 2) Azure ML Client 생성

In [15]:
# Make the Azure CLI reachable, report its login state, and build the MLClient
# with AzureCliCredential (which relies on the `az` executable).
import os
import shutil
import subprocess

from azure.identity import AzureCliCredential

# Prepend the Homebrew bin directory so that `az` can be located. Compare
# against the individual PATH entries (not a substring of the whole string),
# use the platform's separator, and skip the directory if it does not exist.
az_cli_path = '/opt/homebrew/bin'
path_entries = [entry for entry in os.environ.get('PATH', '').split(os.pathsep) if entry]
if os.path.isdir(az_cli_path) and az_cli_path not in path_entries:
    os.environ['PATH'] = os.pathsep.join([az_cli_path, *path_entries])
    print(f"Added {az_cli_path} to PATH")

# Check that the Azure CLI can be found and whether it is logged in.
az_location = shutil.which('az')
if az_location is None:
    print("✗ Azure CLI not found on PATH. Install it and run 'az login' in terminal")
else:
    print(f"Azure CLI found at: {az_location}")
    # Run the executable that was just resolved rather than a bare 'az'.
    result = subprocess.run([az_location, 'account', 'show'], capture_output=True, text=True)
    if result.returncode == 0:
        print("✓ Azure CLI authentication is active")
    else:
        print("✗ Azure CLI not authenticated. Run 'az login' in terminal")

credential = AzureCliCredential()

ml_client = MLClient(
    credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)
print(f"✓ ML Client created for workspace: {workspace_name}")

Azure CLI found at: /opt/homebrew/bin/az
✓ Azure CLI authentication is active


Overriding of current MeterProvider is not allowed
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


✓ ML Client created for workspace: andyaml


## 3) 데이터 자산 등록

In [6]:
# Register the generated JSONL file as version 4 of a `uri_file` data asset
# and keep the entity returned by the workspace.
data_asset = ml_client.data.create_or_update(
    Data(
        name='bge-m3-kr-synth-train',
        version='4',
        type='uri_file',
        path=str(output_path),
    )
)
data_asset

Data({'path': 'azureml://subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourcegroups/aistudio/workspaces/andyaml/datastores/workspaceblobstore/paths/LocalUpload/938ff3b466df609131a62c73e91ee2981ecb00d27a43a3d5156093cdc8a824a8/synthetic_electronics.jsonl', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'bge-m3-kr-synth-train', 'description': None, 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourceGroups/aistudio/providers/Microsoft.MachineLearningServices/workspaces/andyaml/data/bge-m3-kr-synth-train/versions/4', 'Resource__source_path': '', 'base_path': '/Users/andy/works/ai/finetuning-embedding/notebooks', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x11b79d610>, 'serialize': <msrest.serialization.Serializer object at 0x11b79e8d0>, 've

## 4) 컴퓨트 클러스터 준비

In [17]:
# Reuse the compute target if it exists; create it only when the lookup reports
# "not found". Other failures (authentication, network, permissions) are left
# to propagate instead of being mistaken for a missing cluster.
from azure.core.exceptions import ResourceNotFoundError

try:
    ml_client.compute.get(compute_name)
except ResourceNotFoundError:
    # min_instances=0 lets the cluster scale to zero nodes when idle.
    compute = AmlCompute(name=compute_name, size='Standard_DS3_v2', min_instances=0, max_instances=2)
    ml_client.compute.begin_create_or_update(compute).result()
    print('Compute created:', compute_name)
else:
    print('Compute exists:', compute_name)

Compute exists: clustercpu


## 5) 실험(Experiment) 및 실행(Run) 제출

In [28]:
from azure.ai.ml.entities import Environment

# Pip packages installed into the training environment.
pip_requirements = [
    "sentence-transformers>=2.5.0",
    "torch>=2.1.0",
    "datasets>=2.18.0",
    "accelerate>=1.1.0",
    "mlflow>=2.10.0",
    "numpy>=1.26.0",
    "scikit-learn>=1.3.0",
    "tqdm>=4.66.0",
]

# Conda specification: Python 3.10 and pip, plus the pip packages listed above.
conda_spec = {
    "name": "bge-m3-finetune",
    "channels": ["pytorch", "conda-forge", "defaults"],
    "dependencies": [
        "python=3.10",
        "pip",
        {"pip": pip_requirements},
    ],
}

custom_env = Environment(
    name="bge-m3-finetune-env",
    version="1",
    description="Custom environment for BGE-M3 Korean embedding fine-tuning",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file=conda_spec,
)

# Register the environment first so the job can reference it by its ID.
registered_env = ml_client.environments.create_or_update(custom_env)
print(f"✓ Environment registered: {registered_env.name}:{registered_env.version}")
print(f"  ID: {registered_env.id}")

# Training command line; it references the job's `train_data` input and
# `model_output` output through ${{...}} placeholders.
train_command = (
    'python -m src.train'
    ' --train_data ${{inputs.train_data}}'
    ' --output_dir ${{outputs.model_output}}'
    ' --epochs 1 --batch_size 16 --lr 2e-5 --max_seq_length 256'
)

job = command(
    display_name='bge-m3-kr-finetune',
    experiment_name='bge-m3-kr-finetune',
    code='../.',
    command=train_command,
    inputs={'train_data': Input(type='uri_file', path=data_asset.id)},
    outputs={'model_output': {'type': 'uri_folder'}},
    environment=registered_env.id,
    compute=compute_name,
)
print("Job environment:", job.environment)

# Submit the job and display the returned job entity.
created_job = ml_client.jobs.create_or_update(job)
created_job

✓ Environment registered: bge-m3-finetune-env:1
  ID: /subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourceGroups/aistudio/providers/Microsoft.MachineLearningServices/workspaces/andyaml/environments/bge-m3-finetune-env/versions/1
Job environment: /subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourceGroups/aistudio/providers/Microsoft.MachineLearningServices/workspaces/andyaml/environments/bge-m3-finetune-env/versions/1


Uploading finetuning-embedding (2.36 MBs): 100%|██████████| 2359412/2359412 [00:04<00:00, 512728.95it/s] 


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Experiment,Name,Type,Status,Details Page
bge-m3-kr-finetune,tough_potato_nfnnkv0hn7,command,Starting,Link to Azure Machine Learning studio


In [27]:
# Stream the logs of `created_job` into the notebook output.
ml_client.jobs.stream(created_job.name)

RunId: placid_battery_17md7gl357
Web View: https://ml.azure.com/runs/placid_battery_17md7gl357?wsid=/subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourcegroups/aistudio/workspaces/andyaml

Execution Summary
RunId: placid_battery_17md7gl357
Web View: https://ml.azure.com/runs/placid_battery_17md7gl357?wsid=/subscriptions/e0493f49-bc5c-4207-a643-9b5f6503a36d/resourcegroups/aistudio/workspaces/andyaml


JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Job submission to cjpvm encountered an Exception with status code BadRequest, Compute instance is not in running status, please start compute before submit a job",
        "message_format": "Job submission to {Component} encountered an Exception with status code {ErrorCode}, {Message}",
        "message_parameters": {
            "Component": "cjpvm",
            "ErrorCode": "BadRequest",
            "Message": "Compute instance is not in running status, please start compute before submit a job"
        },
        "details": [],
        "inner_error": {
            "code": "BadArgument",
            "inner_error": {
                "code": "BadJobSubmissionRequestError"
            }
        }
    },
    "environment": "japaneast",
    "location": "japaneast",
    "time": "2026-02-10T07:37:19.112262Z",
    "component_name": "cjpvm"
} 

## 6) 모델 등록

In [None]:
# Register the trained model from the training job's `model_output` output.
# `created_job.outputs['model_output']` is an output binding object rather than
# a path, so the model location is given as a job-output URI that the
# workspace resolves. The job must have completed before this cell is run.
model = Model(
    name='bge-m3-kr-embedding-model',
    path=f"azureml://jobs/{created_job.name}/outputs/model_output",
    type='custom_model',
)
registered_model = ml_client.models.create_or_update(model)
registered_model

## 7) 엔드포인트 및 배포

In [None]:
# Create the managed online endpoint, deploy the registered model as the
# 'blue' deployment, and then route all traffic to it.
from azure.ai.ml.entities import CodeConfiguration

endpoint = ManagedOnlineEndpoint(
    name='bge-m3-kr-embeddings',
    auth_mode='key',
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

deployment = ManagedOnlineDeployment(
    name='blue',
    endpoint_name=endpoint.name,
    model=registered_model.id,
    # Azure ML curated environment, referenced by its registry path.
    environment='azureml://registries/azureml/environments/AzureML-pytorch-2.1-ubuntu20.04-py310-cpu/labels/latest',
    instance_type='Standard_DS3_v2',
    instance_count=1,
    # The scoring code is described with a CodeConfiguration entity instead of
    # a plain dict.
    code_configuration=CodeConfiguration(
        code='../.',
        scoring_script='src/score.py',
    ),
)
ml_client.online_deployments.begin_create_or_update(deployment).result()

# Traffic is a property of the endpoint: set it on the entity and update the
# endpoint (the online endpoint operations have no separate traffic call).
endpoint.traffic = {'blue': 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

KeyboardInterrupt: 

## 8) 엔드포인트 테스트

배포된 엔드포인트에 샘플 요청을 보내 정상 작동을 확인합니다.

In [None]:
import json

# Directory that holds the request payload for the endpoint test.
deploy_dir = Path('deploy')
deploy_dir.mkdir(exist_ok=True)

# Sample request for the embedding model: three Korean sentences.
sample_request = {
    "inputs": {
        "data": [
            "전자제품 추천 부탁드립니다",
            "노트북 구매 고려중입니다",
            "스마트폰 배터리 수명이 궁금합니다",
        ],
    },
}

# Serialize once and reuse the text for both the file and the preview below.
sample_request_path = deploy_dir / 'sample-request.json'
request_json = json.dumps(sample_request, ensure_ascii=False, indent=2)
sample_request_path.write_text(request_json, encoding='utf-8')

print(f"샘플 요청 파일 생성: {sample_request_path}")
print(request_json)

In [None]:
# Send the saved sample request to the 'blue' deployment of the endpoint.
response = ml_client.online_endpoints.invoke(
    deployment_name='blue',
    request_file=str(sample_request_path),
    endpoint_name=endpoint.name,
)

# Print the header and the raw response on separate lines.
print("응답 결과:", response, sep="\n")

## 9) 리소스 정리

비용 절감을 위해 사용하지 않는 리소스를 정리합니다.

In [None]:
# Delete the endpoint (uncomment if needed)
# ml_client.online_endpoints.begin_delete(name=endpoint.name).result()
# print(f"엔드포인트 '{endpoint.name}' 삭제 완료")

# The compute cluster scales down automatically, so it does not need to be deleted separately.
# To remove it completely anyway:
# ml_client.compute.begin_delete(compute_name).result()
# print(f"컴퓨팅 클러스터 '{compute_name}' 삭제 완료")