In [27]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Custom Azure ML environment used for the LightGBM training runs.
environment = Environment(name="lightgbm-environment")

# Dependency set: LightGBM comes from conda, the remaining packages from pip
# (scikit-learn, the Azure ML SDK and pandas, added in that order).
conda_deps = CondaDependencies()
conda_deps.add_conda_package("lightgbm")
for pip_package in ("scikit-learn", "azureml-core", "pandas"):
    conda_deps.add_pip_package(pip_package)

# Attach the dependency set to the environment's Python section.
environment.python.conda_dependencies = conda_deps

# Register the environment in the workspace (optional but recommended).
# Left as the last expression so the call's return value is shown as the cell output.
environment.register(ws)

{
    "assetId": "azureml://locations/qatarcentral/workspaces/5cff68aa-74ba-4160-a72d-a58499b4c19d/environments/lightgbm-environment/versions/2",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20240709.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "lightgbm-en

In [28]:
from azureml.core import ComputeTarget

# Look up the compute target in the workspace by name.
compute_name = "cpu-cluster"  # Replace with your compute target name
compute_target = ComputeTarget(workspace=ws, name=compute_name)

In [29]:
# Imported in this cell (like the neighbouring cells do for the names they use)
# so the cell does not depend on an import executed elsewhere in the kernel.
from azureml.core import ScriptRunConfig

# Define the script to run: train_lightgbm.py from the current directory,
# executed on the selected compute target inside the custom environment.
script_run_config = ScriptRunConfig(
    source_directory=".",
    script="train_lightgbm.py",  # Script to train LightGBM
    compute_target=compute_target,  # Use the compute target object
    environment=environment  # Use the custom environment
)

In [37]:
from azureml.core import Workspace, Experiment, Environment, ComputeTarget, ScriptRunConfig
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal, RandomParameterSampling, BanditPolicy, uniform, choice

# Connect to the workspace described by the local configuration.
ws = Workspace.from_config()

# Resolve the workspace resources the sweep uses: the compute cluster and
# the registered custom environment.
compute_target = ComputeTarget(workspace=ws, name="cpu-cluster")
environment = Environment.get(workspace=ws, name="lightgbm-environment")

# Training script configuration handed to HyperDrive.
script_run_config = ScriptRunConfig(
    source_directory=".",  # Directory containing train_lightgbm.py
    script="train_lightgbm.py",
    compute_target=compute_target,
    environment=environment,
)

# Hyperparameter search space, sampled at random.
search_space = {
    "learning_rate": uniform(0.01, 0.1),
    "num_leaves": choice(20, 30, 40, 50),
    "max_depth": choice(5, 10, 15),
    "min_child_samples": choice(10, 20, 30),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0, 1),
}
param_sampling = RandomParameterSampling(search_space)

# Early termination policy for the sweep.
early_termination_policy = BanditPolicy(
    evaluation_interval=1,
    slack_factor=0.1,
    delay_evaluation=5,
)

# HyperDrive configuration: maximize the "AUC" metric over at most 20 runs,
# with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=script_run_config,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="AUC",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

# Submit the sweep and block until it finishes, streaming its logs.
# The final call stays the last expression so its result is the cell output.
experiment = Experiment(workspace=ws, name="lightgbm-hyperdrive")
hyperdrive_run = experiment.submit(hyperdrive_config)
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_c4e80891-78d1-4858-ad32-1e598a90cc8f
Web View: https://ml.azure.com/runs/HD_c4e80891-78d1-4858-ad32-1e598a90cc8f?wsid=/subscriptions/b2303599-e8d0-4d67-bdeb-dd94f4970c1d/resourcegroups/frauddetectionrg/workspaces/frauddetectionml&tid=4df09b9c-261e-4691-89d8-c6fcd95ee9f9

Streaming azureml-logs/hyperdrive.txt

[2025-02-08T00:04:07.6249964Z][GENERATOR][DEBUG]Sampled 4 jobs from search space 
[2025-02-08T00:04:08.0007649Z][SCHEDULER][INFO]Scheduling job, id='HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_0' 
[2025-02-08T00:04:08.0846435Z][SCHEDULER][INFO]Scheduling job, id='HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_2' 
[2025-02-08T00:04:08.0424397Z][SCHEDULER][INFO]Scheduling job, id='HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_1' 
[2025-02-08T00:04:08.1301388Z][SCHEDULER][INFO]Scheduling job, id='HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_3' 
[2025-02-08T00:04:09.3075129Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_0' 
[2025-02-08T00:04:09.3580805

{'runId': 'HD_c4e80891-78d1-4858-ad32-1e598a90cc8f',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2025-02-08T00:04:05.957466Z',
 'endTimeUtc': '2025-02-08T00:16:44.713344Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"AUC","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'f7cea96b-226f-41ba-a3f4-2dca54bf8361',
  'user_agent': 'python/3.10.11 (Linux-5.15.0-1073-azure-x86_64-with-glibc2.31) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.57.0',
  'best_child_run_id': 'HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_6',
  'score': '0.9834741754045456',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_6'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlClientType': 'azureml-sdk-train'

In [38]:
# Get the best child run of the sweep according to the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # No child run reported the primary metric (e.g. every run failed or the
    # training script never logged "AUC"); fail with a clear message instead
    # of an AttributeError on None in the lines below.
    raise RuntimeError(
        "HyperDrive returned no best run: no child run logged the primary metric 'AUC'."
    )

# Metrics logged by the best run, and the command-line arguments it was
# launched with (these carry the sampled hyperparameter values).
best_metrics = best_run.get_metrics()
best_hyperparameters = best_run.get_details()['runDefinition']['arguments']

print("Best Run ID:", best_run.id)
print("Best AUC:", best_metrics["AUC"])
print("Best Hyperparameters:", best_hyperparameters)

Best Run ID: HD_c4e80891-78d1-4858-ad32-1e598a90cc8f_6
Best AUC: 0.9834741754045456
Best Hyperparameters: ['--learning_rate', '0.06084840317135841', '--max_depth', '5', '--min_child_samples', '30', '--num_leaves', '50', '--reg_alpha', '0.7466014547285096', '--reg_lambda', '0.726135067018163']


In [39]:
from azureml.core import Model

# Register the best model. The AUC tag is read from the run's logged metrics
# rather than typed by hand, so it cannot drift from the actual result.
# Assumes "AUC" was logged as a single scalar per run.
best_auc = best_run.get_metrics()["AUC"]
best_run.register_model(
    model_name="lightgbm-fraud-detection",
    model_path="outputs/model.pkl",  # Path to the saved model
    description="Best LightGBM model for fraud detection",
    # Tag values are stored as strings; AUC is rounded to four decimals.
    tags={"algorithm": "LightGBM", "metric": "AUC", "value": f"{best_auc:.4f}"}
)

print("Best model registered.")

Best model registered.


In [40]:
# Collect the names of the files stored with the best run and show them.
artifacts = best_run.get_file_names()
print(f"Artifacts: {artifacts}")

Artifacts: ['outputs/model.pkl', 'system_logs/cs_capability/cs-capability.log', 'system_logs/hosttools_capability/hosttools-capability.log', 'system_logs/lifecycler/execution-wrapper.log', 'system_logs/lifecycler/lifecycler.log', 'system_logs/metrics_capability/metrics-capability.log', 'system_logs/snapshot_capability/snapshot-capability.log', 'user_logs/std_log.txt']
