# Notebook de contrôle d'entraînement sur Azure ML
##### Stockage des données sur le cloud effectué en amont

### Lancement d'une session Azure, ouverture du workspace et création d'une expérience et d'une ressource de calcul

In [None]:
cd C:\\Users\\Lewin\\Downloads\\OC\\Projet_8\\

In [None]:
import azureml.core
# Print the VERSION attribute of the imported azureml.core package.
print(azureml.core.VERSION)

In [None]:
from azureml.core import Workspace

# Obtain the workspace handle through Workspace.from_config().
ws = Workspace.from_config()

# Print name, location and resource group on a single tab-separated line.
workspace_summary = (ws.name, ws.location, ws.resource_group)
print(*workspace_summary, sep='\t')

In [None]:
from azureml.core import Experiment
# Name given to the experiment created below.
experiment_name = 'HRnet-final'
# Experiment handle bound to workspace `ws`; `exp.submit(...)` is called on
# it further down to launch the training runs.
exp = Experiment(workspace=ws, name=experiment_name)

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster definition. Each setting can be overridden through an environment
# variable; the literals are the fallbacks used when the variable is unset.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "gpucluster")
# os.environ only holds strings, so the node counts are cast explicitly:
# without the cast an overridden value would be passed on as str while the
# fallback is an int.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1))

# VM definition - STANDARD_NC6 (GPU VM) had a promotional offer at project time
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6_Promo")

# Check if a compute is available before creating one
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # A target with this name exists but is not an AmlCompute cluster.
        # It is still kept in `compute_target`, as before, but the mismatch
        # is reported instead of passing silently.
        print('warning: compute target ' + compute_name
              + ' exists but is not an AmlCompute cluster')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # Wait for cluster creation to be complete
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Print compute details
    print(compute_target.get_status().serialize())

In [None]:
# Fetch the default datastore of workspace `ws` and print it.
ds = ws.get_default_datastore()
print(ds)

In [None]:
# for n,items in ws.datastores.items():
#     print(items)

Ligne d'upload du dataset - à garder en commentaire une fois les données envoyées :

In [None]:
# ds.upload(src_dir="C:\\Users\\Lewin\\Downloads\\OC\\Projet_8\\data", target_path='cityscapes_data', overwrite=True, show_progress=True)

In [None]:
# Build a mount reference to the './cityscapes_data' path of datastore `ds`.
# The result is not assigned to any variable here.
ds.path('./cityscapes_data').as_mount()

### Création du script d'entraînement et exécution d'un entraînement

In [None]:
import os
# Local folder that is later passed as `source_directory` when configuring
# the training runs.
script_folder = 'C:\\Users\\Lewin\\Downloads\\OC\\Projet_8\\Azure_scripts'
# Create the folder; exist_ok=True means no error if it already exists.
os.makedirs(script_folder, exist_ok=True)

In [None]:
# Bare expression: echoes the script folder path (presumably shown as the
# cell output when run in a notebook).
script_folder

### Version ancienne - classe Estimator

In [None]:
from azureml.train.estimator import Estimator


# Legacy submission path: describe the training job with an Estimator.

# Command-line parameters handed to the entry script, filled in one by one.
script_args = {}
script_args['--workspace'] = ws.name
script_args['--datafolder'] = ds.path('cityscapes_data/').as_mount()
script_args['--epochs'] = 100

# Keyword settings for the Estimator, gathered first and unpacked below.
estimator_settings = dict(
    source_directory=script_folder,
    script_params=script_args,
    compute_target=compute_target,
    entry_script='HRnet_training_script.py',
    pip_packages=['pandas', 'tensorflow-gpu', 'numpy', 'matplotlib',
                  'opencv-python', 'tqdm'],
)
est = Estimator(**estimator_settings)


In [None]:
# Submit the Estimator configuration `est` to the experiment; the object
# returned by submit() is kept in `run`.
run = exp.submit(config=est)


### Version la plus récente et multiworker : classe ScriptRunConfig

In [None]:
from azureml.core import ScriptRunConfig, Environment, Experiment, Dataset
from azureml.core.runconfig import TensorflowConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core.graph import PipelineParameter
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

# Look up the curated TensorFlow GPU environment by name and clone it under
# the name 'tf_env'.
curated_env_name = 'AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu'
curated_env = Environment.get(workspace=ws, name=curated_env_name)
tf_env = curated_env.clone('tf_env')

# Distributed job settings: one worker and one parameter server.
distr_config = TensorflowConfiguration(worker_count=1, parameter_server_count=1)

# Collect the extra pip packages in a fresh CondaDependencies object and
# assign it as the conda dependencies of the cloned environment.
# NOTE(review): this assignment replaces whatever conda dependencies the
# cloned environment carried - confirm that is intended.
conda_dep = CondaDependencies()
for extra_pip_package in ('numpy', 'pandas', 'matplotlib', 'tqdm'):
    conda_dep.add_pip_package(extra_pip_package)
tf_env.python.conda_dependencies = conda_dep

# Dataset registered as 'data', wrapped in a pipeline parameter and then in
# a mounted dataset consumption config.
# NOTE(review): `dataset_input` is not referenced again in this cell.
dataset = Dataset.get_by_name(ws, name='data')
file_pipeline_param = PipelineParameter(name="file_ds_param", default_value=dataset)
dataset_input = DatasetConsumptionConfig("input_1", file_pipeline_param).as_mount()

# Mount reference to the 'cityscapes_data' path of the default datastore.
datastore = ws.get_default_datastore()
data_ref = datastore.path('cityscapes_data').as_mount()

# Command-line arguments for the training script, as flag/value pairs.
argslist = (['--workspace', ws.name]
            + ['--datafolder', str(data_ref)]
            + ['--epochs', 100])

multi_config = ScriptRunConfig(
    source_directory=script_folder,
    arguments=argslist,
    script='HRnet_training_script.py',
    compute_target=compute_target,
    environment=tf_env,
    distributed_job_config=distr_config,
)

# Declare the data reference on the run configuration, keyed by its
# reference name.
run_data_references = {data_ref.data_reference_name: data_ref.to_config()}
multi_config.run_config.data_references = run_data_references

### Exécution de l'experiment : 

In [None]:
# Submit the ScriptRunConfig `multi_config` to the experiment; the object
# returned by submit() is kept in `run`.
run = exp.submit(config=multi_config)

In [None]:
# To re-attach to the run after a network interruption during execution:
# from azureml.core.run import get_run
# run_id = 'vanilla_unet_v1_1628877112_860e5a75'
# run = get_run(exp, run_id, rehydrate = True)

In [None]:
from azureml.widgets import RunDetails

# Build the RunDetails widget for the current `run`, then call show() on it.
run_widget = RunDetails(run)
run_widget.show()

In [None]:
# Wait for `run` to complete, with show_output=True.
run.wait_for_completion(show_output=True)


### Enregistrement du modèle et suppression du cluster de calcul: 

In [None]:
# Register the run's 'outputs/hrnet' path as a model named 'hrnet' and keep
# the returned object in `model`.
model = run.register_model(model_name='hrnet', model_path='outputs/hrnet')

In [None]:
# Delete the compute target used for training.
compute_target.delete()