In [1]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt

import azureml
from azureml.core import Workspace, Dataset

# Report the version of the Azure ML SDK that this kernel has loaded.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.19.0


# Inicjacja przestrzeni roboczej

In [2]:
# Obtain the workspace handle via Workspace.from_config() and echo its identity,
# one attribute per line.
ws = Workspace.from_config()

workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Workspace name: sentencecompletion
Azure region: westeurope
Subscription id: 5334814e-153c-45f0-9557-00aa820611b9
Resource group: SentenceCompletionAzure


# Zamontowanie zbiorów danych i utworzenie eksperymentu Azure ML

In [3]:
def _mount_and_list(registered_name):
    """Mount a registered dataset and print its mount point and directory listing.

    Looks the dataset up by name in `ws`, starts a mount, prints the mount point
    and then the result of `os.listdir` on it.

    Returns a tuple `(dataset, mount_context, mount_point, file_names)`.
    """
    ds = Dataset.get_by_name(ws, name=registered_name)
    ctx = ds.mount()
    ctx.start()
    mount_point = ctx.mount_point
    print(mount_point)
    file_names = os.listdir(mount_point)
    print(file_names)
    return ds, ctx, mount_point, file_names


datastore = ws.get_default_datastore()

# NOTE(review): both mounts are started here and never stopped in the visible cells;
# confirm whether the mount contexts should be stopped once the listings are printed.
dataset, mount_ctx0, dataset_mount_corpora, corpora_files = _mount_and_list('Korpus-segmentation')

emb_dataset, mount_ctx1, dataset_mount_word2vec, w2v_files = _mount_and_list('word2vec-wiki-nkfp-polish')

/tmp/tmpbfp5uxij
['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt']
/tmp/tmpc_irouvb
['nkjp+wiki-forms-all-100-cbow-hs.txt']


In [4]:
from azureml.core import Experiment

# Local folders for the training script (and a retrain variant); created if missing.
script_folder = './scripts'
retrain_folder = './scripts/retrain'
for folder in (script_folder, retrain_folder):
    os.makedirs(folder, exist_ok=True)

# Experiment under which the training run is submitted.
exp = Experiment(ws, 'textgen-lstm')

# Utworzenie środowiska obliczeniowego

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ml-gpu-cl-lp"

# Reuse the named cluster when the lookup succeeds; otherwise provision a new one.
# NOTE(review): the recorded status of the existing cluster reports a different VM size,
# priority and node limit than this fallback configuration — confirm which is intended.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing compute target')

# Wait (up to 20 minutes) for provisioning; with min_node_count=None the cluster's
# own scale settings decide how many nodes to wait for.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# Print the detailed status of the cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 1, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-26T14:11:41.458000+00:00', 'errors': None, 'creationTime': '2021-01-25T23:57:12.617905+00:00', 'modifiedTime': '2021-01-25T23:57:28.296417+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'LowPriority', 'vmSize': 'STANDARD_NC6S_V3'}


In [6]:
# One line per compute target in the workspace: name, type and provisioning state.
compute_targets = ws.compute_targets
for target_name in compute_targets:
    target = compute_targets[target_name]
    print(target_name, target.type, target.provisioning_state)

pre-preprocessing ComputeInstance Succeeded
ml-cpu ComputeInstance Succeeded
ml-gpu-cl-lp AmlCompute Succeeded


# Kopiowanie skryptu treningowego do folderu skryptów

In [7]:
import shutil

# Place the training script in the folder that is submitted with the run;
# the destination path is shown as the cell output.
shutil.copy(src='./script.py', dst=script_folder)


'./scripts/script.py'

In [8]:
# Print the copied training script so its contents are recorded in the notebook.
copied_script = os.path.join(script_folder, './script.py')
with open(copied_script, 'r') as script_file:
    script_text = script_file.read()
print(script_text)

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import normalize
from gensim import models

import numpy as np
import os
import string

from azureml.core import Run

import numpy as np
import os
import time
import glob
import string


parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point')
parser.add_argument('--model-folder', type=str, dest='model_folder', default=None, help='model to train')
parser.add_argument('--embed-folder', type=str, dest='embed_folder', default='embed', help='embed-folder mounting point')
parser.add_argument('--text-id', type=int, dest='text_id', default=1)
parser.add_argument('--text-size', type=int, dest='text_size',

# Utworzenie środowiska

In [9]:
%%writefile conda_dependencies.yml

# Conda specification for the training environment; this cell writes it to conda_dependencies.yml.
channels:
- conda-forge
dependencies:
- python=3.6.9
# The packages nested under this entry are installed with pip rather than conda.
- pip:
  # NOTE(review): h5py is capped at 2.10.0 — presumably for compatibility with tensorflow 2.3; confirm.
  - h5py<=2.10.0
  - azureml-defaults
  - tensorflow==2.3
  # NOTE(review): matplotlib and gensim are unpinned, so a rebuilt environment may resolve different versions.
  - matplotlib
  - gensim

Overwriting conda_dependencies.yml


In [10]:
from azureml.core import Environment

# Build the run environment from the conda specification file in the working directory.
textgen_env = Environment.from_conda_specification(
    name='textgen-lstm',
    file_path='./conda_dependencies.yml',
)

# Run inside Docker on a CUDA 10.1 / cuDNN 7 base image.
textgen_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'
textgen_env.docker.enabled = True

# Disabled alternative: fetch an environment registered in the workspace by name instead.
#textgen_env = Environment.get(ws, name='AzureML-TensorFlow-2.3-GPU')

# Konfiguracja treningu

In [11]:
from azureml.core import ScriptRunConfig

# Both datasets are handed to the script as mounted, named inputs.
corpus_input = dataset.as_named_input('text').as_mount()
embedding_input = emb_dataset.as_named_input('emb').as_mount()

# Command-line arguments for script.py, as flag/value pairs.
# NOTE(review): the corpus listing printed earlier shows 1.txt through 9.txt, while
# --text-id is 10 — confirm how the script interprets this value.
args = [
    '--data-folder', corpus_input,
    '--embed-folder', embedding_input,
    '--text-id', 10,
    '--text-size', 30000000,
    '--epochs', 1,
]

# Run configuration: which script to run, with which arguments, where, and in which environment.
src = ScriptRunConfig(
    source_directory=script_folder,
    script='script.py',
    arguments=args,
    compute_target=compute_target,
    environment=textgen_env,
)

# Zatwierdzenie zadania do uruchomienia

In [12]:
# Submit the configured training job to the experiment; `run` is the handle used by the cells below.
run = exp.submit(config=src)

In [13]:
from azureml.widgets import RunDetails

# Show the run-details widget for the submitted run.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [14]:
# Bare expression: the notebook renders the run object as this cell's output.
run

Experiment,Id,Type,Status,Details Page,Docs Page
textgen-lstm,textgen-lstm_1611696706_5b26c855,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [15]:
# Wait for the submitted run to complete, with show_output=True so its logs appear in the cell output.
run.wait_for_completion(show_output=True)

RunId: textgen-lstm_1611696706_5b26c855
Web View: https://ml.azure.com/experiments/textgen-lstm/runs/textgen-lstm_1611696706_5b26c855?wsid=/subscriptions/5334814e-153c-45f0-9557-00aa820611b9/resourcegroups/SentenceCompletionAzure/workspaces/sentencecompletion

Streaming azureml-logs/55_azureml-execution-tvmps_411b773d49147460e0ef990349bec4329fd37252ef7b5a8397e7ba564238b8ad_p.txt

2021-01-26T21:39:15Z Starting output-watcher...
2021-01-26T21:39:15Z IsDedicatedCompute == False, starting polling for Low-Pri Preemption

Streaming azureml-logs/70_driver_log.txt

2021/01/26 21:39:41 Attempt 1 of http call to http://10.0.0.5:16384/sendlogstoartifacts/info
2021/01/26 21:39:41 Attempt 1 of http call to http://10.0.0.5:16384/sendlogstoartifacts/status
[2021-01-26T21:39:42.809946] Entering context manager injector.
[context_manager_injector.py] Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'Dataset:context_managers.Datasets', 'RunHistory:context_m

In [None]:
# Display the run's details as the cell output.
run.get_details()

In [None]:
# Display the metrics recorded for the run as the cell output.
run.get_metrics()

In [None]:
# Display the names of the files stored with the run as the cell output.
run.get_file_names()

In [None]:
from azureml.core import Experiment, Run

# Re-attach to an earlier run by id so that its saved model files can be downloaded.
# NOTE(review): this rebinds `ws`, `exp` and `run`, and it targets experiment 'textgen'
# rather than the 'textgen-lstm' experiment used for the submission above — confirm
# that this hard-coded run id is the one whose model should be loaded.
ws = Workspace.from_config()

exp = Experiment(workspace=ws, name='textgen')

run = Run(exp, 'textgen_1611390281_8b2cf5c2')

# Remote prefix of the saved model inside the run's files, and the local folder mirroring it.
model_prefix = 'outputs/model/'
model_dir = './model'

# create a model folder in the current directory
os.makedirs(model_dir, exist_ok=True)

for f in run.get_file_names():
    # Filter on the same prefix that is stripped below. Filtering on 'outputs/' alone
    # placed non-model outputs under ./model/outputs/... with the prefix still attached.
    if not f.startswith(model_prefix):
        continue
    # Strip the prefix only at the start of the name, never elsewhere in the path.
    output_path = os.path.join(model_dir, f[len(model_prefix):])
    print('Downloading from {} to {} ...'.format(f, output_path))
    run.download_file(name=f, output_file_path=output_path)

In [None]:
# No earlier cell in the visible notebook imports TensorFlow; without this import the
# `tf` reference below raises NameError on a fresh kernel.
import tensorflow as tf

# Reload the exported one-step generation model from the downloaded model folder.
one_step_reloaded = tf.saved_model.load('model/one_step_model')

In [None]:
# Generate text with the reloaded one-step model, starting from a seed string.
states = None
# The seed literal was missing its closing quote, which made the whole cell a SyntaxError.
next_char = tf.constant(['pan to'])
result = [next_char]

# 110 generation steps: each call takes the previous output and state and returns
# the next output together with the updated state.
for step in range(110):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the seed and all generated pieces into a single string tensor.
result = tf.strings.join(result)

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)