In [5]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt

import azureml
from azureml.core import Workspace, Dataset

# Report the installed Azure ML SDK version so the recorded output documents
# which SDK release this notebook was executed with.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.19.0


# Inicjacja przestrzeni roboczej

In [6]:
# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# One labelled line per workspace property, printed as a single block.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: deep-learning
Azure region: westeurope
Subscription id: fc814e44-1cd5-4ab5-944b-f2255f816d34
Resource group: text-generation


# Utworzenie eksperymentu Azure ML

In [7]:
from azureml.core import Experiment

# Local folders: `script_folder` holds the training script; `retrain_folder`
# is a sub-folder of it.
script_folder = './scripts'
retrain_folder = './scripts/retrain'
for folder in (script_folder, retrain_folder):
    os.makedirs(folder, exist_ok=True)

# Handle to the 'textgen' experiment in the workspace; runs are submitted
# through it.
exp = Experiment(workspace=ws, name='textgen')

In [4]:
# Default datastore attached to the workspace.
datastore = ws.get_default_datastore()
# File dataset pointing at one timestamped corpus export on that datastore.
dataset = Dataset.File.from_files(path=(datastore, 'Corpus/01-17-2021_104148_UTC/'))

# Mount the dataset only for as long as it takes to list its contents.
# Entering the context starts the mount and leaving it unmounts again (also
# when listing fails), so no mount is left behind in the kernel session.
# NOTE: `dataset_mount_folder` is only a valid path inside this `with` block.
with dataset.mount() as mount_ctx:
    dataset_mount_folder = mount_ctx.mount_point
    print(dataset_mount_folder)

    files = os.listdir(dataset_mount_folder)
    print(files)

/tmp/tmp6v_pyv7t
['corpus_text_12000000.txt', 'corpus_text_16000000.txt', 'corpus_text_20000000.txt', 'corpus_text_24000000.txt', 'corpus_text_28000000.txt', 'corpus_text_32000000.txt', 'corpus_text_36000000.txt', 'corpus_text_4000000.txt', 'corpus_text_40000000.txt', 'corpus_text_8000000.txt']


# Utworzenie środowiska obliczeniowego

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ml-gpu-cl-lp"

try:
    # Look the cluster up by name; this raises when it does not exist yet.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )

    # Provision the cluster under the requested name.
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing compute target')

# Wait (up to 20 minutes) for provisioning to settle. With
# min_node_count=None the cluster's own scale settings decide how many
# nodes to wait for.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# Dump the detailed cluster status as a plain dictionary.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-23T08:08:39.125000+00:00', 'errors': None, 'creationTime': '2021-01-23T00:55:16.611071+00:00', 'modifiedTime': '2021-01-23T01:26:38.110154+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'LowPriority', 'vmSize': 'STANDARD_NC6'}


In [6]:
# Overview of every compute target registered in the workspace:
# name, kind of compute, and provisioning state.
compute_targets = ws.compute_targets
for target_name in compute_targets:
    target = compute_targets[target_name]
    print(target_name, target.type, target.provisioning_state)

prosze ComputeInstance Succeeded
ml-gpu-cl-lp AmlCompute Succeeded
ciebietezprosze ComputeInstance Failed


# Kopiowanie danych treningowych do skryptu

In [7]:
import shutil

# Stage the training script in the folder that is later passed to
# ScriptRunConfig as `source_directory`.
shutil.copy('./script.py', script_folder)

# Best-effort staging of a previously downloaded model. A missing model is
# not an error, but the reason for skipping is reported instead of being
# swallowed silently by a bare `except`.
# NOTE(review): copytree (without dirs_exist_ok) requires that the destination
# does not exist, and `script_folder` already exists after the copy above, so
# this call is expected to raise FileExistsError on every run. Confirm the
# intended destination (for example a sub-folder of `script_folder`) before
# relying on the model being staged.
try:
    shutil.copytree('./model/model', script_folder)
except OSError as err:
    # FileNotFoundError, FileExistsError and shutil.Error are all OSError subclasses.
    print('Model not copied to {}: {}'.format(script_folder, err))

In [8]:
# Display the staged training script so the notebook records exactly what
# will be submitted.
script_path = os.path.join(script_folder, './script.py')
with open(script_path, 'r') as script_file:
    script_source = script_file.read()
    print(script_source)

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

from azureml.core import Run

import numpy as np
import os
import time
import glob
import string


parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point')
parser.add_argument('--model-folder', type=str, dest='model_folder', default=None, help='model to train')
parser.add_argument('--text-id', type=int, dest='text_id', default=1)
parser.add_argument('--text-size', type=int, dest='text_size', default=10000)
parser.add_argument('--epochs', type=int, dest='epochs', default=1)
args = parser.parse_args()

data_folder = args.data_folder
print('training dataset is stored here:', data_folder)
files = os.listdir(data_folder)

model = None
model_folder = args.model_folder
if model_folder is not None:
  print('mod

# Utworzenie środowiska

In [9]:
%%writefile conda_dependencies.yml

# Conda specification used to build the remote training environment.
channels:
- conda-forge
dependencies:
- python=3.6.9
- pip:
  - h5py<=2.10.0
  - azureml-defaults
  # The `tensorflow` wheel has included GPU support since 2.1, so the separate
  # unpinned `tensorflow-gpu` package is not listed: it could resolve to a
  # different release than the pin below and conflict with it.
  - tensorflow==2.4.1
  - matplotlib

Overwriting conda_dependencies.yml


In [10]:
from azureml.core import Environment

# Build the run environment from the conda specification written to
# ./conda_dependencies.yml.
# NOTE(review): the environment name is 'texgen' (not 'textgen'); it is kept
# as-is because it is the name the environment is registered under - confirm.
textgen_env = Environment.from_conda_specification(name = 'texgen', file_path = './conda_dependencies.yml')

# Run inside Docker on a GPU base image. TensorFlow 2.4 is built against
# CUDA 11.0 / cuDNN 8, so the image must provide those libraries; with a
# CUDA 10.0 / cuDNN 7 image TensorFlow cannot load them and falls back to CPU.
textgen_env.docker.enabled = True
textgen_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04'

# Alternative: reuse a curated environment instead of building one, e.g.
# textgen_env = Environment.get(ws, name='AzureML-TensorFlow-2.3-GPU')

# Konfiguracja treningu

In [11]:
!pip install tensorflow==2.4.1
import tensorflow as tf
tf.__version__



'2.4.1'

In [12]:
from azureml.core import ScriptRunConfig

# The corpus dataset is mounted on the compute target under the input name
# 'text'; its mount point is handed to the script as --data-folder.
mounted_corpus = dataset.as_named_input('text').as_mount()

# '--text-size' is deliberately not passed, so the script's own default applies.
args = [
    '--data-folder', mounted_corpus,
    '--text-id', 3,
    '--epochs', 1,
]

# Bundle script, arguments, compute target and environment into one
# submittable run configuration.
src = ScriptRunConfig(
    source_directory=script_folder,
    script='script.py',
    arguments=args,
    compute_target=compute_target,
    environment=textgen_env,
)

# Zatwierdzenie zadania do uruchomienia

In [13]:
# Submit the run configuration to the experiment; `run` is the handle used
# by the following cells to monitor and inspect it.
run = exp.submit(config=src)

In [14]:
from azureml.widgets import RunDetails
# Show the RunDetails notebook widget for the submitted run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [15]:
# Bare expression: the notebook renders the run's summary
# (experiment, id, type, status and links).
run

Experiment,Id,Type,Status,Details Page,Docs Page
textgen,textgen_1611390281_8b2cf5c2,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [16]:
# Wait for the run to complete; show_output=True streams its logs into the
# cell output while waiting.
run.wait_for_completion(show_output=True)

RunId: textgen_1611390281_8b2cf5c2
Web View: https://ml.azure.com/experiments/textgen/runs/textgen_1611390281_8b2cf5c2?wsid=/subscriptions/fc814e44-1cd5-4ab5-944b-f2255f816d34/resourcegroups/text-generation/workspaces/deep-learning

Streaming azureml-logs/65_job_prep-tvmps_89f7c5c27dc8af3c5cd22f248b43eae6fb116213c72b55b4d92fb9faa83b8983_p.txt

[2021-01-23T08:29:06.185484] Entering job preparation.
[2021-01-23T08:29:06.895446] Starting job preparation.
[2021-01-23T08:29:06.895494] Extracting the control code.
[2021-01-23T08:29:06.924709] fetching and extracting the control code on master node.
[2021-01-23T08:29:06.924741] Starting extract_project.
[2021-01-23T08:29:06.924777] Starting to extract zip file.
[2021-01-23T08:29:07.845455] Finished extracting zip file.
[2021-01-23T08:29:08.032369] Using urllib.request Python 3.0 or later
[2021-01-23T08:29:08.032445] Start fetching snapshots.
[2021-01-23T08:29:08.032495] Start fetching snapshot.
[2021-01-23T08:29:08.032515] Retrieving project 

In [None]:
# Display the details recorded for the run.
run.get_details()

In [None]:
# Display the metrics logged by the run.
run.get_metrics()

In [None]:
# Display the names of the files stored with the run; the download cell
# iterates over the same listing.
run.get_file_names()

In [9]:
from azureml.core import Experiment, Run

# Re-attach to a finished run by id, so this cell also works in a fresh
# kernel where `ws`, `exp` and `run` are not defined yet.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name='textgen')
run = Run(experiment=exp, run_id='textgen_1611390281_8b2cf5c2')

# Local target directory for everything downloaded below.
os.makedirs('./model', exist_ok=True)

for remote_name in run.get_file_names():
    # Only files under the run's outputs/ folder are downloaded.
    if not remote_name.startswith('outputs/'):
        continue

    # Drop the 'outputs/model/' part of the name; names without that part
    # are kept unchanged.
    relative_name = remote_name.replace('outputs/model/', "")
    print(relative_name)

    output_path = os.path.join('./model/', relative_name)
    print('Downloading from {} to {} ...'.format(remote_name, output_path))
    run.download_file(name=remote_name, output_file_path=output_path)

model/saved_model.pb
Downloading from outputs/model/model/saved_model.pb to ./model/model/saved_model.pb ...
model/variables/variables.data-00000-of-00001
Downloading from outputs/model/model/variables/variables.data-00000-of-00001 to ./model/model/variables/variables.data-00000-of-00001 ...
model/variables/variables.index
Downloading from outputs/model/model/variables/variables.index to ./model/model/variables/variables.index ...
one_step_model/saved_model.pb
Downloading from outputs/model/one_step_model/saved_model.pb to ./model/one_step_model/saved_model.pb ...
one_step_model/variables/variables.data-00000-of-00001
Downloading from outputs/model/one_step_model/variables/variables.data-00000-of-00001 to ./model/one_step_model/variables/variables.data-00000-of-00001 ...
one_step_model/variables/variables.index
Downloading from outputs/model/one_step_model/variables/variables.index to ./model/one_step_model/variables/variables.index ...
outputs/training_checkpoints/checkpoint
Downloadi



ValueError: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * Tensor("inputs:0", shape=(1,), dtype=string)
    * None
  Keyword arguments: {'states1': None, 'states2': None}

Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (2 total):
    * TensorSpec(shape=(1,), dtype=tf.string, name='inputs')
    * TensorSpec(shape=(1, 1024), dtype=tf.float32, name='states')
  Keyword arguments: {}

Option 2:
  Positional arguments (2 total):
    * TensorSpec(shape=(1,), dtype=tf.string, name='inputs')
    * None
  Keyword arguments: {}

In [11]:
import tensorflow as tf

# Load the Keras model from the local 'model/model' directory, one of the
# paths the download cell writes to.
model = tf.keras.models.load_model('model/model')

In [14]:
# Load the exported single-step generator from the downloaded SavedModel.
one_step_reloaded = tf.saved_model.load('model/one_step_model')

# Start from the seed text 'pan' with no recurrent state.
states = None
next_char = tf.constant(['pan'])
result = [next_char]

# Each call returns the next piece of text together with the updated state,
# both of which are fed back into the following call (1000 steps).
for _ in range(1000):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate seed and generated pieces, decode the first (only) batch entry
# and flatten Windows line breaks into spaces for display.
generated_text = tf.strings.join(result)[0].numpy().decode("utf-8")
print(generated_text.replace('\r\n', " "))

pan prezes jeszcze raz tu stosujemy . to świńskie , bo może itd . , przystąpić do drugiego , ale tam będą chwilę pokazują warunki , w tym wysokim produktywnym wyspecjalizowanym , z punktu widzenia prognozy 550 zł , ale postępowania – przed tym g-uraybnem przez ministerstwo konstytucyjne , oczywiście różnice , prawnie z odrzuceniami zadające po to i kładzie nauczycielom – bo to jest kraj ujścionych : „ widzami się tu , a rolnicy zmieniają się szyk , którego jest to wartość rolnikami . tych zawarty . ona jest obejścia , g , czyli przez firmę – która budziła sformułowania świadczeń miast polskiego trybunału konstytucyjnego . dlaczego chciał by m , tak jak powiedział em w wymiarze pragnie , do rozmowy . kto jest za ? kto jest przeciw ? kto się wstrzymał ? stwierdzam , że głosowało 97 senatorów , wszyscy byli za . głosowanie nr 60 poprawka została przyjęta . poprawka nr 62 doprowadziła do tego , żeby nie zmienić projektu komisji . bankowy plan zagrożenia ? dziękuję . dziękuję .
