In [1]:
import datetime
import os

import constants

from azure.ai.ml import command, Input, Output
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential



In [2]:
try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id=constants.SUBSCRIPTION_ID,
    resource_group_name=constants.RESOURCE_GROUP_NAME,
    workspace_name=constants.WORKSPACE_NAME,
)

In [56]:
%%writefile components/prep.py 

from azure.storage.blob import BlobServiceClient

import argparse

parser = argparse.ArgumentParser("prep")
parser.add_argument("--blob_storage_read", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--account_url")
args = parser.parse_args()

# log in to the Blob Service Client
account_url = args.account_url
blob_service_client = BlobServiceClient(account_url, account_key=constants.BLOB_KEY)

# connect to the container 
container_client = blob_service_client.get_container_client(container="stock-news-json") 

# list and download all currently available blobs
blob_list = container_client.list_blobs()

# get the timestamp with the current day 
current_day_timestamp = datetime.datetime.today().timestamp()
current_day_timestamp = str(current_day_timestamp)[:8] # first 8 digits are the timestamp of the day

blobs_to_use = [blob.name for blob in blob_list if current_day_timestamp in blob.name]

# # ! the files should not be downloaded in this step. Instead it might make more sense to pass a list with the filenames to the next component
# with open(args.blob_storage+"/blobs_to_use.txt", "w") as f:
#     f.write("\n".join(blob for blob in blobs_to_use), f)

(Path(args.score_output) / "blobs_to_use.txt").write_text("\n".join(blob for blob in blobs_to_use))
# continue here with this example -> https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/pipelines/1a_pipeline_with_components_from_yaml/score_src/score.py

Overwriting components/prep.py


In [57]:
%%writefile components/classify.py

from pathlib import Path
import argparse
import json
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification

parser = argparse.ArgumentParser()
parser.add_argument("--blob_storage_read", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--blobs_to_use_output", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--blob_storage_write", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# download distilbert model from HuggingFace
tokenizer = AutoTokenizer.from_pretrained("KernAI/stock-news-destilbert")
model = AutoModelForSequenceClassification.from_pretrained("KernAI/stock-news-destilbert")

# retriev the list of blobs from the current day - input is a .txt file
with open(args.blob_storage, "r") as f:
      blobs_to_use = f.read()


dir_list = args.folder_path
for file_name in [file for file in os.listdir(dir_list) if file in blobs_to_use]:
      with open(dir_list + file_name) as json_file:
            data = json.load(json_file)
      texts = data["texts"]

      sentiments = []
      for text in texts: 
            tokenized_text = tokenizer(
                  text,
                  truncation=True,
                  is_split_into_words=False,
                  return_tensors="pt"
            )

            outputs = model(tokenized_text["input_ids"])
            outputs_logits = outputs.logits.argmax(1)

            mapping = {0: 'neutral', 1: 'negative', 2: 'positive'}
            predicted_label = mapping[int(outputs_logits[0])]
            sentiments.append(predicted_label)

      # add the sentiments to the data
      data["sentiments"] = sentiments

      # overwrite old files with new files containing the sentiment
      with open(dir_list+file_name, "w") as f:
            json.dump(data, f)

Overwriting components/classify.py


In [58]:
%%writefile components/summarize.py

from pathlib import Path
import argparse
import json
import os

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--output", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# load the model and the tokenizer
tokenizer = PegasusTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")
model = PegasusForConditionalGeneration.from_pretrained("human-centered-summarization/financial-summarization-pegasus") 

dir_list = args.blob_storage
for file_name in [file for file in os.listdir(dir_list) if file.endswith('.json')]:
      with open(dir_list + file_name) as json_file:
            data = json.load(json_file)
      texts = data["texts"]

      summaries = []
      for text in texts: 
            # Tokenize our text
            # If you want to run the code in Tensorflow, please remember to return the particular tensors as simply as using return_tensors = 'tf'
            input_ids = tokenizer(text, return_tensors="pt").input_ids

            # Generate the output (Here, we use beam search but you can also use any other strategy you like)
            output = model.generate(
                  input_ids, 
                  max_length=32, 
                  num_beams=5, 
                  early_stopping=True
            )

            # Finally, we can print the generated summary
            summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))

      # add the sentiments to the data
      data["summaries"] = summaries

      # overwrite old files with new files containing the sentiment
      with open(dir_list+file_name, "w") as f:
            json.dump(data, f)

(Path())

Overwriting components/summarize.py


In [59]:
%%writefile dependencies/conda.yml
name: stock-analysis-env
channels:
  - conda-forge
dependencies:
  - python=3.9
  - pip
  - pip:
    - azure-storage
    - transformers
    - sentencepiece
    - numpy
    - json

Overwriting dependencies/conda.yml


In [60]:
custom_env_name = "stock-analysis-env"

try:    
    pipeline_job_env = ml_client.environments.get(custom_env_name, version="1.1")

except:
    pipeline_job_env = Environment(
        name=custom_env_name,
        description="Custom environment for stock analysis pipeline",
        conda_file=os.path.join("dependencies", "conda.yml"),
        image="mcr.microsoft.com/azureml/curated/python-sdk-v2:4",
        version="1.1",
    )
    pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

    print(
        f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
    )

In [61]:
data_type = AssetTypes.URI_FOLDER
mode = InputOutputModes.RW_MOUNT
path = "azureml://datastores/stocknewsjson/stock-news-json"

In [62]:
data_prep_component = command(
    name="data_prep",
    display_name="Finding out which blobs to actually use",
    description="Loads files from Azure Blob Storage from todays ",
    inputs={
        "blob_storage_read": Input(type=data_type, mode=InputOutputModes.RO_MOUNT, path=path),
        "account_url": Input(mode=InputOutputModes.DIRECT)
    },
    outputs={
        "blobs_to_use_output": Output(type=data_type, mode=InputOutputModes.RW_MOUNT, path=path)
    },
    code="./components/prep.py",
    command="python prep.py --blob_storage ${{inputs.blob_storage}} --blobs_to_use_output ${{outputs.blobs_to_use_output}}, --account_url ${{inputs.account_url}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava"
)

In [66]:
classify_component = command(
    name="data_prep",
    display_name="Classify the sentiments of todays stock news",
    description="Loads data via AlphaVantage API input, preps data and stores to as data asset",
    inputs={
        "blob_storage_read": Input(type=data_type, mode=InputOutputModes.RO_MOUNT, path=path), 
        "blobs_to_use_input": Input(type=AssetTypes.URI_FILE),
    },
    outputs={
        "blob_storage_write": Output(type=data_type, mode=InputOutputModes.RW_MOUNT, path=path)
    },
    code="./components/classify.py",
    command="python classify.py --blob_storage_read ${{inputs.blob_storage_read}} --blobs_to_use_input ${{inputs.blobs_to_use_input}} --blob_storage_write ${{outputs.blob_storage_write}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava"
)

In [67]:
summarize_component = command(
    name="data_prep",
    display_name="Summarize the news",
    description="Loads data via AlphaVantage API input, preps data and stores to as data asset",
    inputs={
        "blob_storage_read": Input(type=data_type, mode=InputOutputModes.RO_MOUNT, path=path),
        "blobs_to_use_input": Input(type=AssetTypes.URI_FILE)
    },
    outputs={
        "blob_storage_write": Output(type=data_type, mode=InputOutputModes.RW_MOUNT, path=path)
    },
    code="./components/summarize.py",
    command="python summarize.py --blob_storage_read ${{inputs.blob_storage_read}} --blob_storage_write ${{outputs.blob_storage_write}} --blobs_to_use_input ${{inputs.blobs_to_use_input}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava"
)

In [68]:
from azure.ai.ml.dsl import pipeline

@pipeline(compute="ava")
def pipeline_with_python_function_components(account_url):

    data_prep_job = data_prep_component(
        account_url=account_url
    )
    classify_job = classify_component(
        blobs_to_use_input=data_prep_job.outputs.blobs_to_use_output,
        blob_storage_read=path, 
        blob_storage_write=path
    ) # feed putput of previous step into the training job
    summarize_job = summarize_component(
        blob_storage_read=path, 
        blob_storage_write=path
    )

    return {"blobs_processed": data_prep_job.outputs.blobs_to_use_output}

pipeline_job = pipeline_with_python_function_components(
    account_url=Input(
        path="https://mlstorageleo.blob.core.windows.net"
    )
)

# set pipeline level compute
pipeline_job.settings.default_compute = "ava"

UnexpectedKeywordError: [component] Classify the sentiments of todays stock news() got an unexpected keyword argument 'blob_storage_write', valid keywords: 'blob_storage_read', 'blobs_to_use_input'.

In [54]:
# submit job to workspace
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="stock-news-analysis-pipeline"
)
pipeline_job

Exception: 
[37m
[30m
1) At least one required parameter is missing[39m[39m

Details: 

[31m(x) Input path can't be empty for jobs.[39m

Resolutions: 
1) Ensure all parameters required by the Job schema are specified.
If using the CLI, you can also check the full log in debug mode for more details by adding --debug to the end of your command

Additional Resources: The easiest way to author a yaml specification file is using IntelliSense and auto-completion Azure ML VS code extension provides: [36mhttps://code.visualstudio.com/docs/datascience/azure-machine-learning.[39m To set up VS Code, visit [36mhttps://docs.microsoft.com/azure/machine-learning/how-to-setup-vs-code[39m


In [14]:
# Wait until the job completes
ml_client.jobs.stream(pipeline_job.name)

AttributeError: 'function' object has no attribute 'name'