In [101]:
import datetime
import os

import constants

from azure.ai.ml import command, Input, Output
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential



In [102]:
try:
    credential = DefaultAzureCredential()
    # Request a token right away so an unusable credential fails here
    # instead of on the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # DefaultAzureCredential could not produce a token: say why, then fall
    # back to an interactive browser login.
    print(f"DefaultAzureCredential failed ({type(ex).__name__}: {ex}); falling back to InteractiveBrowserCredential")
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace; the identifiers are read from the local `constants` module.
ml_client = MLClient(
    credential=credential,
    subscription_id=constants.SUBSCRIPTION_ID,
    resource_group_name=constants.RESOURCE_GROUP_NAME,
    workspace_name=constants.WORKSPACE_NAME,
)

In [103]:
%%writefile components/prep.py 

from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from pathlib import Path
import datetime
import json

credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
secret_client = SecretClient(vault_url="https://mlgroup.vault.azure.net/", credential=credential)

import argparse

parser = argparse.ArgumentParser("prep")
parser.add_argument("--blob_storage", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--prep_output")
args = parser.parse_args()

# log in to the Blob Service Client
blob_storage = args.blob_storage
blob_storage_key = secret_client.get_secret("blob-storage-key")
blob_service_client = BlobServiceClient(blob_storage, account_key=blob_storage_key.value)

# connect to the container 
container_client = blob_service_client.get_container_client(container="stock-news-json") 

# list and download all currently available blobs
blob_list = container_client.list_blobs()
print(f"Blob from: {blob_storage} has these blobs today: {blob_list}")

# get the timestamp with the current day 
current_day_timestamp = datetime.datetime.today().timestamp()
current_day_timestamp = str(current_day_timestamp)[:5] # first 8 digits are the timestamp of the day

blobs_to_use = [blob.name for blob in blob_list if current_day_timestamp in blob.name]
for blob in blobs_to_use:
      print(f"Downloading blob: {blob}")
      blob_client = blob_service_client.get_blob_client(container="stock-news-json", blob=blob)
      with open(blob, mode="wb") as sample_blob:
            download_stream = blob_client.download_blob()
            sample_blob.write(download_stream.readall())

all_data_dict = {}
for json_file in blobs_to_use:
      with open(json_file,"r+") as file:
      # First we load existing data into a dict.
            file_data = json.load(file)
            all_data_dict.update(file_data)
            
with open((Path(args.prep_output) / "merged_stock_news.json"), "w") as file:
      file.write(json.dumps(all_data_dict, indent=4))

# this is a comments

Overwriting components/prep.py


In [104]:
%%writefile components/classify.py

from pathlib import Path
import argparse
import json
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification

parser = argparse.ArgumentParser()
parser.add_argument("--classify_input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--classify_output", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# download distilbert model from HuggingFace
tokenizer = AutoTokenizer.from_pretrained("KernAI/stock-news-destilbert")
model = AutoModelForSequenceClassification.from_pretrained("KernAI/stock-news-destilbert")

# retriev the list of blobs from the current day - input is a .txt file
with open(os.path.join(args.classify_input, "merged_stock_news.json"), "r") as f:
      data = json.load(f)
texts = data["texts"]


sentiments = []
for text in texts: 
      tokenized_text = tokenizer(
            text,
            truncation=True,
            is_split_into_words=False,
            return_tensors="pt"
      )

      outputs = model(tokenized_text["input_ids"])
      outputs_logits = outputs.logits.argmax(1)

      mapping = {0: 'neutral', 1: 'negative', 2: 'positive'}
      predicted_label = mapping[int(outputs_logits[0])]
      sentiments.append(predicted_label)

# add the sentiments to the data
data["sentiments"] = sentiments

# overwrite old files with new files containing the sentiment
with open((Path(args.classify_output) / "merged_stock_news.json"), "w") as f:
      json.dump(data, f)

Overwriting components/classify.py


In [105]:
%%writefile components/summarize.py

from pathlib import Path
import argparse
import json
import os

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

parser = argparse.ArgumentParser()
parser.add_argument("--summarize_input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--summarize_output", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# load the model and the tokenizer
tokenizer = PegasusTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")
model = PegasusForConditionalGeneration.from_pretrained("human-centered-summarization/financial-summarization-pegasus") 

# retriev the list of blobs from the current day - input is a .txt file
with open(os.path.join(args.summarize_input, "merged_stock_news.json"), "r") as f:
      data = json.load(f)
texts = data["texts"]

summaries = []
for text in texts: 
      # Tokenize our text
      # If you want to run the code in Tensorflow, please remember to return the particular tensors as simply as using return_tensors = 'tf'
      input_ids = tokenizer(text, return_tensors="pt").input_ids

      # Generate the output (Here, we use beam search but you can also use any other strategy you like)
      output = model.generate(
            input_ids, 
            max_length=32, 
            num_beams=5, 
            early_stopping=True
      )

      # Finally, we can print the generated summary
      summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))

# add the sentiments to the data
data["summaries"] = summaries

# overwrite old files with new files containing the sentiment
with open((Path(args.summarize_output) / "merged_stock_news.json"), "w") as f:
      json.dump(data, f)

Overwriting components/summarize.py


In [None]:
%%writefile components/notify.py

from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from pathlib import Path
import datetime
import json

credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
secret_client = SecretClient(vault_url="https://mlgroup.vault.azure.net/", credential=credential)

import argparse

parser = argparse.ArgumentParser("prep")
parser.add_argument("--notify_input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--notify_output")
args = parser.parse_args()

# get the timestamp with the current day 
current_day_timestamp = datetime.datetime.today().timestamp()

with open(os.path.join(args.summarize_input, "merged_stock_news.json"), "r") as f:
      data = json.load(f)

# analyze the results of the sentiment and




data = json.dumps(article_info)

# connect and authenticate to the blob client
account_url = "https://mlstorageleo.blob.core.windows.net"
file_name = f"processed-stock-news-{current_day_timestamp}.json"

# Create the BlobServiceClient object
blob_storage_key = secret_client.get_secret("blob-storage-key")
blob_service_client = BlobServiceClient(account_url, credential=blob_storage_key.value)
blob_client = blob_service_client.get_blob_client(container="processed-stock-news-json", blob=file_name)
blob_client.upload_blob(data)

with open((Path(args.notify_output) / "merged_stock_news.json"), "w") as file:
      file.write(json.dumps(all_data_dict, indent=4))

In [8]:
import json

# Read the example JSON that the following cells use to prototype the notification text
with open("data/processed-example.json") as example_file:
    data = json.load(example_file)

In [11]:
sentiments = data["sentiments"]
# Count how often each label occurs among the predicted sentiments
pos, neu, neg = (
    sentiments.count(label) for label in ("positive", "neutral", "negative")
)
# The first entry of the "ticker" list is used as the headline ticker
ticker = data["ticker"][0]

In [14]:
# Compose a one-line overview of the label counts, prefixed with the ticker taken from the example data
sentiment_message = f"{ticker}: {pos} positive, {neu} neutral, {neg} negative"
print(sentiment_message)

TXT: 6 positive, 1 neutral, 0 negative


In [17]:
# Join the generated summaries so each one is printed on its own line
summaries = "\n".join(data["summaries"])
print(summaries)

Shares of Texas Instruments Incorporated (TXN) have been showing 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299 888-353-1299
Analysts have provided the following ratings for Texas Instruments (:TXN).
BBC Sport takes a look back at some of the more memorable moments from this year’s FA Cup.
Over the past 20 years, Texas Instruments has returned 11.85% on average.
Check out our latest analysis for Texas Instruments Return on Equity (ROE) weighs Texas Instruments' profit against the level of shareholders' equity.
Texas Instruments Incorporated (TXN) shares up by 0.48% in previous day’s close.
All photographs subject to copyright.


In [106]:
%%writefile dependencies/conda.yml
# Conda specification for the pipeline components; it is passed as `conda_file`
# when the Azure ML Environment is created in this notebook.
# NOTE(review): the pip packages are unpinned, so a rebuilt environment may
# resolve different versions.
name: stock-analysis-env
channels:
  - conda-forge
dependencies:
  - python=3.9
  - pip
  - pip:
    # Azure SDK packages imported by the prep and notify scripts
    - azure-storage-blob
    - azure-identity
    # NOTE(review): the scripts import azure.keyvault.secrets; presumably this
    # package provides it - confirm.
    - azure-keyvault
    # Imported by classify.py and summarize.py, which request tensors with return_tensors="pt"
    - transformers
    - torch
    # NOTE(review): presumably needed by the Pegasus tokenizer - confirm.
    - sentencepiece
    # NOTE(review): not imported directly by any component script in this notebook.
    - numpy

Overwriting dependencies/conda.yml


In [107]:
custom_env_name = "stock-analysis-env"
# Bump this whenever dependencies/conda.yml changes: an already registered
# version is reused as-is and is not rebuilt from the file.
version = "1.6"

try:
    # Reuse the environment if this exact name/version is already registered.
    pipeline_job_env = ml_client.environments.get(custom_env_name, version=version)

except Exception:
    # The lookup failed, so register the environment from the conda file.
    # `Exception` rather than a bare `except`, so Ctrl-C / SystemExit are not swallowed.
    pipeline_job_env = Environment(
        name=custom_env_name,
        description="Custom environment for stock analysis pipeline",
        conda_file=os.path.join("dependencies", "conda.yml"),
        image="mcr.microsoft.com/azureml/curated/python-sdk-v2:4",
        version=version,
    )
    pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

    print(
        f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
    )

In [108]:
# Asset type used for the folder-based component inputs and outputs defined in the following cells.
data_type = AssetTypes.URI_FOLDER
# An azureml:// datastore path.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
path = "azureml://datastores/stocknewsjson/stock-news-json"
# Mode for component inputs.
input_mode = InputOutputModes.RO_MOUNT
# Mode for component outputs.
output_mode = InputOutputModes.RW_MOUNT

In [109]:
# Step 1: runs components/prep.py, which downloads the current news blobs and
# writes the merged JSON to `prep_output`.
data_prep_component = command(
    name="data_prep",
    display_name="Finding out which blobs to actually use",
    description="Loads today's files from Azure Blob Storage",
    inputs={
        # the storage account URL is handed to the script as a plain value
        "blob_storage": Input(mode=InputOutputModes.DIRECT)
    },
    outputs={
        "prep_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/prep.py",
    command="python prep.py --blob_storage ${{inputs.blob_storage}} --prep_output ${{outputs.prep_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # a real boolean: the string "false" is truthy in Python
    is_deterministic=False
)

In [110]:
# Step 2: runs components/classify.py on the merged news and writes the data,
# extended by the predicted sentiments, to `classify_output`.
classify_component = command(
    # own name instead of the copy-pasted "data_prep"
    name="classify",
    display_name="Classify the sentiments of todays stock news",
    description="Reads the merged stock news, predicts a sentiment for each text and stores the result",
    inputs={
        "classify_input": Input(type=data_type, mode=input_mode),
    },
    outputs={
        "classify_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/classify.py",
    command="python classify.py --classify_input ${{inputs.classify_input}} --classify_output ${{outputs.classify_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # a real boolean: the string "false" is truthy in Python
    is_deterministic=False
)

In [111]:
# Step 3: runs components/summarize.py on the classified news and writes the
# data, extended by the generated summaries, to `summarize_output`.
summarize_component = command(
    # own name instead of the copy-pasted "data_prep"
    name="summarize",
    display_name="Summarize the news",
    description="Reads the classified stock news, generates a summary for each text and stores the result",
    inputs={
        "summarize_input": Input(type=data_type, mode=input_mode),
    },
    outputs={
        "summarize_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/summarize.py",
    command="python summarize.py --summarize_input ${{inputs.summarize_input}} --summarize_output ${{outputs.summarize_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # a real boolean: the string "false" is truthy in Python
    is_deterministic=False
)

In [112]:
from azure.ai.ml.dsl import pipeline

# Chains the three components: prep -> classify -> summarize.
# NOTE(review): the local variable names inside the function may be used by the
# DSL as step names - confirm before renaming them.
@pipeline(compute="ava")
def stock_news_pipeline():

    # step 1: collect the news blobs from the storage account
    data_prep_job = data_prep_component(
        blob_storage="https://mlstorageleo.blob.core.windows.net/"
    )
    classify_job = classify_component(
        classify_input=data_prep_job.outputs.prep_output

    ) # feed the output of the prep step into the classification step
    # step 3: summarize the texts of the classified news
    summarize_job = summarize_component(
        summarize_input = classify_job.outputs.classify_output
    )

    # expose the output of the last step as the pipeline output
    return {"processed_file": summarize_job.outputs.summarize_output}

pipeline_job = stock_news_pipeline()

# set pipeline level compute (the same target is already given to the @pipeline decorator)
pipeline_job.settings.default_compute = "ava"
# NOTE(review): this assigns the string "false", not a boolean; confirm that
# `reuse_component` is a setting the service honours (`force_rerun` may be the
# intended one).
pipeline_job.settings.reuse_component = "false"

In [113]:
# submit job to workspace
# Submit the pipeline under the experiment "stock-news-analysis-pipeline".
# `pipeline_job` is rebound to the object returned by create_or_update, so
# re-running only this cell passes that returned object back in.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="stock-news-analysis-pipeline"
)
# last expression of the cell: display the submitted job
pipeline_job

[32mUploading classify.py[32m (< 1 MB): 100%|##########| 1.53k/1.53k [00:00<00:00, 21.1kB/s]
[39m

[32mUploading summarize.py[32m (< 1 MB): 100%|##########| 1.82k/1.82k [00:00<00:00, 28.6kB/s]
[39m



Experiment,Name,Type,Status,Details Page
stock-news-analysis-pipeline,honest_tangelo_w4jjb1dppp,pipeline,Preparing,Link to Azure Machine Learning studio


In [114]:
# Stream the logs of the submitted job into the notebook and wait until the job completes
ml_client.jobs.stream(pipeline_job.name)

RunId: honest_tangelo_w4jjb1dppp
Web View: https://ml.azure.com/runs/honest_tangelo_w4jjb1dppp?wsid=/subscriptions/5a361d37-b562-4eee-981b-0936493063e9/resourcegroups/mlgroup/workspaces/mlworkspace

Streaming logs/azureml/executionlogs.txt

[2023-07-09 19:37:12Z] Submitting 1 runs, first five are: 73d93c12:de61c408-4d35-42e4-ac58-dc7ea6500732
[2023-07-09 19:37:39Z] Completing processing run id de61c408-4d35-42e4-ac58-dc7ea6500732.
[2023-07-09 19:37:39Z] Submitting 1 runs, first five are: 27a9ddf8:3f0cb8bb-a1a6-4246-823e-6af3ff7ec8b9
[2023-07-09 19:38:29Z] Completing processing run id 3f0cb8bb-a1a6-4246-823e-6af3ff7ec8b9.
[2023-07-09 19:38:30Z] Submitting 1 runs, first five are: 9c985e6c:16a065ed-01f4-4b51-802b-34a16f7444d0
[2023-07-09 19:43:50Z] Completing processing run id 16a065ed-01f4-4b51-802b-34a16f7444d0.

Execution Summary
RunId: honest_tangelo_w4jjb1dppp
Web View: https://ml.azure.com/runs/honest_tangelo_w4jjb1dppp?wsid=/subscriptions/5a361d37-b562-4eee-981b-0936493063e9/resour