In [3]:
import os

from azure.ai.ml import command, Input, Output
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

In [4]:
# Pick a credential for the workspace: try the default chain first and fall
# back to an interactive browser login if it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Requesting a token up front makes a broken default credential fail here,
    # inside the try, instead of on the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace (reads the workspace details from config.json)
ml_client = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [5]:
%%writefile components/prep.py 

"""Pipeline step "prep": merge today's news blobs into one JSON file.

Lists the blobs of the ``stock-news-json`` container, keeps those whose name
contains today's date (YYYY-MM-DD), merges their top-level JSON objects into a
single dict and writes it to ``<prep_output>/merged_stock_news.json``.
"""

import argparse
import datetime
import json
from pathlib import Path

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.storage.blob import BlobServiceClient

CONTAINER_NAME = "stock-news-json"

credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
secret_client = SecretClient(vault_url="https://mlgroup.vault.azure.net/", credential=credential)

parser = argparse.ArgumentParser("prep")
parser.add_argument("--blob_storage", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--prep_output")
args = parser.parse_args()

# log in to the Blob Service Client; the account key goes in through `credential`,
# the same way summarize.py builds its client
blob_storage = args.blob_storage
blob_storage_key = secret_client.get_secret("blob-storage-key")
blob_service_client = BlobServiceClient(blob_storage, credential=blob_storage_key.value)

# connect to the container
container_client = blob_service_client.get_container_client(container=CONTAINER_NAME)

# list all currently available blobs; the names are collected once so the log
# line shows them instead of the repr of the lazy listing object
blob_names = [blob.name for blob in container_client.list_blobs()]
print(f"Blob from: {blob_storage} has these blobs today: {blob_names}")

# get the timestamp with the current day (YYYY-MM-DD)
current_day_date = datetime.date.today().isoformat()

# filter out which blobs have the current date in their name
blobs_to_use = [name for name in blob_names if current_day_date in name]
if not blobs_to_use:
    print(f"No blobs found for {current_day_date}; writing an empty result")

# download each blob and combine all of them into one dictionary; the payload is
# parsed in memory, so nothing is written to the working directory and blob
# names containing "/" cannot break a local open()
# NOTE(review): dict.update means a top-level key present in several blobs keeps
# only the value from the last blob — confirm that is intended.
all_data_dict = {}
for blob_name in blobs_to_use:
    print(f"Downloading blob: {blob_name}")
    blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=blob_name)
    file_data = json.loads(blob_client.download_blob().readall())
    all_data_dict.update(file_data)

# pass aggregated file to the next step
output_dir = Path(args.prep_output)
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "merged_stock_news.json", "w") as file:
    json.dump(all_data_dict, file, indent=4)

Overwriting components/prep.py


In [6]:
%%writefile components/classify.py

from pathlib import Path
import argparse
import json
import os

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model identifiers handed to from_pretrained()
DESTILBERT_MODEL_NAME = "KernAI/stock-news-destilbert"
FINBERT_MODEL_NAME = "ProsusAI/finbert"

_SUPPORTED_MODEL_NAMES = (DESTILBERT_MODEL_NAME, FINBERT_MODEL_NAME)

# Lookup table: model name -> tokenizer class (both models use AutoTokenizer)
MODEL_NAME_TO_TOKENIZER = dict.fromkeys(_SUPPORTED_MODEL_NAMES, AutoTokenizer)

# Lookup table: model name -> model class (both models use the same Auto* class)
MODEL_NAME_TO_MODEL = dict.fromkeys(_SUPPORTED_MODEL_NAMES, AutoModelForSequenceClassification)

def download_model(model_name: str):
    """Load the model and tokenizer registered under ``model_name``.

    The loader classes are looked up in MODEL_NAME_TO_MODEL and
    MODEL_NAME_TO_TOKENIZER (model first, tokenizer second) and
    ``from_pretrained(model_name)`` is called on each.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: if ``model_name`` is missing from a lookup table.
    """
    loaded = []
    for registry in (MODEL_NAME_TO_MODEL, MODEL_NAME_TO_TOKENIZER):
        loader_cls = registry[model_name]
        loaded.append(loader_cls.from_pretrained(model_name))
    model, tokenizer = loaded
    return model, tokenizer

def use_model(
    model, 
    tokenizer,
    text: str
    ) -> str:
    """Classify the sentiment of a single news text.

    Args:
        model: Sequence-classification model; called with the tokenizer output
            as keyword arguments and expected to expose ``.logits``.
        tokenizer: Callable that turns ``text`` into model inputs.
        text: The text to classify.

    Returns:
        ``'positive'``, ``'negative'`` or ``'neutral'``. The class-index to
        label mapping depends on whether ``model`` is a
        DistilBertForSequenceClassification instance.

    Raises:
        KeyError: if the predicted class index is not 0, 1 or 2.
    """
    # Local import: this script does not import torch at module level.
    import torch

    tokenized_text = tokenizer(
        text,
        truncation=True,
        is_split_into_words=False,
        return_tensors="pt"
    )

    # Inference only: disable gradient tracking so no autograd state is kept
    # for every classified text.
    with torch.no_grad():
        outputs = model(**tokenized_text)
    # One text in -> one row of logits; take the arg-max class of that row.
    predicted_class = int(outputs.logits.argmax(1)[0])

    if isinstance(model, transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification):
        mapping = {0: 'neutral', 1: 'negative', 2: 'positive'} # distilbert mapping
    else:
        mapping = {0: 'positive', 1: 'negative', 2: 'neutral'} # finbert mapping

    return mapping[predicted_class]

# Parse command-line arguments: folder to read the merged news from, folder to write to
parser = argparse.ArgumentParser()
parser.add_argument("--classify_input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--classify_output", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# Load only the model that is applied below. The FinBERT model was previously
# loaded here as well but never used; download_model(FINBERT_MODEL_NAME) still
# works if it is needed again.
destilbert_model, destilbert_tokenizer = download_model(DESTILBERT_MODEL_NAME)

# Read input data
input_file_path = Path(args.classify_input) / "merged_stock_news.json"
with open(input_file_path, "r") as f:
    data = json.load(f)

# Classify every non-empty text of every ticker.
# Empty texts are skipped, so "sentiments" can be shorter than "texts".
for ticker_data in data.values():
    ticker_data["sentiments"] = [
        use_model(destilbert_model, destilbert_tokenizer, text)
        for text in ticker_data["texts"]
        if text
    ]

# Write the updated data to the output folder under the same file name
output_dir = Path(args.classify_output)
output_dir.mkdir(parents=True, exist_ok=True)
output_file_path = output_dir / "merged_stock_news.json"
with open(output_file_path, "w") as f:
    json.dump(data, f)

Overwriting components/classify.py


In [7]:
# NOTE(review): scratch assignment — no later cell visible here reads `text`
# (the `text` names in the %%writefile scripts are separate); confirm and remove.
text = "Hi"

In [8]:
%%writefile components/summarize.py

from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

from pathlib import Path
import datetime 
import argparse
import json
import os
from openai import OpenAI

credential = DefaultAzureCredential()
# check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
secret_client = SecretClient(vault_url="https://mlgroup.vault.azure.net/", credential=credential)

parser = argparse.ArgumentParser()
parser.add_argument("--summarize_input", type=str, help="Mounted Azure ML blob storage")
parser.add_argument("--summarize_output", type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()


def summarize_article(client, text):
    """Request a short summary of ``text`` from the chat model.

    Args:
        client: OpenAI client used for the chat completion call.
        text: Article text; it is embedded into the system prompt.

    Returns:
        The message content of the first completion choice.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"""
                        As an assistant, your task is to summarize stock and finance news. 
                        Your summary should be a single sentence that rephrases the key information of the article. 
                        When in doubt, leave information out. The summary should be very short.
                        Be sure to include specific numbers such as stock price changes or concrete earning figures. 
                        Aim for brevity and precision in your summary.
                        =========
                        Article: {text}
                        =========
                        Summary: 
                        """}
        ],
        max_tokens=60,
        temperature=0.0
    )
    return response.choices[0].message.content


# load the merged news JSON written by the previous pipeline step
with open(os.path.join(args.summarize_input, "merged_stock_news.json"), "r") as f:
    data = json.load(f)

# authenticate to openai
api_key = secret_client.get_secret("openai-key")
openai_client = OpenAI(api_key=api_key.value)

# summarize every non-empty text of every ticker and store the result next to it;
# empty texts are skipped, so "summaries" can be shorter than "texts"
for ticker_data in data.values():
    ticker_data["summaries"] = [
        summarize_article(openai_client, text)
        for text in ticker_data["texts"]
        if text
    ]

# connect and authenticate to the blob client
account_url = "https://mlstorageleo.blob.core.windows.net"
file_name = f"processed-stock-news-{datetime.date.today().isoformat()}.json"

# create the BlobServiceClient object and upload the processed data
blob_data = json.dumps(data)
blob_storage_key = secret_client.get_secret("blob-storage-key")
blob_service_client = BlobServiceClient(account_url, credential=blob_storage_key.value)
blob_client = blob_service_client.get_blob_client(container="processed-stock-news-json", blob=file_name)
# overwrite=True: the blob name only varies by date, so a second run on the same
# day would otherwise fail because the blob already exists
blob_client.upload_blob(blob_data, overwrite=True)

# pass the data (now including the summaries) on to the next step
output_dir = Path(args.summarize_output)
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "merged_stock_news.json", "w") as f:
    json.dump(data, f)

Overwriting components/summarize.py


In [9]:
%%writefile components/notify.py

from pathlib import Path
import datetime 
import argparse
import json
import os

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.communication.email import EmailClient

# Mail metadata: the subject carries today's date (YYYY-MM-DD); sender and recipient are fixed
EMAIL_SUBJECT = f"Stock news analysis for {datetime.datetime.today().isoformat()[:10]}"
SENDER_ADDRESS = "DoNotReply@632a8f5c-5cc8-4c44-8e7e-f509c76d0d24.azurecomm.net"
RECIPIENT_ADDRESS = "leopuettmann@outlook.de"

# Command line: one folder to read the merged news from, one folder to write it to
parser = argparse.ArgumentParser()
for flag in ("--notify_input", "--notify_output"):
    parser.add_argument(flag, type=str, help="Mounted Azure ML blob storage")
args = parser.parse_args()

# Load the merged news JSON from the input folder
data = json.loads((Path(args.notify_input) / "merged_stock_news.json").read_text())

def format_data(ticker, source=None):
    """Collect the mail-ready pieces for one ticker.

    Args:
        ticker: Key of the ticker in the news dict, e.g. ``"MSFT"``.
        source: Dict mapping ticker -> entry with the list-valued keys
            ``"summaries"``, ``"sentiments"`` and ``"url"``. Defaults to the
            module-level ``data`` loaded from the input file.

    Returns:
        A tuple ``(summaries, sentiments, urls)`` where ``summaries`` and
        ``urls`` are the respective lists joined with single spaces and
        ``sentiments`` is ``(positive_count, neutral_count, negative_count)``.
        A ticker or key that is absent yields empty strings / zero counts
        instead of raising KeyError, so one missing ticker does not prevent
        the mail from being built.
    """
    if source is None:
        source = data
    entry = source.get(ticker, {})
    labels = entry.get("sentiments", [])

    summaries = " ".join(entry.get("summaries", []))
    sentiments = tuple(
        labels.count(label) for label in ("positive", "neutral", "negative")
    )
    urls = " ".join(entry.get("url", []))
    return summaries, sentiments, urls

# Tickers included in the mail, in display order, with the company name shown
# in each section header.
TICKER_DISPLAY_NAMES = {
    "MSFT": "Microsoft",
    "AAPL": "Apple",
    "TXN": "Texas Instruments",
}


def build_section(company, summaries, sentiments):
    """Render one company's section of the mail body.

    Args:
        company: Company name shown in the section header.
        summaries: Already-joined summary text for the company.
        sentiments: ``(positive, neutral, negative)`` counts.

    Returns:
        The section text, starting and ending with a ``===`` line.
    """
    positive, neutral, negative = sentiments
    return (
        "===\n"
        f"News about {company}: \n"
        f"{summaries}\n"
        "\n"
        f" Sentiments: positive -> {positive} | neutral -> {neutral} | negative -> {negative}\n"
        "==="
    )


sections = []
for ticker, company in TICKER_DISPLAY_NAMES.items():
    # format_data also returns the joined URLs; they are not part of the mail body.
    summaries, sentiments, _urls = format_data(ticker)
    sections.append(build_section(company, summaries, sentiments))

# Header line, then the sections separated by the same spacing as before
# (" " + four newlines between a closing and the next opening "===").
email_content = (
    "\nThis is your daily stock news summary. \n\n"
    + " \n\n\n\n".join(sections)
    + "\n"
)

def send_email(email_client, subject, content, recipient, sender, display_name="Leo"):
    """Send a plain-text mail and wait for the send operation to finish.

    Args:
        email_client: Client exposing ``begin_send(message)`` that returns a
            poller with a ``result()`` method.
        subject: Subject line.
        content: Plain-text body.
        recipient: Address of the single "to" recipient.
        sender: Sender address.
        display_name: Display name of the recipient (defaults to ``"Leo"``).

    Returns:
        Whatever ``poller.result()`` returns for the finished send operation.
    """
    message = {
        "content": {
            "subject": subject,
            "plainText": content
        },
        "recipients": {
            "to": [
                {
                    "address": recipient,
                    "displayName": display_name
                }
            ]
        },
        "senderAddress": sender
    }
    poller = email_client.begin_send(message)
    # Wait for the operation instead of discarding the poller, so a failed
    # send raises here rather than going unnoticed.
    return poller.result()

# Azure clients: credential -> Key Vault -> mail connection string -> mail client
azure_credential = DefaultAzureCredential()
azure_credential.get_token("https://management.azure.com/.default")
vault = SecretClient(vault_url="https://mlgroup.vault.azure.net/", credential=azure_credential)
mail_connection = vault.get_secret("mail-connection-string").value
mailer = EmailClient.from_connection_string(mail_connection)

# Send the daily summary mail
send_email(
    email_client=mailer,
    subject=EMAIL_SUBJECT,
    content=email_content,
    recipient=RECIPIENT_ADDRESS,
    sender=SENDER_ADDRESS,
)

# Hand the merged stock news JSON on as the output of this step
merged_output_path = Path(args.notify_output) / "merged_stock_news.json"
merged_output_path.write_text(json.dumps(data))

Overwriting components/notify.py


In [10]:
%%writefile dependencies/conda.yml
# Conda environment for all pipeline steps (prep, classify, summarize, notify).
# The notebook reuses an already registered environment version as-is, so a
# change here only takes effect once the environment version is bumped.
name: stock-analysis-env
channels:
  - conda-forge
dependencies:
  - python=3.9
  - pip
  - pip:
    - azure-storage-blob
    - azure-identity
    - azure-keyvault
    - azure-communication-email
    - transformers
    - torch
    - sentencepiece
    - numpy
    # summarize.py uses `from openai import OpenAI`, i.e. the 1.x client interface
    - openai>=1.0.0

Overwriting dependencies/conda.yml


In [11]:
# info for the env
custom_env_name = "stock-analysis-env"
version = "1.8"

# Reuse the registered environment if this exact name/version exists; otherwise
# build it from dependencies/conda.yml and register it. Because an existing
# version is returned as-is, edits to conda.yml only take effect after
# `version` is bumped.
try:
    pipeline_job_env = ml_client.environments.get(custom_env_name, version=version)

except Exception:
    # `Exception` instead of a bare `except` so KeyboardInterrupt / SystemExit
    # are not swallowed and silently turned into an environment build.
    pipeline_job_env = Environment(
        name=custom_env_name,
        description="Custom environment for stock analysis pipeline",
        conda_file=os.path.join("dependencies", "conda.yml"),
        image="mcr.microsoft.com/azureml/curated/python-sdk-v2:4",
        version=version,
    )
    pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

    print(
        f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
    )

In [12]:
# Shared settings for the component definitions below
output_mode = InputOutputModes.RW_MOUNT  # mode used for every step output
input_mode = InputOutputModes.RO_MOUNT   # mode used for the folder inputs
data_type = AssetTypes.URI_FOLDER        # steps exchange folders
# NOTE(review): `path` is not referenced by any cell visible here — confirm it is still needed
path = "azureml://datastores/stocknewsjson/stock-news-json"

In [13]:
# Step 1: runs prep.py, which receives the storage URL directly and writes the
# merged news JSON into the `prep_output` folder.
data_prep_component = command(
    name="data_prep",
    display_name="Finding out which blobs to actually use",
    description="Downloads today's blobs from Azure Blob Storage and merges them into one JSON file",
    inputs={
        "blob_storage": Input(mode=InputOutputModes.DIRECT)
    },
    outputs={
        "prep_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/prep.py",
    command="python prep.py --blob_storage ${{inputs.blob_storage}} --prep_output ${{outputs.prep_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # Boolean, not the string "false" (a non-empty string is truthy in Python).
    # The result depends on the current date, so the step must not be treated as deterministic.
    is_deterministic=False,
)

In [14]:
# Step 2: runs classify.py on the merged news folder and writes the same file,
# extended with a "sentiments" list per ticker, to `classify_output`.
classify_component = command(
    name="classify",
    display_name="Classify the sentiments of todays stock news",
    description="Classifies the sentiment of each news text and stores the labels in the merged JSON file",
    inputs={
        "classify_input": Input(type=data_type, mode=input_mode),
    },
    outputs={
        "classify_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/classify.py",
    command="python classify.py --classify_input ${{inputs.classify_input}} --classify_output ${{outputs.classify_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # Boolean, not the string "false" (a non-empty string is truthy in Python).
    is_deterministic=False,
)

In [15]:
# Step 3: runs summarize.py, which adds a "summaries" list per ticker, uploads
# the processed JSON to blob storage and writes it to `summarize_output`.
summarize_component = command(
    name="summarize",
    display_name="Summarize the news",
    description="Uses an OpenAI chat model to summarize each news article",
    inputs={
        "summarize_input": Input(type=data_type, mode=input_mode),
    },
    outputs={
        "summarize_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/summarize.py",
    command="python summarize.py --summarize_input ${{inputs.summarize_input}} --summarize_output ${{outputs.summarize_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # Boolean, not the string "false" (a non-empty string is truthy in Python).
    is_deterministic=False,
)

In [16]:
# Step 4: runs notify.py, which mails the summaries and sentiment counts and
# passes the merged JSON through to `notify_output`.
notify_component = command(
    name="notify",
    display_name="Notify the user via Mail",
    description="Sends out an E-Mail with the results of the pipeline",
    inputs={
        "notify_input": Input(type=data_type, mode=input_mode),
    },
    outputs={
        "notify_output": Output(type=data_type, mode=output_mode)
    },
    code="./components/notify.py",
    command="python notify.py --notify_input ${{inputs.notify_input}} --notify_output ${{outputs.notify_output}}",
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    compute="ava",
    # Boolean, not the string "false" (a non-empty string is truthy in Python).
    # Sending a mail is a side effect, so the step must not be skipped as a cached result.
    is_deterministic=False,
)

In [17]:
from azure.ai.ml.dsl import pipeline

# Pipeline definition: prep -> classify -> summarize -> notify, all on compute "ava".
# Each step's output folder is wired in as the next step's input; the notify
# step's output is exposed as the pipeline output "processed_file".
# NOTE(review): the local variable names below may be used by the SDK to name
# the steps — confirm before renaming them.
@pipeline(compute="ava")
def stock_news_pipeline():

    # The storage account URL is handed to prep.py as a plain string argument.
    data_prep_job = data_prep_component(
        blob_storage="https://mlstorageleo.blob.core.windows.net/"
    )
    classify_job = classify_component(
        classify_input=data_prep_job.outputs.prep_output

    ) 
    summarize_job = summarize_component(
        summarize_input = classify_job.outputs.classify_output
    )

    notify_job = notify_component(
        notify_input = summarize_job.outputs.summarize_output
    )

    return {"processed_file": notify_job.outputs.notify_output}

# Build the pipeline job object from the definition above (nothing is submitted yet)
pipeline_job = stock_news_pipeline()

# set pipeline level compute (same target as given to the @pipeline decorator)
pipeline_job.settings.default_compute = "ava"
# NOTE(review): this assigns the string "false", not the boolean False — confirm the
# SDK interprets it as intended and that `reuse_component` is the right setting name.
pipeline_job.settings.reuse_component = "false" 

In [18]:
# Submit the pipeline to the workspace under a fixed experiment name.
# `pipeline_job` is rebound to the job object returned by the service.
experiment_name = "stock-news-analysis-pipeline"
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name=experiment_name)
# Last expression of the cell: shows the job summary as the cell output
pipeline_job

[32mUploading classify.py[32m (< 1 MB): 100%|##########| 2.87k/2.87k [00:00<00:00, 52.3kB/s]
[39m

[32mUploading summarize.py[32m (< 1 MB): 100%|##########| 3.20k/3.20k [00:00<00:00, 42.8kB/s]
[39m



Experiment,Name,Type,Status,Details Page
stock-news-analysis-pipeline,sleepy_plane_slc80jknmh,pipeline,Preparing,Link to Azure Machine Learning studio


In [19]:
# Wait until the job completes: streams the logs of the job submitted in the
# previous cell (identified by its name) into the cell output.
ml_client.jobs.stream(pipeline_job.name)

RunId: sleepy_plane_slc80jknmh
Web View: https://ml.azure.com/runs/sleepy_plane_slc80jknmh?wsid=/subscriptions/5a361d37-b562-4eee-981b-0936493063e9/resourcegroups/MlGroup/workspaces/mlworkspace

Streaming logs/azureml/executionlogs.txt

[2023-12-28 21:39:17Z] Submitting 1 runs, first five are: dc5db119:577cc570-d1e5-4b1e-9b58-13ec1fd12d60
[2023-12-28 21:40:10Z] Completing processing run id 577cc570-d1e5-4b1e-9b58-13ec1fd12d60.
[2023-12-28 21:40:10Z] Submitting 1 runs, first five are: 6589ea02:db11ad71-4974-4af0-9f89-6c5182c84604
[2023-12-28 21:41:33Z] Completing processing run id db11ad71-4974-4af0-9f89-6c5182c84604.
[2023-12-28 21:41:33Z] Submitting 1 runs, first five are: e95b4aef:099c6745-5b22-4899-b586-a4974299edc9
[2023-12-28 21:43:09Z] Completing processing run id 099c6745-5b22-4899-b586-a4974299edc9.
[2023-12-28 21:43:09Z] Submitting 1 runs, first five are: 963af142:6c2cc11b-1145-4448-935e-0be557861013
[2023-12-28 21:43:55Z] Completing processing run id 6c2cc11b-1145-4448-935e-0