In [1]:
import datetime
import os

import pandas as pd 

import constants

from azure.ai.ml import command, Input, Output
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential



In [2]:
# Authenticate against Azure: try the ambient credential chain first and fall
# back to an interactive browser login when it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Probe with a management-plane token request so an unusable credential
    # is detected here.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # DefaultAzureCredential did not work in this environment.
    credential = InteractiveBrowserCredential()

# Handle to the Azure ML workspace; the identifiers are read from the local
# `constants` module.
workspace_settings = dict(
    subscription_id=constants.SUBSCRIPTION_ID,
    resource_group_name=constants.RESOURCE_GROUP_NAME,
    workspace_name=constants.WORKSPACE_NAME,
)
ml_client = MLClient(credential=credential, **workspace_settings)

In [19]:
# First 8 characters of the current Unix timestamp, kept as a string.
# NOTE(review): with a 10-digit epoch value an 8-digit prefix spans roughly
# 100 seconds rather than a calendar day — confirm this is the intended key.
current_day_timestamp = str(datetime.datetime.today().timestamp())[:8]

16873668


In [2]:
%%writefile files/prep.py 

from azure.storage.blob import BlobServiceClient

import argparse

def main():
    """Download the blobs matching the current timestamp prefix into ``--prep_data``.

    Command-line arguments:
        --prep_data: directory the matching blobs are written to.

    Every blob in the ``stock-news-json`` container whose name contains the
    first 8 characters of the current Unix timestamp is downloaded and stored
    under the same name inside ``--prep_data``.
    """
    # Imported here so the script produced by %%writefile is self-contained:
    # the generated file only imports argparse and BlobServiceClient at the
    # top, while this function also needs datetime, os and constants.
    import datetime
    import os

    import constants

    parser = argparse.ArgumentParser("prep")
    parser.add_argument("--prep_data", type=str, help="Path of prepped data")
    args = parser.parse_args()

    # log in to the Blob Service Client
    # The account key is supplied through `credential`, which is the
    # constructor argument that accepts a shared access key per the
    # azure-storage-blob API reference.
    account_url = "https://mlstorageleo.blob.core.windows.net"
    blob_service_client = BlobServiceClient(account_url, credential=constants.BLOB_KEY)

    # connect to the container
    container_client = blob_service_client.get_container_client(container="stock-news-json")

    # Prefix used to select blobs.
    # NOTE(review): 8 digits of a 10-digit epoch value cover ~100 seconds, not
    # a whole day — confirm against how the blobs are actually named.
    current_day_timestamp = str(datetime.datetime.today().timestamp())[:8]

    # list the currently available blobs and keep the ones matching the prefix
    blobs_to_download = [
        blob.name
        for blob in container_client.list_blobs()
        if current_day_timestamp in blob.name
    ]

    for blob_name in blobs_to_download:
        download_file_path = os.path.join(args.prep_data, blob_name)

        # Blob names may contain "/" (virtual folders); make sure the parent
        # directory exists before opening the file for writing.
        parent_dir = os.path.dirname(download_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(file=download_file_path, mode="wb") as download_file:
            download_file.write(container_client.download_blob(blob_name).readall())


if __name__ == "__main__":
    main()

Overwriting files/prep.py


In [2]:
#%%writefile files/classify.py

import argparse
import json
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# parser = argparse.ArgumentParser()
# parser.add_argument("--input_data", type=str, help="path or URL to input data")
# parser.add_argument("--output_data", type=str, help="path or URL to output data")
# args = parser.parse_args()

# Fetch the DistilBERT sentiment checkpoint from HuggingFace; the tokenizer
# and the sequence-classification model come from the same repository id.
SENTIMENT_MODEL_ID = "KernAI/stock-news-destilbert"
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)

def main(data_dir="./data/"):
    """Add a sentiment label for every text in each JSON file of ``data_dir``.

    Args:
        data_dir: Directory scanned for ``*.json`` files. Defaults to
            ``"./data/"``, the location the notebook used originally.

    Each file must hold a JSON object with a ``"texts"`` list. The predicted
    labels are stored under ``"sentiments"`` (one per text, same order) and
    the file is overwritten in place.

    Raises:
        KeyError: if a file has no ``"texts"`` entry.
    """
    # Local import: torch is already required for return_tensors="pt" but is
    # not imported at the top of this cell.
    import torch

    # Class index -> label. Built once instead of on every loop iteration.
    # NOTE(review): assumes this order matches the checkpoint's label ids —
    # confirm against model.config.id2label.
    mapping = {0: 'neutral', 1: 'negative', 2: 'positive'}

    json_files = [file for file in os.listdir(data_dir) if file.endswith('.json')]
    for file_name in json_files:
        # os.path.join works whether or not data_dir ends with a separator.
        file_path = os.path.join(data_dir, file_name)
        with open(file_path) as json_file:
            data = json.load(json_file)
        texts = data["texts"]

        sentiments = []
        for text in texts:
            tokenized_text = tokenizer(
                text,
                truncation=True,
                is_split_into_words=False,
                return_tensors="pt"
            )

            # Inference only: skip autograd bookkeeping for the forward pass.
            with torch.no_grad():
                outputs = model(tokenized_text["input_ids"])

            # Index of the highest logit for the single sequence in the batch.
            predicted_class = outputs.logits.argmax(1)
            sentiments.append(mapping[int(predicted_class[0])])

        # add the sentiments to the data
        data["sentiments"] = sentiments

        # overwrite the old file with the version containing the sentiments
        with open(file_path, "w") as f:
            json.dump(data, f)


if __name__ == "__main__":
    main()


In [None]:
#%%writefile files/summarize.py

import argparse
import json
import os

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# parser = argparse.ArgumentParser()
# parser.add_argument("--input_data", type=str, help="path or URL to input data")
# parser.add_argument("--output_data", type=str, help="path or URL to output data")
# args = parser.parse_args()

# Load the Pegasus financial-summarization tokenizer and model from one
# repository id. This rebinds the notebook-level `tokenizer` and `model`
# names that the sentiment cell also assigns.
SUMMARY_MODEL_ID = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(SUMMARY_MODEL_ID)
model = PegasusForConditionalGeneration.from_pretrained(SUMMARY_MODEL_ID)

def main(data_dir="./data/"):
    """Add a generated summary for every text in each JSON file of ``data_dir``.

    Args:
        data_dir: Directory scanned for ``*.json`` files. Defaults to
            ``"./data/"``, the location the notebook used originally.

    Each file must hold a JSON object with a ``"texts"`` list. The summaries
    are stored under ``"summaries"`` (one per text, same order) and the file
    is overwritten in place.

    Raises:
        KeyError: if a file has no ``"texts"`` entry.
    """
    json_files = [file for file in os.listdir(data_dir) if file.endswith('.json')]
    for file_name in json_files:
        # os.path.join works whether or not data_dir ends with a separator.
        file_path = os.path.join(data_dir, file_name)
        with open(file_path) as json_file:
            data = json.load(json_file)
        texts = data["texts"]

        summaries = []
        for text in texts:
            # Tokenize the text as PyTorch tensors. truncation=True clips
            # over-long articles to the tokenizer's maximum length, matching
            # how the sentiment cell tokenizes its input.
            input_ids = tokenizer(text, truncation=True, return_tensors="pt").input_ids

            # Generate the summary with beam search.
            output = model.generate(
                input_ids,
                max_length=32,
                num_beams=5,
                early_stopping=True
            )

            # Decode the first returned sequence back into text.
            summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))

        # add the summaries to the data
        data["summaries"] = summaries

        # overwrite the old file with the version containing the summaries
        with open(file_path, "w") as f:
            json.dump(data, f)


if __name__ == "__main__":
    main()


In [5]:
# Pegasus financial-summarization tokenizer and model, both loaded from the
# same repository id.
SUMMARY_MODEL_ID = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(SUMMARY_MODEL_ID)
model = PegasusForConditionalGeneration.from_pretrained(SUMMARY_MODEL_ID)

def main(data_dir="./data/"):
    """Print the first text of each JSON file next to its generated summary.

    Args:
        data_dir: Directory scanned for ``*.json`` files. Defaults to
            ``"./data/"``, the location the notebook used originally.

    Nothing is written back to disk; this is a preview only. Files whose
    ``"texts"`` list is empty are skipped.

    NOTE(review): this cell defines ``main`` again, so running it replaces the
    earlier ``main`` in the kernel namespace.
    """
    json_files = [file for file in os.listdir(data_dir) if file.endswith('.json')]
    for file_name in json_files:
        with open(os.path.join(data_dir, file_name)) as json_file:
            data = json.load(json_file)
        texts = data["texts"]

        # Nothing to preview for a file without texts (texts[0] would raise).
        if not texts:
            continue
        first_text = texts[0]

        # truncation=True clips over-long articles to the tokenizer's maximum
        # length, matching how the sentiment cell tokenizes its input.
        input_ids = tokenizer(first_text, truncation=True, return_tensors="pt").input_ids

        # Generate the summary with beam search.
        output = model.generate(
            input_ids,
            max_length=32,
            num_beams=5,
            early_stopping=True
        )

        # Show the source text followed by the decoded summary.
        print("Original text:")
        print(first_text+"\n")
        print("Summarized text:")
        print(tokenizer.decode(output[0], skip_special_tokens=True)+ "\n")


if __name__ == "__main__":
    main()

Original text:
In the past week, AAPL stock has gone up by 0.01%, with a monthly gain of 4.27% and a quarterly surge of 21.86%. The volatility ratio for the week is 2.08%, and the volatility levels for the last 30 days are 1.48% for Apple Inc. The simple moving average for the last 20 days is 3.05% for AAPL stock, with a simple moving average of 18.58% for the last 200 days. Is It Worth Investing in Apple Inc. (NASDAQ: AAPL) Right Now? Apple Inc. (NASDAQ: AAPL) has a price-to-earnings ratio of 30.74x that is above its average ratio. Additionally, the

Summarized text:
Shares of Apple are trading at a P/E ratio of 30.74x.

Original text:
What Happened: Shares of fabless chip and software maker Broadcom (NASDAQ:AVGO) jumped 5.5% in the afternoon session after the company is expected to receive conditional approval from the European Union for its $61 billion acquisition of VMware. This approval is subject to Broadcom offering solutions to address antitrust concerns. The news comes amidst 

In [None]:





# Sample news article kept as a single string for trying out summarization.
# The text is stored verbatim, including its original punctuation and spacing.
text_to_summarize = "National Commercial Bank (NCB), Saudi Arabia’s largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba’s Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region’s third-largest lender. The entity’s $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East’s biggest lender with about $268 billion of assets."


# Summary recorded for the text above (presumably produced by the Pegasus
# summarization model used in this notebook — TODO confirm).
# Generated Output: Saudi bank to pay a 3.5% premium to Samba share price. Gulf region’s third-largest lender will have total assets of $220 billion


In [None]:
%%writefile files/store.py