In [1]:
# !pip install transformers
# !pip install sentencepiece
# !pip3 install torch torchvision

# 1. Import necessary libraries

In [2]:
from bs4 import BeautifulSoup
import requests
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# 2. Setup Summarization Model

In [3]:
# Load the tokenizer and the summarization model from the checkpoint named in
# `model_name`. Both objects are reused by every summarization cell below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# This is the PyTorch model class (later cells request return_tensors="pt").
# NOTE(review): a TensorFlow run presumably needs the TF variant of this class — confirm.
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [4]:
# Download the article page and collect its paragraph tags
url = "https://finance.yahoo.com/news/michael-burry-bets-heavily-against-205244988.html"
# The timeout keeps this cell from hanging forever if the server never answers
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.content, "html.parser")  # Parse the raw HTML into a bs4 object
paragraphs = soup.find_all("p")  # Every <p> element found on the page

In [5]:
# Pull the plain text out of every paragraph tag
text = [paragraph.get_text() for paragraph in paragraphs]

# Merge the paragraphs into a single string and keep at most the first 512
# space-separated words
words = (
    " ".join(text)
    .strip()
    .split(" ")[:512]
)

# Re-join the kept words, then cap the result at 512 characters
ARTICLE = " ".join(words)[:512].strip()

In [6]:
# Text to summarize: the article assembled above
text_to_summarize = ARTICLE

# Tokenize the text into PyTorch tensors (pass return_tensors="tf" instead when
# running the TensorFlow model). truncation=True clips an over-long input to the
# tokenizer's maximum length instead of handing the model more tokens than it accepts.
input_ids = tokenizer(text_to_summarize, return_tensors="pt", truncation=True).input_ids

# Generate the summary with beam search (any other decoding strategy works too)
output = model.generate(
    input_ids,
    max_length=32,
    num_beams=5,
    early_stopping=True
)

# Decode the first returned sequence back into text, dropping special tokens
summary = tokenizer.decode(output[0], skip_special_tokens=True)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /Users/distiller/project/conda/conda-bld/pytorch_1623459044803/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [7]:
summary

'Burry’s Scion Asset Management now owns 1.1 million put options.'

# 4. Building a News and Sentiment Pipeline

In [8]:
# Tickers whose news is searched, summarized and scored in the cells below
monitored_tickers = [
    "MNMD",
    "GME",
    "TSLA",
]

# 4.1 Search for Stock News using Google and Yahoo Finance

In [9]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Search Google News for Yahoo Finance stories about ``ticker``.

    Args:
        ticker: Ticker symbol that is inserted into the search query.
        timeout: Seconds to wait for the search page before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        A list holding the ``href`` value of every anchor tag on the results
        page that has one. The list is unfiltered; unwanted links still have
        to be stripped by the caller.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&rlz&tbm=nws".format(ticker)
    r = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")  # Convert into a bs4 object

    # href=True skips anchors that carry no href attribute. Those used to be
    # returned as None entries, which break substring checks on the URLs later.
    return [link["href"] for link in soup.find_all("a", href=True)]

In [10]:
# Raw search-result links for each monitored ticker
raw_url = {
    ticker: search_for_stock_news_urls(ticker)
    for ticker in monitored_tickers
}

# 4.2 Strip out unwanted URLs

In [11]:
import re

In [12]:
# Any link containing one of these words is dropped from the results
excluded_list = [
    "maps",
    "support",
    "accounts",
    "policies",
    "preferences",
]

In [13]:
def strip_unwanted_urls(urls, excluded_list):
    """Extract clean, de-duplicated https URLs from raw search-result hrefs.

    Args:
        urls: Iterable of href values. Entries that are not strings (for
            example ``None`` from anchors without an href) are ignored.
        excluded_list: Words that disqualify a link when they appear
            anywhere in it.

    Returns:
        A list of unique URLs in first-seen order. Each URL is the first
        ``http(s)://...`` run found in the href, cut at the first ``&`` so
        trailing query parameters appended by the search page are dropped.
    """
    cleaned = []
    for url in urls:
        # Skip missing hrefs and anything that is not an https link
        if not isinstance(url, str) or "https://" not in url:
            continue
        if any(exclude_word in url for exclude_word in excluded_list):
            continue
        # "https://" can be present without a usable URL after it (e.g. a
        # bare "https://"); skip those instead of failing on an empty match.
        match = re.search(r"https?://\S+", url)
        if match is None:
            continue
        cleaned.append(match.group(0).split("&")[0])
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order
    return list(dict.fromkeys(cleaned))

In [14]:

# Reduce each ticker's raw links to its unique article URLs
cleaned_url = {
    ticker: strip_unwanted_urls(raw_url[ticker], excluded_list)
    for ticker in monitored_tickers
}

cleaned_url

{'MNMD': ['https://finance.yahoo.com/news/mindmed-commence-trading-nasdaq-113000089.html',
  'https://finance.yahoo.com/news/mindmed-begins-trading-nasdaq-113000860.html',
  'https://finance.yahoo.com/news/psyched-field-trip-uplists-nasdaq-140833357.html',
  'https://finance.yahoo.com/news/mindmed-announces-2021-q2-financial-035000827.html',
  'https://finance.yahoo.com/news/psychedelic-drugmaker-mind-med-stumbles-in-nasdaq-debut-ceo-says-still-very-early-innings-104340252.html',
  'https://finance.yahoo.com/news/mindmed-appoints-dr-robert-dworkin-113000291.html',
  'https://finance.yahoo.com/news/mindmed-joins-critical-path-institutes-113000322.html',
  'https://finance.yahoo.com/news/mindmed-joins-digital-medicine-society-113000450.html',
  'https://finance.yahoo.com/news/field-trip-health-heads-nasdaq-210415485.html',
  'https://finance.yahoo.com/news/mindmed-announces-partnership-datavant-leading-122000231.html'],
 'GME': ['https://finance.yahoo.com/news/case-stock-market-bubble-sp

# 4.3 Search and Scrape Cleaned URLs

In [15]:
def scrape_and_process(URLS, max_words=400, max_chars=400, timeout=10):
    """Download each page and return its leading paragraph text, one entry per URL.

    Args:
        URLS: Iterable of article URLs to fetch.
        max_words: Maximum number of space-separated words kept from the
            page text.
        max_chars: Maximum number of characters kept after the words are
            re-joined; applied after ``max_words``.
        timeout: Seconds to wait for each page before ``requests`` gives up,
            so one stalled download cannot hang the whole run.

    Returns:
        A list of strings in the same order as ``URLS``.
    """
    ARTICLES = []
    for url in URLS:
        # NOTE(review): the HTTP status is not checked, so an error page is
        # processed like an article (the saved output contains "Thank you for
        # your patience..." entries). Entries are kept one-per-URL on purpose
        # so results stay aligned with the URL list.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.content, "html.parser")  # Convert into a bs4 object

        # Concatenate the text of every <p> tag into one string
        page_text = " ".join(p.get_text() for p in soup.find_all("p")).strip()

        # Cap by word count first, then by character count
        words = page_text.split(" ")[:max_words]
        ARTICLES.append(" ".join(words)[:max_chars].strip())
    return ARTICLES

In [16]:
# Scrape the article text behind every cleaned URL, ticker by ticker
articles = {
    ticker: scrape_and_process(cleaned_url[ticker])
    for ticker in monitored_tickers
}

articles

{'MNMD': ['Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  'Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  'Last week in Psychedelics: Field Trip Begins Trading On The Nasdaq MindMed Launches Phase 1 Trials On DMT Atai Launches New "Nose-To-Brain" Drug Delivery Company  Braxia Scientific Receives Funding From Canadian Government To Study Ketamine Therapy AOC\'s Amendment To Allow Federal Research Into Benefits Of Psychedelics Rejected By House Again Australia Launches Psychedelics Research Institute With',
  'NEW YORK, Aug. 12, 2021 /CNW/ -- MindMed (Nasdaq: MNMD), (NEO: MMED), (DE: MMQ), a leading biotech company developing psychedelic-inspired therapies, has announced its quarterly financial results for the quarter ended June 30, 2021. Q2 2021 Financial Highlights (in USD) Total assets as of June 30, 2021 were $194 million, including $157 million in cash Net Cash Used in Operating Activities of $12 m',

# 4.4 Summarize all articles

In [17]:
def summarize_text(articles, max_length=32, num_beams=5):
    """Summarize each article with the notebook-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article strings.
        max_length: ``max_length`` value passed to ``model.generate``.
        num_beams: Number of beams used for beam search.

    Returns:
        A list with one decoded summary string per article, in input order.
    """
    summaries = []
    for article in articles:
        # Tokenize into PyTorch tensors (use return_tensors="tf" for the
        # TensorFlow model). truncation=True clips an over-long article to the
        # tokenizer's maximum length instead of passing it on unclipped.
        input_ids = tokenizer(article, return_tensors="pt", truncation=True).input_ids

        # Beam search; any other decoding strategy could be used instead
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )

        # Decode the first returned sequence, dropping special tokens
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [20]:
# One list of summaries per ticker, in the same order as its articles
summaries = {
    ticker: summarize_text(articles[ticker])
    for ticker in monitored_tickers
}

In [21]:
summaries["TSLA"]

['Musk is not a big fan of traditional advertising.',
 'Tesla has not met its self-drive goal, CEO says.',
 'Tesla faces probe into 11 crashes using software.',
 'Clover Health, AMC Entertainment, Alibaba, ContextLogic among top stocks on Reddit.',
 'Clover Health, ContextLogic, AMC Entertainment and Tesla among top stocks on Reddit.',
 'Over 2,000 Benzinga visitors weigh in on clean energy stocks.',
 'Oppenheimer says shortage of AI engineers is a concern.',
 'European Commission has set ambitious targets for EV use. 5 cheap EV stocks to buy now',
 'Federal probe is a "bit of a black eye" for Tesla, says Wedbush.',
 'Dogecoin is being used as a payment option for pre-owned Teslas.']

# 5. Adding Sentiment Analysis

In [22]:
from transformers import pipeline
# Build the sentiment classifier used to score the summaries.
# NOTE(review): no model name is passed, so the checkpoint is presumably whatever
# default the library picks for this task — consider pinning one for reproducible scores.
sentiment = pipeline("sentiment-analysis")

In [23]:
# Run the sentiment pipeline over each ticker's list of summaries
scores = {
    ticker: sentiment(summaries[ticker])
    for ticker in monitored_tickers
}

In [24]:
scores

{'MNMD': [{'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9773623943328857},
  {'label': 'POSITIVE', 'score': 0.9986666440963745},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9972837567329407},
  {'label': 'POSITIVE', 'score': 0.9986711144447327},
  {'label': 'POSITIVE', 'score': 0.9657931923866272},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554}],
 'GME': [{'label': 'NEGATIVE', 'score': 0.9914673566818237},
  {'label': 'NEGATIVE', 'score': 0.9939145445823669},
  {'label': 'POSITIVE', 'score': 0.9939294457435608},
  {'label': 'NEGATIVE', 'score': 0.9775029420852661},
  {'label': 'POSITIVE', 'score': 0.9972121119499207},
  {'label': 'POSITIVE', 'score': 0.9300052523612976},
  {'label': 'NEGATIVE', 'score': 0.9613592624664307},
  {'label': 'NEGATIVE', 'score': 0.9042304158210754},
  {'label': 

In [34]:
# Spot-check: the first TSLA summary next to its sentiment label and score
print(summaries["TSLA"][0], scores["TSLA"][0]["label"], scores["TSLA"][0]["score"])

Musk is not a big fan of traditional advertising. NEGATIVE 0.9992798566818237


# 6. Export to CSV

In [73]:
def create_output_array(summaries, scores, url, tickers=None):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with ``"label"`` and
            ``"score"`` keys, index-aligned with ``summaries[ticker]``.
        url: Dict mapping ticker -> list of URLs, index-aligned with
            ``summaries[ticker]``.
        tickers: Optional iterable of tickers to export, in output order.
            Defaults to the keys of ``summaries`` so the function depends
            only on its arguments rather than on a notebook-level variable.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows.
    """
    if tickers is None:
        tickers = list(summaries)

    output = []
    for ticker in tickers:
        for counter, summary in enumerate(summaries[ticker]):
            score = scores[ticker][counter]
            output.append([
                ticker,
                summary,
                score["label"],
                score["score"],
                url[ticker][counter],
            ])
    return output

In [77]:
# Combine summaries, sentiment results and article URLs into one row per summary
final_output = create_output_array(summaries, scores, cleaned_url)

In [78]:
# Prepend the CSV header row. The guard keeps this cell safe to re-run:
# without it, every extra execution would insert another header row.
header_row = ["Ticker", "Summaries", "Label", "Confidence", "URL"]
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [79]:
import csv

# Write every row of final_output to a CSV file. The encoding is set explicitly
# because the summaries can contain non-ASCII characters (e.g. curly apostrophes)
# that a platform-default encoding may fail to write. newline='' lets the csv
# module control line endings itself.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)