# 1. Install and Import Baseline Dependencies

In [2]:
# !pip install transformers

In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [4]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Summarize a Single Article

In [5]:
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [6]:
paragraphs[0].text

'Thank you for your patience.'

In [7]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [8]:
ARTICLE

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

In [9]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [10]:
summary

'We are aware of the issue and are working to resolve it.'

# 4. Building a News and Sentiment Pipeline

In [11]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [12]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs 

In [13]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&sca_esv=573962864&ie=UTF-8&gbv=1&sei=6WUuZfJdwdTk5Q-qgIugDg',
  '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUIBygC',
  '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICCgD',
  '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICigF',
  '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&sca_esv=573962864

In [14]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQOwgC',
 '/search?q=yahoo+finance+GME&tbm=nws&sca_esv=573962864&ie=UTF-8&gbv=1&sei=6WUuZfJdwdTk5Q-qgIugDg',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUIBygC',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICCgD',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICSgE',
 'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICigF',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiy9prU8_yBAxVBKrkGHSrAAuQQ_AUICygG',
 '/advanced_search',
 '/search?q=yahoo+finance+GME&sca_esv=573962864&ie=UTF-8&tbm=nws

## 4.2. Strip out unwanted URLs

In [15]:
import re

In [16]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [17]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [18]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html',
  'https://www.vox.com/money/2023/9/15/23873474/dumb-money-gamestop-stock-keith-gill-melvin-capital-review',
  'https://www.wsj.com/articles/the-lessons-of-dumb-money-gamestop-stock-craze-movie-daa0d0bf',
  'https://www.marketwatch.com/data-news/gamestop-corp-cl-a-stock-underperforms-monday-when-compared-to-competitors-9d618c15-f2db90be2f39',
  'https://investorplace.com/2023/10/gme-stock-is-headed-for-zero-even-with-ryan-cohen-in-the-ceo-seat/',
  'https://www.cnbc.com/2023/09/28/gamestop-names-ryan-cohen-as-ceo-effective-immediately.html',
  'https://www.cnbc.com/2023/09/15/heres-where-amc-and-gamestop-are-now-as-dumb-money-hits-theaters.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://fortune.com/2023/09/28/gamestop-names-ryan-cohen-ceo-meme-stock-chewy-founder/',
  'https://www.theatlantic.com/culture/archive/2023/09/dumb-money-movie-review/675419/

## 4.3. Search and Scrape Cleaned URLs

In [19]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [20]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["It has been about a month since the last earnings report for GameStop (GME). Shares have lost about 22.1% in that time frame, underperforming the S&P 500. Will the recent negative trend continue leading up to its next earnings release, or is GameStop due for a breakout? Before we dive into how investors and analysts have reacted as of late, let's take a quick look at its most recent earnings report in order to get a better handle on the important catalysts. GameStop posted second-quarter fiscal 2023 results, delivering a narrower-than-expected loss per share and better-than-expected revenues. The top and bottom lines increased from their respective year-ago quarter’s reported figures. GameStop posted an adjusted loss of 3 cents per share in second-quarter fiscal 2023, narrower than the Zacks Consensus Estimate of a loss of 15 cents. The company had incurred an adjusted loss per share of 35 cents in the prior year quarter.GME reported net sales of $1,163.8 million, which surpa

In [21]:
articles['TSLA'][2]

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

## 4.4. Summarise all Articles

In [22]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [23]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in the past month',
  'Back in early 2021, a ragtag group of everyday traders took on Wall Street. But two and a half years later, the story is a whole lot messier',
  'Rough Rough cut (no reporter narration)',
  'Shares of the video-game retailer have fallen below their 50-day moving average.',
  'Ryan Cohen, former Chewy CEO, takes no pay for job. Shares have dropped 23% since his appointment, suggesting doubts',
  'Find the best credit cards, loans, insurance and more in SELECT.',
  'Find the best credit cards, loans, insurance and more in SELECT.',
  'All images are copyrighted.',
  'Cohen already serves as chairman of the board. Video game retailer has been without a CEO since June',
  'Director Craig Gillespie’s film is a diverting, high-energy romp.',
  'Have you tried going to Newsround?'],
 'TSLA': ['Walter Isaacson says he interviewed both men mor

In [24]:
summaries['BTC']

['The supply of bitcoin held by short-term holders is at its lowest point in nearly eight years.',
 'GBTC market cap is close to $3.5 billion based on Yahoo Finance data. discount on GBTC shares against net asset value has reduced',
 '‘I can’t go out with gold because gold is not good,’ says Cramer. Bitcoin is about to go down big, he says',
 'Bitcoin falls below $27,000, underperforming CoinDesk Market Index. Ripple Labs-related, litecoin and Polkadot’s native token drop',
 'Biggest miner CleanSpark offers best relative value, bank says. Bitcoin ETF decision delayed by SEC until this month',
 'Bitcoin is a monetary good, not a ‘first-mover’ technology, report says.',
 'Asset manager’s application for a spot market ETF is still under review. Demand for traditional financial instruments is pent up',
 'All images are copyrighted.',
 'More than $85 million worth of trading positions liquidated in 24 hours. BlackRock says its ETF application is still under review',
 'We are aware of the is

# 5. Adding Sentiment Analysis

In [25]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [26]:
sentiment(summaries['BTC'])

[{'label': 'NEGATIVE', 'score': 0.9996198415756226},
 {'label': 'NEGATIVE', 'score': 0.9991340041160583},
 {'label': 'NEGATIVE', 'score': 0.9989274144172668},
 {'label': 'NEGATIVE', 'score': 0.9991093277931213},
 {'label': 'NEGATIVE', 'score': 0.9797898530960083},
 {'label': 'POSITIVE', 'score': 0.9741136431694031},
 {'label': 'NEGATIVE', 'score': 0.597372829914093},
 {'label': 'NEGATIVE', 'score': 0.9880996346473694},
 {'label': 'NEGATIVE', 'score': 0.9911858439445496},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9954965114593506}]

In [27]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9992434978485107},
  {'label': 'NEGATIVE', 'score': 0.9988741278648376},
  {'label': 'NEGATIVE', 'score': 0.9994315505027771},
  {'label': 'NEGATIVE', 'score': 0.99901282787323},
  {'label': 'NEGATIVE', 'score': 0.9990560412406921},
  {'label': 'POSITIVE', 'score': 0.9378337860107422},
  {'label': 'POSITIVE', 'score': 0.9378337860107422},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9766747951507568},
  {'label': 'POSITIVE', 'score': 0.9998674392700195},
  {'label': 'NEGATIVE', 'score': 0.9915708303451538}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9606534838676453},
  {'label': 'NEGATIVE', 'score': 0.99946528673172},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9522320032119751},
  {'label': 'NEGATIVE', 'score': 0.9354450106620789},
  {'label': 'NEGATIVE', 'score': 0.997977077960968},
  {'label': 'NEGATIVE', 'score': 0.9676677584648132},
  {'label': 'NEGA

In [28]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

Shares of the video-game retailer have fallen below their 50-day moving average. NEGATIVE 0.99901282787323


In [29]:
scores['BTC'][0]['score']

0.9996198415756226

# 6. Exporting Results to CSV

In [30]:
summaries

{'GME': ['Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in the past month',
  'Back in early 2021, a ragtag group of everyday traders took on Wall Street. But two and a half years later, the story is a whole lot messier',
  'Rough Rough cut (no reporter narration)',
  'Shares of the video-game retailer have fallen below their 50-day moving average.',
  'Ryan Cohen, former Chewy CEO, takes no pay for job. Shares have dropped 23% since his appointment, suggesting doubts',
  'Find the best credit cards, loans, insurance and more in SELECT.',
  'Find the best credit cards, loans, insurance and more in SELECT.',
  'All images are copyrighted.',
  'Cohen already serves as chairman of the board. Video game retailer has been without a CEO since June',
  'Director Craig Gillespie’s film is a diverting, high-energy romp.',
  'Have you tried going to Newsround?'],
 'TSLA': ['Walter Isaacson says he interviewed both men mor

In [31]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9992434978485107},
  {'label': 'NEGATIVE', 'score': 0.9988741278648376},
  {'label': 'NEGATIVE', 'score': 0.9994315505027771},
  {'label': 'NEGATIVE', 'score': 0.99901282787323},
  {'label': 'NEGATIVE', 'score': 0.9990560412406921},
  {'label': 'POSITIVE', 'score': 0.9378337860107422},
  {'label': 'POSITIVE', 'score': 0.9378337860107422},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9766747951507568},
  {'label': 'POSITIVE', 'score': 0.9998674392700195},
  {'label': 'NEGATIVE', 'score': 0.9915708303451538}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9606534838676453},
  {'label': 'NEGATIVE', 'score': 0.99946528673172},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9522320032119751},
  {'label': 'NEGATIVE', 'score': 0.9354450106620789},
  {'label': 'NEGATIVE', 'score': 0.997977077960968},
  {'label': 'NEGATIVE', 'score': 0.9676677584648132},
  {'label': 'NEGA

In [32]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html',
  'https://www.vox.com/money/2023/9/15/23873474/dumb-money-gamestop-stock-keith-gill-melvin-capital-review',
  'https://www.wsj.com/articles/the-lessons-of-dumb-money-gamestop-stock-craze-movie-daa0d0bf',
  'https://www.marketwatch.com/data-news/gamestop-corp-cl-a-stock-underperforms-monday-when-compared-to-competitors-9d618c15-f2db90be2f39',
  'https://investorplace.com/2023/10/gme-stock-is-headed-for-zero-even-with-ryan-cohen-in-the-ceo-seat/',
  'https://www.cnbc.com/2023/09/28/gamestop-names-ryan-cohen-as-ceo-effective-immediately.html',
  'https://www.cnbc.com/2023/09/15/heres-where-amc-and-gamestop-are-now-as-dumb-money-hits-theaters.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://fortune.com/2023/09/28/gamestop-names-ryan-cohen-ceo-meme-stock-chewy-founder/',
  'https://www.theatlantic.com/culture/archive/2023/09/dumb-money-movie-review/675419/

In [33]:
range(len(summaries['GME']))

range(0, 11)

In [40]:
summaries['GME'][4]

'Ryan Cohen, former Chewy CEO, takes no pay for job. Shares have dropped 23% since his appointment, suggesting doubts'

In [41]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [42]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in the past month',
  'NEGATIVE',
  0.9992434978485107,
  'https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html'],
 ['GME',
  'Back in early 2021, a ragtag group of everyday traders took on Wall Street. But two and a half years later, the story is a whole lot messier',
  'NEGATIVE',
  0.9988741278648376,
  'https://www.vox.com/money/2023/9/15/23873474/dumb-money-gamestop-stock-keith-gill-melvin-capital-review'],
 ['GME',
  'Rough Rough cut (no reporter narration)',
  'NEGATIVE',
  0.9994315505027771,
  'https://www.wsj.com/articles/the-lessons-of-dumb-money-gamestop-stock-craze-movie-daa0d0bf'],
 ['GME',
  'Shares of the video-game retailer have fallen below their 50-day moving average.',
  'NEGATIVE',
  0.99901282787323,
  'https://www.marketwatch.com/data-news/gamestop-corp-cl-a-stock-underperforms-monday-when-compared-to-competitors-9

In [43]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [44]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in the past month',
  'NEGATIVE',
  0.9992434978485107,
  'https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html'],
 ['GME',
  'Back in early 2021, a ragtag group of everyday traders took on Wall Street. But two and a half years later, the story is a whole lot messier',
  'NEGATIVE',
  0.9988741278648376,
  'https://www.vox.com/money/2023/9/15/23873474/dumb-money-gamestop-stock-keith-gill-melvin-capital-review'],
 ['GME',
  'Rough Rough cut (no reporter narration)',
  'NEGATIVE',
  0.9994315505027771,
  'https://www.wsj.com/articles/the-lessons-of-dumb-money-gamestop-stock-craze-movie-daa0d0bf'],
 ['GME',
  'Shares of the video-game retailer have fallen below their 50-day moving average.',
  'NEGATIVE',
  0.99901282787323,
  'https://www.marketwatch.com/data-news/gamestop-corp-cl-a-sto

In [45]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)