# 1. Install and Import Baseline Dependencies

In [2]:
# !pip install transformers

In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [3]:
# Checkpoint identifier passed to both from_pretrained calls below, so the
# tokenizer and the model are loaded from the same source.
# NOTE(review): the recorded output reports the encoder/decoder
# embed_positions weights as "newly initialized"; presumably harmless if
# those embeddings are computed rather than learned - confirm.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Summarize a Single Article

In [4]:
# Download one Yahoo Finance page and collect every <p> element of its HTML.
# NOTE(review): in the recorded run the extracted text is a "Thank you for
# your patience..." placeholder rather than article content, so the request
# presumably did not return the real story - inspect the response before
# trusting the summary produced from it.
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [9]:
paragraphs[0].text

'Thank you for your patience.'

In [10]:
# Join the text of all paragraphs with single spaces, split on single spaces,
# and keep only the first 400 pieces as the article to summarize.
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [11]:
ARTICLE

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

In [12]:
# Encode the article as a PyTorch tensor of token ids. Truncate to the
# model's configured number of positions so a long article cannot produce a
# sequence longer than the model is configured to accept.
input_ids = tokenizer.encode(
    ARTICLE,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
# Beam search with 5 beams, generated output capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [13]:
summary

'We are aware of the issue and are working to resolve it.'

# 4. Building a News and Sentiment Pipeline

In [15]:
# Ticker symbols the pipeline cells below iterate over.
monitored_tickers = ['GME', 'TSLA', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [16]:
def search_for_stock_news_urls(ticker):
    """Return the href of every link on a Google News search for a ticker.

    The search query is "yahoo finance <ticker>" with ``tbm=nws``.

    Args:
        ticker: Ticker symbol to search for. It is URL-encoded by requests,
            so symbols containing characters such as '^' or '&' are safe.

    Returns:
        list of href strings, in document order, for every anchor that has
        an href attribute. The values are returned exactly as they appear in
        the page (relative paths and absolute URLs alike).

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within 10 seconds.
    """
    # Passing the query via params lets requests escape it; spaces become '+',
    # which yields the same URL as before for plain ticker symbols.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        timeout=10,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing
    # link['href'] on such an anchor raises KeyError.
    return [link['href'] for link in soup.find_all('a', href=True)]

In [17]:
# Map each monitored ticker to the list of hrefs returned by its news search.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&sca_esv=576753509&ie=UTF-8&gbv=1&sei=Fxc6Zd_QA_jP1sQP-IeMsAk',
  '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUIBygC',
  '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICCgD',
  '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICigF',
  '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&sca_esv=57675350

In [18]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQOwgC',
 '/search?q=yahoo+finance+GME&tbm=nws&sca_esv=576753509&ie=UTF-8&gbv=1&sei=Fxc6Zd_QA_jP1sQP-IeMsAk',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUIBygC',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICCgD',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICSgE',
 'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICigF',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwifuM6WmpOCAxX4p5UCHfgDA5YQ_AUICygG',
 '/advanced_search',
 '/search?q=yahoo+finance+GME&sca_esv=576753509&ie=UTF-8&tbm=nw

## 4.2. Strip out unwanted URLs

In [19]:
import re

In [20]:
# Substrings passed to strip_unwanted_urls; a URL containing any of them is dropped.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [21]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated article URLs from raw hrefs.

    An href is kept only if it contains 'https://' and contains none of the
    substrings in ``exclude_list``. For each kept href, the first http(s) URL
    embedded in it is extracted and everything from the first '&' onward is
    discarded.

    Args:
        urls: iterable of href strings.
        exclude_list: iterable of substrings; an href containing any of them
            is skipped.

    Returns:
        list of unique URL strings in first-seen order.
    """
    cleaned = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        # Take the first embedded http(s) URL and cut trailing '&...' parameters.
        cleaned.append(re.findall(r'(https?://\S+)', url)[0].split('&')[0])
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through set() returned the URLs in an order that can change between
    # interpreter runs, so results were not reproducible.
    return list(dict.fromkeys(cleaned))

In [22]:
# Map each monitored ticker to its filtered, de-duplicated list of URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://ca.finance.yahoo.com/news/market-lessons-what-gamestop-flop-teach-investing-strategy-143742509.html',
  'https://finance.yahoo.com/news/stocks-rise-as-oil-bond-yields-finally-take-a-breather-stock-market-news-today-200406227.html',
  'https://finance.yahoo.com/news/nasdaq-rises-dow-falls-in-seesaw-start-to-q4-stock-market-news-today-200428762.html',
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.html',
  'https://finance.yahoo.com/news/introducing-dumb-money-game-changing-220000035.html',
  'https://finance.yahoo.com/news/usf-health-tampa-general-again-123000669.html',
  'https://finance.yahoo.com/news/20-highest-paying-countries-doctors-141147787.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/video/gamestop-stock-rises-ryan-cohen-140155496.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html',
  'https://finance.yahoo.com/news/gwti-inks-letter-intent-g

## 4.3. Search and Scrape Cleaned URLs

In [23]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading text of its <p> elements.

    Args:
        URLs: iterable of URL strings to fetch.
        max_words: maximum number of space-separated pieces kept per page.
            Defaults to 350, the limit this function previously hard-coded.
        timeout: seconds to wait for each HTTP request; without a timeout a
            single stalled connection blocks the whole run indefinitely.

    Returns:
        list of strings with exactly one entry per URL, in the same order as
        ``URLs``.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Split on single spaces (not arbitrary whitespace) to keep the
        # original truncation behaviour unchanged.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [24]:
# Map each monitored ticker to the list of scraped article texts for its URLs.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['Dovish comments from Fed officials on effects of bond yield surge helping support mood Market Lessons is a series that explores the biggest wins, losses and lessons across a variety of investor experiences. We spoke to Rob Khazzam, chief executive officer and co-founder of Float, a Canadian fintech startup. A few years before the infamous GameStop (GME) short squeeze of January 2021 – the one that would be a part of what is known as the meme stock phenomenon – Khazzam saw opportunity with the video game retailer. He had previous success investing in GameStop, first buying shares in the company in 2010 and selling them three years later, tripling his investment. Khazzam bought shares again in 2017. The company was struggling, but still "generating a decent amount of cash flow," Khazzam said. He thought the business was undervalued and viewed it as an opportunity. "They were trading at a very low multiple of their earnings," he told Yahoo Finance Canada in an interview. "The pr

In [25]:
articles['TSLA'][2]

'The road ahead is looking bumpier for Tesla (TSLA). Days after reporting earnings that landed below Wall Street\'s expectations, the EV maker disclosed that its automated driving systems are under deepening federal scrutiny. In a filing Monday with the Securities and Exchange Commission, Tesla said it had received requests from the Justice Department that included subpoenas asking it to turn over documents relating to its autopilot and full self-driving features. Tesla said it also received requests for information, "including subpoenas," on a wide range of other issues "regarding certain matters associated with personal benefits, related parties, vehicle range, and personnel decisions." In August, the Wall Street Journal reported that federal prosecutors were looking into concerns that Tesla’s company funds may have been used to create a glass house commissioned by CEO Elon Musk. The company didn\'t say in its filing whether the glass house was among the issues under scrutiny by the 

## 4.4. Summarize All Articles

In [26]:
def summarize(articles):
    """Summarize each article with the module-level ``tokenizer`` and ``model``.

    Args:
        articles: iterable of article strings.

    Returns:
        list of summary strings, one per article, in the same order.
    """
    summaries = []
    for article in articles:
        # Truncate to the model's configured number of positions so a long
        # article cannot produce a sequence longer than the model is
        # configured to accept.
        input_ids = tokenizer.encode(
            article,
            return_tensors='pt',
            truncation=True,
            max_length=model.config.max_position_embeddings,
        )
        # Beam search with 5 beams, generated output capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [27]:
# Map each monitored ticker to the list of summaries of its scraped articles.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Rob Khazzam bought shares in struggling video game retailer in 2017.',
  'Fresh estimate for second-quarter GDP came in unchanged at 2.1%.',
  'Tech-heavy Nasdaq leads way to start new quarter higher. ISM Manufacturing Index for September comes in stronger than expected',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'A crypto project on the Ethereum Chain aims to shake up the financial system. Community comes together for the first time to film a movie',
  'New anesthesiology residency program is accredited by the Accreditation Council for Graduate Medical Education.',
  'Glassdoor ranks the 20 highest-paying countries for doctors. Community Health Systems has improved patient outcomes by implementing a remote patient monitoring program',
  'All images are copyrighted.',
  'We are aware of the issue and are working to resolve it.',
  'Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in t

In [28]:
summaries['BTC']

['We are aware of the issue and are working to resolve it.',
 'Largest cryptocurrency has surged by 12% in the last 48 hours. Bitcoin spot ETF to be listed with DTCC this month',
 'Rally is not about Bitcoin ETF optimism, but about risk appetite. Bitcoin could continue to rally if risk appetite improves',
 'Court overturns SEC’s rejection of Grayscale fund. Bitcoin climbs above $31,000 for first time since July',
 'A Bitcoin ETF could unlock an estimated $600 billion in new demand. Potential approval could also set a precedent for other countries',
 'Co-founder of BitMEX blames hawkish U.S. policy for crypto rally. Bitcoin to rise on fears of global inflation: Hayes',
 'Borroe Finance is leading the crypto race because of its Web3 financing platform. Solana’s adoption rate is going through the roof because it’s a top crypto to buy',
 'All images are copyrighted.',
 'The cryptocurrency has more than doubled since the start of the year. A number of applications for Bitcoin ETFs are still

# 5. Adding Sentiment Analysis

In [29]:
from transformers import pipeline
# Name the model and revision explicitly. These are the values the pipeline
# reported falling back to when none was supplied, so results are unchanged,
# but the run no longer depends on the library's default.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [30]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.8980571627616882},
 {'label': 'NEGATIVE', 'score': 0.9926819205284119},
 {'label': 'NEGATIVE', 'score': 0.9425058960914612},
 {'label': 'NEGATIVE', 'score': 0.590732753276825},
 {'label': 'NEGATIVE', 'score': 0.9877322912216187},
 {'label': 'POSITIVE', 'score': 0.6432871222496033},
 {'label': 'NEGATIVE', 'score': 0.9880996346473694},
 {'label': 'POSITIVE', 'score': 0.7488734126091003},
 {'label': 'NEGATIVE', 'score': 0.9996650218963623},
 {'label': 'NEGATIVE', 'score': 0.99250727891922}]

In [31]:
# Map each monitored ticker to the sentiment results for its summaries
# (one {'label', 'score'} dict per summary, as shown in the output).
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.6601110696792603},
  {'label': 'NEGATIVE', 'score': 0.9919314980506897},
  {'label': 'POSITIVE', 'score': 0.7117865681648254},
  {'label': 'NEGATIVE', 'score': 0.8664737343788147},
  {'label': 'POSITIVE', 'score': 0.9736675024032593},
  {'label': 'POSITIVE', 'score': 0.9923668503761292},
  {'label': 'POSITIVE', 'score': 0.9948858618736267},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9992434978485107},
  {'label': 'NEGATIVE', 'score': 0.9839251041412354}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9985221028327942},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9826707243919373},
  {'label': 'NEGATIVE', 'score': 0.9354450106620789},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.944806694984436},
  {'label': 'POSITIVE', 'score': 0.9996185302734375},
  {'label': '

In [32]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

Relative Strength Index (RSI) reading for GME is 29.62. NEGATIVE 0.8664737343788147


In [33]:
scores['BTC'][0]['score']

0.9979088306427002

# 6. Exporting Results to CSV

In [34]:
summaries

{'GME': ['Rob Khazzam bought shares in struggling video game retailer in 2017.',
  'Fresh estimate for second-quarter GDP came in unchanged at 2.1%.',
  'Tech-heavy Nasdaq leads way to start new quarter higher. ISM Manufacturing Index for September comes in stronger than expected',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'A crypto project on the Ethereum Chain aims to shake up the financial system. Community comes together for the first time to film a movie',
  'New anesthesiology residency program is accredited by the Accreditation Council for Graduate Medical Education.',
  'Glassdoor ranks the 20 highest-paying countries for doctors. Community Health Systems has improved patient outcomes by implementing a remote patient monitoring program',
  'All images are copyrighted.',
  'We are aware of the issue and are working to resolve it.',
  'Video game retailer reported better-than-expected results in second quarter. Shares of the company have lost about 22.1% in t

In [35]:
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.6601110696792603},
  {'label': 'NEGATIVE', 'score': 0.9919314980506897},
  {'label': 'POSITIVE', 'score': 0.7117865681648254},
  {'label': 'NEGATIVE', 'score': 0.8664737343788147},
  {'label': 'POSITIVE', 'score': 0.9736675024032593},
  {'label': 'POSITIVE', 'score': 0.9923668503761292},
  {'label': 'POSITIVE', 'score': 0.9948858618736267},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9992434978485107},
  {'label': 'NEGATIVE', 'score': 0.9839251041412354}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9985221028327942},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9826707243919373},
  {'label': 'NEGATIVE', 'score': 0.9354450106620789},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.944806694984436},
  {'label': 'POSITIVE', 'score': 0.9996185302734375},
  {'label': '

In [36]:
cleaned_urls

{'GME': ['https://ca.finance.yahoo.com/news/market-lessons-what-gamestop-flop-teach-investing-strategy-143742509.html',
  'https://finance.yahoo.com/news/stocks-rise-as-oil-bond-yields-finally-take-a-breather-stock-market-news-today-200406227.html',
  'https://finance.yahoo.com/news/nasdaq-rises-dow-falls-in-seesaw-start-to-q4-stock-market-news-today-200428762.html',
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.html',
  'https://finance.yahoo.com/news/introducing-dumb-money-game-changing-220000035.html',
  'https://finance.yahoo.com/news/usf-health-tampa-general-again-123000669.html',
  'https://finance.yahoo.com/news/20-highest-paying-countries-doctors-141147787.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/video/gamestop-stock-rises-ryan-cohen-140155496.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-down-22-153004926.html',
  'https://finance.yahoo.com/news/gwti-inks-letter-intent-g

In [37]:
range(len(summaries['GME']))

range(0, 11)

In [38]:
summaries['GME'][4]

'A crypto project on the Ethereum Chain aims to shake up the financial system. Community comes together for the first time to film a movie'

In [39]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    Rows are produced for every ticker key in ``summaries`` (in the dict's
    iteration order), so the function depends only on its arguments and not
    on a notebook-level ticker list.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, aligned by position with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of URLs, aligned by position with
            ``summaries[ticker]``.

    Returns:
        list of ``[ticker, summary, label, score, url]`` rows.

    Raises:
        KeyError: if a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: if ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        for counter, summary in enumerate(ticker_summaries):
            result = scores[ticker][counter]
            output.append([
                ticker,
                summary,
                result['label'],
                result['score'],
                urls[ticker][counter],
            ])
    return output

In [40]:
# Build the flat list of [ticker, summary, label, score, url] rows.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Rob Khazzam bought shares in struggling video game retailer in 2017.',
  'POSITIVE',
  0.6601110696792603,
  'https://ca.finance.yahoo.com/news/market-lessons-what-gamestop-flop-teach-investing-strategy-143742509.html'],
 ['GME',
  'Fresh estimate for second-quarter GDP came in unchanged at 2.1%.',
  'NEGATIVE',
  0.9919314980506897,
  'https://finance.yahoo.com/news/stocks-rise-as-oil-bond-yields-finally-take-a-breather-stock-market-news-today-200406227.html'],
 ['GME',
  'Tech-heavy Nasdaq leads way to start new quarter higher. ISM Manufacturing Index for September comes in stronger than expected',
  'POSITIVE',
  0.7117865681648254,
  'https://finance.yahoo.com/news/nasdaq-rises-dow-falls-in-seesaw-start-to-q4-stock-market-news-today-200428762.html'],
 ['GME',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'NEGATIVE',
  0.8664737343788147,
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.html'],
 ['GME',
  'A crypto project on the Ethereum C

In [41]:
# Put the column names in the first row. The guard keeps this cell
# idempotent: re-running it does not insert a second header row.
csv_header = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != csv_header:
    final_output.insert(0, csv_header)

In [42]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Rob Khazzam bought shares in struggling video game retailer in 2017.',
  'POSITIVE',
  0.6601110696792603,
  'https://ca.finance.yahoo.com/news/market-lessons-what-gamestop-flop-teach-investing-strategy-143742509.html'],
 ['GME',
  'Fresh estimate for second-quarter GDP came in unchanged at 2.1%.',
  'NEGATIVE',
  0.9919314980506897,
  'https://finance.yahoo.com/news/stocks-rise-as-oil-bond-yields-finally-take-a-breather-stock-market-news-today-200406227.html'],
 ['GME',
  'Tech-heavy Nasdaq leads way to start new quarter higher. ISM Manufacturing Index for September comes in stronger than expected',
  'POSITIVE',
  0.7117865681648254,
  'https://finance.yahoo.com/news/nasdaq-rises-dow-falls-in-seesaw-start-to-q4-stock-market-news-today-200428762.html'],
 ['GME',
  'Relative Strength Index (RSI) reading for GME is 29.62.',
  'NEGATIVE',
  0.8664737343788147,
  'https://finance.yahoo.com/news/down-19-41-4-weeks-133506409.

In [43]:
import csv
# newline='' leaves line-ending handling to the csv module. The encoding is
# fixed to UTF-8 because the summaries contain non-ASCII characters and the
# file is read back with pandas, which defaults to UTF-8; the platform
# default encoding is not guaranteed to match.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [46]:
import pandas as pd
# Read the exported CSV back into a DataFrame and show its first rows
# as a check on the export.
df=pd.read_csv('assetsummaries.csv')
df.head()

Unnamed: 0,Ticker,Summary,Label,Confidence,URL
0,GME,Rob Khazzam bought shares in struggling video ...,POSITIVE,0.660111,https://ca.finance.yahoo.com/news/market-lesso...
1,GME,Fresh estimate for second-quarter GDP came in ...,NEGATIVE,0.991931,https://finance.yahoo.com/news/stocks-rise-as-...
2,GME,Tech-heavy Nasdaq leads way to start new quart...,POSITIVE,0.711787,https://finance.yahoo.com/news/nasdaq-rises-do...
3,GME,Relative Strength Index (RSI) reading for GME ...,NEGATIVE,0.866474,https://finance.yahoo.com/news/down-19-41-4-we...
4,GME,A crypto project on the Ethereum Chain aims to...,POSITIVE,0.973668,https://finance.yahoo.com/news/introducing-dum...


In [48]:
df[['Summary','Label']]

Unnamed: 0,Summary,Label
0,Rob Khazzam bought shares in struggling video ...,POSITIVE
1,Fresh estimate for second-quarter GDP came in ...,NEGATIVE
2,Tech-heavy Nasdaq leads way to start new quart...,POSITIVE
3,Relative Strength Index (RSI) reading for GME ...,NEGATIVE
4,A crypto project on the Ethereum Chain aims to...,POSITIVE
5,New anesthesiology residency program is accred...,POSITIVE
6,Glassdoor ranks the 20 highest-paying countrie...,POSITIVE
7,All images are copyrighted.,NEGATIVE
8,We are aware of the issue and are working to r...,POSITIVE
9,Video game retailer reported better-than-expec...,NEGATIVE
