# 1. Install and Import Baseline Dependencies

In [1]:
#!pip install transformers

In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [3]:
# Name of the pretrained Pegasus checkpoint used for summarization.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the tokenizer and the model that belong to that checkpoint; both are
# reused as notebook-level globals by the summarization cells below.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [4]:
# Download one article page and collect every <p> element from its HTML.
# NOTE(review): the recorded outputs of the following cells show only a
# "Thank you for your patience..." placeholder, so this request apparently
# did not return the article body — confirm the URL still serves the article.
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [5]:
paragraphs[0].text

'Thank you for your patience.'

In [6]:
# Join the text of all paragraphs into one string, then keep only the first
# 400 space-separated words as the text to summarize.
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [7]:
ARTICLE

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

In [8]:
# Encode the article as a PyTorch tensor of token ids. truncation=True clips
# the input to the tokenizer's maximum length; without it a long article can
# exceed what the model accepts and make generate() fail.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Beam search with 5 beams, producing at most 55 tokens.
output = model.generate(input_ids, max_length=55,
                        num_beams=5, early_stopping=True)
# Turn the best sequence back into text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'We are aware of the issue and are working to resolve it.'

# 4. Building a News and Sentiment Pipeline

In [10]:
# Symbols that the search, scrape, summarize and sentiment cells below loop over.
monitored_tickers = ['GME', 'TSLA', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return the href of every link on the Google News results page for a ticker.

    Args:
        ticker: Symbol to search for; it is inserted into the query as
            "yahoo finance <ticker>".
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking the notebook indefinitely.

    Returns:
        A list of raw href strings in page order. The list still contains
        Google's own navigation links, so it needs filtering before use.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    response = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors that carry no href attribute; indexing those
    # with link['href'] raises KeyError and aborts the whole search.
    return [link['href'] for link in soup.find_all('a', href=True)]

In [12]:
# Run one search per monitored ticker and keep the unfiltered hrefs,
# keyed by ticker.
raw_urls = {ticker: search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
# Display the result for inspection.
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=pALOY-jSBJGckPIP_KWC2A0',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICCgD',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICSgE',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=U

In [13]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQOwgC',
 '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=pALOY-jSBJGckPIP_KWC2A0',
 '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUIBygC',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICCgD',
 'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICSgE',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICigF',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQ_AUICygG',
 '/advanced_search',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwjotLC-49z8AhURDkQIHfySANsQpwUIDQ',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&sourc

## 4.2. Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark a link as unwanted; passed to strip_unwanted_urls,
# which drops any href containing one of them.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract the distinct target URLs embedded in raw search-result hrefs.

    Args:
        urls: Iterable of href strings, e.g. "/url?q=https://site/page&sa=U".
        exclude_list: Substrings; any href containing one of them is dropped.

    Returns:
        A list of unique URLs in first-seen order, each cut at its first "&".
        The stable order keeps results reproducible between runs, unlike
        de-duplicating through a set.
    """
    unique = {}
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        # An href such as a bare "https://" contains the scheme but nothing
        # after it, so there is nothing to extract; skip it instead of failing.
        if match is None:
            continue
        # Everything from the first "&" onwards is tracking data appended by
        # the search page, not part of the target URL.
        unique[match.group(0).split('&')[0]] = None
    # dict keys keep insertion order, giving an ordered de-duplication.
    return list(unique)

In [17]:
# Filter each ticker's raw hrefs down to its list of cleaned URLs.
cleaned_urls = {ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
# Display the result for inspection.
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/down-28-91-4-weeks-143502058.html',
  'https://finance.yahoo.com/news/mark-cuban-stock-portfolio-10-211950824.html',
  'https://finance.yahoo.com/news/why-game-stop-stock-is-probably-dead-money-for-a-while-according-to-one-analyst-180851839.html',
  'https://finance.yahoo.com/news/after-hours-stock-movers-game-stop-rent-the-runway-c-3-ai-and-more-232228102.html',
  'https://finance.yahoo.com/news/meme-stock-mania-5-lessons-220010930.html',
  'https://finance.yahoo.com/news/the-game-stop-turnaround-promise-is-failing-111117027.html',
  'https://finance.yahoo.com/news/meme-stock-billionaire-alibaba-wager-094812107.html',
  'https://finance.yahoo.com/news/chamath-palihapitiya-stocks-10-stocks-160502053.html',
  'https://finance.yahoo.com/news/my-meme-stock-fiasco-153417561.html',
  'https://finance.yahoo.com/news/morningstar-ceos-message-to-meme-stock-investors-162841912.html'],
 'TSLA': ['https://finance.yahoo.com/video/tech-earnings-expected-next

## 4.3. Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each page and return the leading words of its paragraph text.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per page.
            Defaults to 350, the cap this function always used; presumably
            chosen to keep the text within the summarizer's input limit.
        timeout: Seconds to wait for each server before ``requests`` raises a
            timeout error instead of hanging.

    Returns:
        A list with one string per URL, in the same order as ``URLs``, so
        each text can be matched back to its URL by position.
    """
    ARTICLES = []
    for url in URLs:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
        # Split on single spaces (not arbitrary whitespace) so the kept words
        # are re-joined exactly as they appeared in the scraped text.
        ARTICLES.append(' '.join(page_text.split(' ')[:max_words]))
    return ARTICLES

In [None]:
# Scrape every cleaned URL; each ticker maps to a list of article texts in
# the same order as its URL list.
articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
# Display the result for inspection.
articles

In [None]:
articles['TSLA'][2]

## 4.4. Summarize all Articles

In [None]:
def summarize(articles):
    """Generate one short summary per article.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        articles: Iterable of article strings.

    Returns:
        A list of summary strings in the same order as ``articles``.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the input to the tokenizer's maximum length;
        # without it an article that tokenizes longer than the model accepts
        # can make generate() fail.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search with 5 beams, producing at most 55 tokens.
        output = model.generate(input_ids, max_length=55,
                                num_beams=5, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [None]:
# Summarize every scraped article; each ticker maps to a list of summaries
# in the same order as its article list.
summaries = {ticker: summarize(articles[ticker]) for ticker in monitored_tickers}
# Display the result for inspection.
summaries

In [None]:
summaries['BTC']

# 5. Adding Sentiment Analysis

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly so results do not silently change when the
# library's default for this task changes.
# NOTE(review): this is the documented default sentiment-analysis checkpoint
# at the time of writing — confirm against the installed transformers version.
sentiment = pipeline('sentiment-analysis',
                     model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
sentiment(summaries['BTC'])

In [None]:
# Run sentiment analysis on each ticker's list of summaries. The later cells
# read a 'label' and a 'score' key from each result.
scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}
# Display the result for inspection.
scores

In [None]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

In [None]:
scores['BTC'][0]['score']

# 6. Exporting Results to CSV

In [None]:
summaries

In [None]:
scores

In [None]:
cleaned_urls

In [None]:
range(len(summaries['GME']))

In [None]:
summaries['GME'][3]

In [None]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, aligned by position with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of URLs, aligned the same way.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the key order of ``summaries``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: If a ticker has fewer scores or URLs than summaries.
    """
    output = []
    # Iterate the tickers actually present in the arguments rather than a
    # notebook-level global, so the function depends only on what it is given.
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for position, summary in enumerate(ticker_summaries):
            result = ticker_scores[position]
            output.append([
                ticker,
                summary,
                result['label'],
                result['score'],
                ticker_urls[position],
            ])
    return output

In [None]:
# Build the flat list of rows (ticker, summary, label, score, URL) that is
# later written to CSV.
final_output = create_output_array(summaries, scores, cleaned_urls)
# Display the result for inspection.
final_output

In [None]:
# Prepend the CSV header row. The guard makes this cell safe to re-run:
# without it every extra execution would insert another header row.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if final_output[:1] != [header_row]:
    final_output.insert(0, header_row)

In [None]:
final_output

In [None]:
import csv
# Write the header and data rows to disk. newline='' lets the csv module
# control line endings, and encoding='utf-8' keeps the output independent of
# the platform default, which may be unable to encode non-ASCII characters
# in the summaries.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)