## Install dependencies

In [1]:
!pip install transformers



In [2]:
! pip install sentencepiece



In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [4]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [5]:
url = 'https://finance.yahoo.com/news/tesla-evs-can-now-scan-the-road-for-potholes-and-adjust-the-suspension-height-113844466.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [6]:
text = [paragraph.text for paragraph in paragraphs]
words = " ".join(text).split(' ')[:400]
ARTICLE = " ".join(words)

In [7]:
ARTICLE

'Tesla has introduced a software update that allows its vehicles to scan for potholes, broken pavement and other defects, Electrek has reported. It can then use that to generate "rough road map data," and trigger the adaptive suspension in supported vehicles to adjust the ride height for more comfort. Back in 2020, Musk tweeted that such a feature was coming, and this appears to be the first step. "This adjustment may occur at various locations, subject to availability, as the vehicle downloads rough road map data generated by Tesla cars," the release notes state. That means pothole and other data should become increasingly refined as Tesla vehicles ply the roads. The ride adjustment will only work in Tesla Model S and Model X cars with adaptive suspensions, Elektrek notes. It\'s not clear if the Model 3 or Y vehicles also scan for rough roads, even if they lack the adaptive suspension to benefit from the data. Both the Model 3 and the Model S have eight cameras in total. To enable the

In [8]:
input_ids = tokenizer.encode(ARTICLE,return_tensors='pt')
output = model.generate(input_ids, max_length=100, num_beams = 5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'Autonomous driving features were promised in 2020.'

## Building Pipeline

In [10]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

In [11]:
def search_for_news_url(ticker):
  search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
  r = requests.get(search_url)
  soup = BeautifulSoup(r.text, 'html.parser')
  a_tags = soup.find_all('a')
  hrefs = [link['href'] for link in a_tags]
  return hrefs

In [12]:
search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format('TSLA')
r = requests.get(search_url)
soup = BeautifulSoup(r.text, 'html.parser')
a_tags = soup.find_all('a')
a_tags

[<a href="/?sa=X&amp;ved=0ahUKEwju7daD5OT4AhVmm9gFHesQCeAQOwgC"><span class="V6gwVd">G</span><span class="iWkuvd">o</span><span class="cDrQ7">o</span><span class="V6gwVd">g</span><span class="ntlR9">l</span><span class="iWkuvd tJ3Myc">e</span></a>,
 <a href="/search?q=yahoo+finance+TSLA&amp;tbm=nws&amp;ie=UTF-8&amp;gbv=1&amp;sei=ecXFYq7NEOa24t4P66GkgA4">here</a>,
 <a class="eZt8xd" href="/search?q=yahoo+finance+TSLA&amp;ie=UTF-8&amp;source=lnms&amp;sa=X&amp;ved=0ahUKEwju7daD5OT4AhVmm9gFHesQCeAQ_AUIBSgA">All</a>,
 <a class="eZt8xd" href="/search?q=yahoo+finance+TSLA&amp;ie=UTF-8&amp;tbm=bks&amp;source=lnms&amp;sa=X&amp;ved=0ahUKEwju7daD5OT4AhVmm9gFHesQCeAQ_AUIBygC">Books</a>,
 <a class="eZt8xd" href="/search?q=yahoo+finance+TSLA&amp;ie=UTF-8&amp;tbm=shop&amp;source=lnms&amp;sa=X&amp;ved=0ahUKEwju7daD5OT4AhVmm9gFHesQCeAQ_AUICCgD">Shopping</a>,
 <a href="/search?q=yahoo+finance+TSLA&amp;ie=UTF-8&amp;tbm=isch&amp;source=lnms&amp;sa=X&amp;ved=0ahUKEwju7daD5OT4AhVmm9gFHesQCeAQ_AUICSgE">Image

In [13]:
raw_urls = {ticker:search_for_news_url(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=fsXFYt66MYmJ3LUP9_WMmAU',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwje8aiG5OT4AhWJBLcAHfc6A1MQpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=U

In [14]:
import re

In [15]:
exclude_list = ['maps', 'policies', 'accounts', 'preferences', 'support', 'consent']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
  val = []
  for url in urls:
    if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
      res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
      val.append(res)
  return list(set(val))

In [18]:
cleaned_urls = {ticker:strip_unwanted_urls(urls, exclude_list) for ticker, urls in raw_urls.items()}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/analysis-shorts-circle-gamestop-amc-050545286.html',
  'https://finance.yahoo.com/news/15-best-penny-stocks-invest-142312669.html',
  'https://finance.yahoo.com/news/business-school-graduates-enter-white-140000655.html',
  'https://finance.yahoo.com/news/celsius-defies-fear-implosion-token-110843077.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-sinks-market-220010614.html',
  'https://finance.yahoo.com/news/morning-brief-june-14-2022-100052802.html',
  'https://finance.yahoo.com/news/amc-gme-have-seen-the-apex-of-their-interest-strategist-203140661.html',
  'https://finance.yahoo.com/news/robinhood-almost-imploded-during-gamestop-205051551.html',
  'https://www.thestreet.com/memestocks/gme/gamestop-stocks-direct-registration-is-increasing-free-float-is-being-locked-up',
  'https://finance.yahoo.com/news/td-ameritrade-investor-movement-index-163000660.html'],
 'TSLA': ['https://finance.yahoo.com/news/elephant-in-the-room-for-tesla-d

In [27]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        # print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = " ".join(text).split(' ')[:300]
        ARTICLE = " ".join(words)
        # print(ARTICLE)
        ARTICLES.append(ARTICLE)
    return ARTICLES


In [None]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

In [29]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article,return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams = 5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [30]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Retail investors have been net sellers of single stocks. Hedge funds have become more dominant short sellers',
  '15 best penny stocks to invest in. These stocks provide investors with decent short-term returns',
  'Survey finds 92 percent of corporate recruiters expecting to hire MBAs. East, Southeast Asia and Middle East recruiters most confident',
  'Celsius token has soared 218% since June 13. Investors targeting short sellers who profited from ‘bank run’',
  'We are aware of the issue and are working to resolve it.',
  '‘We pass, for now,’ BlackRock says of ‘buy the dip’ strategy',
  'We are aware of the issue and are working to resolve it.',
  'Report says FTX had no active talks to buy Gamestop. Report says regulators need to do more to regulate apps',
  'Direct Registration System (DRS) is increasingly being used by shareholders. Retail investors have been using the service for years',
  'IMXSM reading ranks ‘Moderate Low’ compared to historic averages. Ameritrade cli

In [31]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [34]:
sentiment(summaries['GME'])

[{'label': 'NEGATIVE', 'score': 0.8954309225082397},
 {'label': 'POSITIVE', 'score': 0.9990646243095398},
 {'label': 'POSITIVE', 'score': 0.9955031275749207},
 {'label': 'POSITIVE', 'score': 0.9938773512840271},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9879947304725647},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9996492862701416},
 {'label': 'NEGATIVE', 'score': 0.9811649322509766},
 {'label': 'NEGATIVE', 'score': 0.9981441497802734}]

In [35]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.8954309225082397},
  {'label': 'POSITIVE', 'score': 0.9990646243095398},
  {'label': 'POSITIVE', 'score': 0.9955031275749207},
  {'label': 'POSITIVE', 'score': 0.9938773512840271},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9879947304725647},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9996492862701416},
  {'label': 'NEGATIVE', 'score': 0.9811649322509766},
  {'label': 'NEGATIVE', 'score': 0.9981441497802734}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9908789992332458},
  {'label': 'NEGATIVE', 'score': 0.9685335755348206},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9912733435630798},
  {'label': 'POSITIVE', 'score': 0.9996529817581177},
  {'label': 'NEGATIVE', 'score': 0.997852087020874},
  {'label': 'NEGATIVE', 'score': 0.9994866847991943},
  {'label': 'POSITIVE', 'score': 0.7619246244430542},
  {'label': '

In [37]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_list = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_list)
    return output

In [38]:
final_output = create_output_array(summaries, scores, cleaned_urls)

In [40]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [None]:
final_output

In [43]:
import csv
with open('assetSummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)