In [88]:

!pip install transformers



In [89]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
!pip install sentencepiece
import sentencepiece





# 2. Setup Summarization Model

In [90]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
import os

# Disable the symlink warning
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

import requests
from transformers import PegasusForConditionalGeneration

# Your model loading code here
model = PegasusForConditionalGeneration.from_pretrained("human-centered-summarization/financial-summarization-pegasus")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Summarize a Single Article

In [92]:
url = "https://techcrunch.com/2024/07/15/elon-musk-confirms-tesla-robotaxi-event-delayed-design-change/"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')


In [93]:
paragraphs[1].text

'Tesla CEO Elon Musk confirmed Monday earlier reports the company was delaying its robotaxi reveal, explaining it was because he requested an “important design change to the front.” '

In [94]:
text = [paragraph.text for paragraph in paragraphs]
words = ''.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [95]:
ARTICLE

'CommentTesla CEO Elon Musk confirmed Monday earlier reports the company was delaying its robotaxi reveal, explaining it was because he requested an “important design change to the front.” Bloomberg News reported last week Tesla was pushing the event to October. It had been aiming for an August 8 event. The company did not immediately respond to a request for comment. Tesla shares fell more than 6% immediately following the report, but have since recovered.Musk responded to a question posed in a post on X, the social media network he owns, explaining the reason for the delay. “Requested what I think is an important design change to the front, and extra time allows us to show off a few other things,” he wrote.Musk first teased the robotaxi event in April, on the same day that Reuters reported the company was shelving plans for a new vehicle built on a next-generation platform that would cost around $25,000. Musk denied that report on his social media platform X.Tesla had internally plan

In [96]:
input_ids = tokenizer.encode(ARTICLE, return_tensors = 'pt')
output = model.generate(input_ids, max_length=55,num_beams=5, early_stopping = True)
summary = tokenizer.decode(output[0],skip_special_tokens=True)

In [97]:
summary

'Musk says he requested ‘important design change to the front’. Tesla had been aiming for August 8 event'

# 4. Building a News and Sentiment Analysis

In [98]:
monitored_tickers = ['GME', 'TSLA', 'BTC'] 

# 4.1 Search for stock news using google and yahoo finance

In [99]:
def search_for_stock_news_urls(ticker):
        search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
        r = requests.get(search_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        atags = soup.find_all('a')
        hrefs = [link['href']for link in atags]
        return hrefs

In [100]:
raw_urls ={ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQOwgC',
  '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=nws&gbv=1&sei=XXabZv7LNZ3K1sQP882P0A4',
  '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUIBygC',
  '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUICCgD',
  '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUICSgE',
  '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQiaAMCAooBQ&usg=AOvVaw2Txc_uPrdZmyIs9pLeIoAG',
  '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3Dd28cac

In [101]:
raw_urls["GME"][0]

'/?sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQOwgC'

# 4.2 Strip Out unwanted URLs

In [102]:
import re #regular expressions(re)

In [103]:
exclude_list= ['maps', 'policies', 'preferences', 'accounts', 'support']

In [104]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [105]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQOwgC',
 '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=nws&gbv=1&sei=XXabZv7LNZ3K1sQP882P0A4',
 '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUIBygC',
 '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUICCgD',
 '/search?q=yahoo+finance+GME&sca_esv=d28cac85abca5345&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQ_AUICSgE',
 '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwi-7ofXmrWHAxUdpZUCHfPmA-oQiaAMCAooBQ&usg=AOvVaw2Txc_uPrdZmyIs9pLeIoAG',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3Dd28cac85abca5345%26ie

In [106]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestops-gme-preliminary-results-highlight-160700244.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-stock-trading-153229310.html',
  'https://finance.yahoo.com/news/gamestop-releases-q1-results-ahead-111604723.html',
  'https://finance.yahoo.com/news/gamestop-gme-down-13-month-124300979.html',
  'https://finance.yahoo.com/news/gamestop-gme-rises-higher-market-214519002.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/gamestop-should-ditch-retail-and-become-a-holding-company-like-warren-buffetts-berkshire-hathaway-133106692.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-shares-trading-162132483.html',
  'https://finance.yahoo.com/news/gamestop-stock-nyse-gme-game-000228694.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-shares-trading-151315370.html',
  'https://finance.yahoo.com/news/investors-heavily-search-gamestop-corp-130015751.html']

# 4.3 Search and Scrape Cleaned URLs

In [107]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join (words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [108]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['Thank you for your patience.',
  'Thank you for your patience.',
  'Thank you for your patience.',
  "GameStop Corp.  GME, a stock synonymous with the meme trading frenzy, has seen a substantial decline of nearly 13% in the past month. Currently trading well below its 52-week high of $64.83, achieved on May 14, 2024, the stock closed at $24.37 on Jul 3. This sharp drop has left many investors questioning the future of GameStop and how to navigate the unpredictable landscape of meme stock investments.   GameStop's rise to fame as a meme stock was nothing short of spectacular. Fueled by retail investors on platforms like Reddit's WallStreetBets, the stock saw unprecedented gains driven by short squeezes and social media hype. This frenzy turned GameStop into a symbol of the power of retail investors challenging traditional Wall Street norms. However, the 62.4% decline from its 52-week peak highlights the inherent volatility and risk associated with such stocks.  AMC Entertainme

In [109]:
articles['TSLA'][3]

'Tesla’s ( NASDAQ:TSLA ) journey in 2024 has been anything but smooth. We’ve got factory shutdowns, shipping woes, and some serious competition nipping at Tesla’s heels, especially in China. Yet, Elon Musk’s unwavering commitment to expanding Tesla’s EV lineup has kept the company on track. The recent release of Tesla’s Q2 production and delivery report has everyone talking. While the numbers show a decline compared to last year, they still managed to beat analysts’ expectations, giving the stock a much-needed boost.'

# 4.4 Summarise all Articles

In [110]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors = 'pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens= True)
        summaries.append(summary)
    return summaries

In [111]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'The stock is trading well below its 52-week high of $64.83. This sharp drop has left many investors questioning the future of GameStop',
  'Shares of the video game retailer have been moving higher in recent days.',
  'Your information may be shared with third parties.',
  'Video game retailer’s stock has lost more than half its value in the past five years.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'Shares of the video game retailer are up more than 20% this year.'],
 'TSLA': ['We will update this page as soon as possible.',
  'Shares set to extend gains on Wednesday. Tesla reported record vehicle deliveries in its first quarter',
  'Your information may be shared with third parties.',
  'Musk’s commitment to ex

In [112]:
summaries['BTC']

['A look back at some of the quirkier snippets from the news in recent times',
 'First Mover takes a look at the latest moves in crypto markets.',
 'Surging interest in Bitcoin futures has also boosted prices.',
 'A profitable crypto address has made a fresh investment in BTC.',
 'Markets have been volatile since the beginning of the year. Bitcoin (BTC) has been in focus as the world’s most popular cryptocurrency witnessed an unprecedented rise',
 'We will update this page as soon as possible.',
 'We will update this page as soon as possible.',
 "WazirX's native token WRX slumped 15% in dollar terms.",
 'The The cryptocurrency hit a one-month high after hitting a record low on Friday.',
 'We will update this page as soon as possible.',
 'Your information may be shared with third parties.']

# 5. Adding Sentiment Analysis

In [113]:
from transformers import pipeline

# Specify the model name and revision
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
revision = "af0f99b"

# Create a sentiment-analysis pipeline with the specified model and revision
sentiment = pipeline('sentiment-analysis', model=model_name, revision=revision)

# Example usage
result = sentiment("I love using the transformers library!")
print(result)


[{'label': 'POSITIVE', 'score': 0.9993904829025269}]


In [114]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9971897006034851},
 {'label': 'POSITIVE', 'score': 0.983063280582428},
 {'label': 'POSITIVE', 'score': 0.9992666840553284},
 {'label': 'POSITIVE', 'score': 0.9993797540664673},
 {'label': 'POSITIVE', 'score': 0.9951992630958557},
 {'label': 'POSITIVE', 'score': 0.9566920399665833},
 {'label': 'POSITIVE', 'score': 0.9566920399665833},
 {'label': 'NEGATIVE', 'score': 0.9996933937072754},
 {'label': 'NEGATIVE', 'score': 0.9984173774719238},
 {'label': 'POSITIVE', 'score': 0.9566920399665833},
 {'label': 'NEGATIVE', 'score': 0.9903545379638672}]

In [115]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'NEGATIVE', 'score': 0.998848557472229},
  {'label': 'POSITIVE', 'score': 0.9962789416313171},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'NEGATIVE', 'score': 0.9997506737709045},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9807196259498596}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9392716884613037},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9993440508842468},
  {'label': 'POSITIVE', 'score': 0.9767380356788635},
  {'label': 'POSITIVE', 'score': 0.9954968690872192},
  {'label': 'POSITIVE', 'score': 0.9938702583312988},
  {'label': '

In [116]:
print(summaries['TSLA'][7], scores['TSLA'][7]['label'], scores['TSLA'][7]['score'])

Shares of the electric carmaker have been on a roll in recent weeks. POSITIVE 0.9986897110939026


In [117]:
scores['BTC'][0]['score']

0.9971897006034851

## 

# 6. Exporting Results to CSV

In [118]:
summaries

{'GME': ['We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'The stock is trading well below its 52-week high of $64.83. This sharp drop has left many investors questioning the future of GameStop',
  'Shares of the video game retailer have been moving higher in recent days.',
  'Your information may be shared with third parties.',
  'Video game retailer’s stock has lost more than half its value in the past five years.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'We will update this page as soon as possible.',
  'Shares of the video game retailer are up more than 20% this year.'],
 'TSLA': ['We will update this page as soon as possible.',
  'Shares set to extend gains on Wednesday. Tesla reported record vehicle deliveries in its first quarter',
  'Your information may be shared with third parties.',
  'Musk’s commitment to ex

In [119]:
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'NEGATIVE', 'score': 0.998848557472229},
  {'label': 'POSITIVE', 'score': 0.9962789416313171},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'NEGATIVE', 'score': 0.9997506737709045},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9807196259498596}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9566920399665833},
  {'label': 'POSITIVE', 'score': 0.9392716884613037},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9993440508842468},
  {'label': 'POSITIVE', 'score': 0.9767380356788635},
  {'label': 'POSITIVE', 'score': 0.9954968690872192},
  {'label': 'POSITIVE', 'score': 0.9938702583312988},
  {'label': '

In [120]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestops-gme-preliminary-results-highlight-160700244.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-stock-trading-153229310.html',
  'https://finance.yahoo.com/news/gamestop-releases-q1-results-ahead-111604723.html',
  'https://finance.yahoo.com/news/gamestop-gme-down-13-month-124300979.html',
  'https://finance.yahoo.com/news/gamestop-gme-rises-higher-market-214519002.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/gamestop-should-ditch-retail-and-become-a-holding-company-like-warren-buffetts-berkshire-hathaway-133106692.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-shares-trading-162132483.html',
  'https://finance.yahoo.com/news/gamestop-stock-nyse-gme-game-000228694.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-shares-trading-151315370.html',
  'https://finance.yahoo.com/news/investors-heavily-search-gamestop-corp-130015751.html']

In [121]:
range(len(summaries['GME']))

range(0, 11)

In [122]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [123]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/gamestops-gme-preliminary-results-highlight-160700244.html'],
 ['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/why-gamestop-gme-stock-trading-153229310.html'],
 ['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/gamestop-releases-q1-results-ahead-111604723.html'],
 ['GME',
  'The stock is trading well below its 52-week high of $64.83. This sharp drop has left many investors questioning the future of GameStop',
  'NEGATIVE',
  0.998848557472229,
  'https://finance.yahoo.com/news/gamestop-gme-down-13-month-124300979.html'],
 ['GME',
  'Shares of the video game retailer have been moving higher in recent days.',
  'POSITIVE',
  0.9962789416313171,
  'https://finance.yahoo.com/news/gamestop-gme-rises-higher

In [124]:
final_output.insert(0,['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [125]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/gamestops-gme-preliminary-results-highlight-160700244.html'],
 ['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/why-gamestop-gme-stock-trading-153229310.html'],
 ['GME',
  'We will update this page as soon as possible.',
  'POSITIVE',
  0.9566920399665833,
  'https://finance.yahoo.com/news/gamestop-releases-q1-results-ahead-111604723.html'],
 ['GME',
  'The stock is trading well below its 52-week high of $64.83. This sharp drop has left many investors questioning the future of GameStop',
  'NEGATIVE',
  0.998848557472229,
  'https://finance.yahoo.com/news/gamestop-gme-down-13-month-124300979.html'],
 ['GME',
  'Shares of the video game retailer have been moving higher in recent days.',
  'POSITIVE',
  0.9962789416313171,
  'ht

In [126]:
import csv
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)