# This is the notebook version of the `get_data` script

# Imports

In [17]:
import feedparser
import json
import os
import xml.etree.ElementTree as ET

In [18]:
feed_url = 'https://pythoninvest.com/rss-feed-612566707351.xml'
output_file_path = 'data/input_news_feed.json'

In [19]:
feed = feedparser.parse(feed_url)

In [20]:
#type(feed), feed # feed a dictionary

`feed_dict_pretty.txt` is the prettified version of the above file for reference.

In [21]:
#feed['entries'][0]

In [22]:
#feed['entries'][0]['turbo_content']

In [23]:
#feed['entries'][0]['enclosures']

In [24]:
#feed['entries'][0].keys()

enclosures not presneet in the dict still it is accessible from `feed['entries'][0]['enclosures']` ?

In [25]:
# json file which would be input
rss_feed = {
        "meta": {
            "title": feed.feed.title,
            "link": feed.feed.link,
            "description": feed.feed.description,
            "language": feed.feed.language
        },
        "items": []
    }

In [26]:
type(rss_feed), rss_feed

(dict,
 {'meta': {'title': 'FinNews',
   'link': 'https://pythoninvest.com',
   'description': 'Weekly News Digest with the help of ChatGPT',
   'language': 'en'},
  'items': []})

In [27]:
entry_element = feed['entries'][0]

In [28]:
# entry_element['turbo_content']

In [29]:
# If the RSS feed format would be same , then the input to the method would be an element of the python dict `feed`. 
# So input to this method would be an element which is of the type `dict`.
def extract_turbo_content(entry_element):
    """
    Extracts turbo:content from the raw XML
    """
    try:
        if entry_element:  
            content = entry_element.get('turbo_content')  
        return content if content else None

    except (AttributeError, KeyError, TypeError) as e:
        # AttributeError: If entry_element doesn't support dictionary operations
        # KeyError: If 'turbo_content' doesn't exist
        # TypeError: If entry_element is not of the expected type
        print(f"Error extracting turbo content: {str(e)}")

In [30]:
turbo_content = extract_turbo_content(entry_element) 
turbo_content[:100]

'<header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131'

In [31]:
for entry in feed.entries:
    # Extract turbo content if available
    turbo_content = extract_turbo_content(entry)

    item = {
        "title": entry.title,
        "link": entry.link,
        "pubDate": entry.published,
        "author": entry.author if "author" in entry else None,
        "category": entry.get("category", None),
        "description": entry.description,
        "content": turbo_content,  # Here we add the content field from turbo:content
        "enclosure": {
            "url": entry.enclosures[0].href,
            "type": entry.enclosures[0].type
        } if entry.enclosures else None
    }
    rss_feed["items"].append(item)
    break

In [32]:
rss_feed # contains the 1st entry

{'meta': {'title': 'FinNews',
  'link': 'https://pythoninvest.com',
  'description': 'Weekly News Digest with the help of ChatGPT',
  'language': 'en'},
 'items': [{'title': 'Week 17-24 July 2023',
   'link': 'https://pythoninvest.com/tpost/yk09rupzv1-week-17-24-july-2023',
   'pubDate': 'Mon, 24 Jul 2023 20:00:00 +0300',
   'author': 'Ivan Brigida',
   'category': 'FinNews',
   'description': 'Market summary for the week 17-24 July',
   'enclosure': {'url': 'https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg',
    'type': 'image/svg+xml'}}]}

Tentative table format:
``` python
page_published, ..., page_details, start_date, end_date, type, model, ticker, count_news, growth, text
      ...               ..            2024-09-24 , 2024-09-30, 'individual', --, NVDA, 51, +0.03, <summary>
       ..              ..             2024-09-29 , 2024-09-30, 'market_1day', gpt3.5, -- , 100, --- , <summary>
       ..              ..             2024-09-29 , 2024-09-30, 'market_1day', gpt4, -- , 100, --- , <summary>

              ..              ..       2024-09-09 , 2024-09-30, 'market_1week', gpt4-- , 1000, --- , <summary>
```

In [51]:
from bs4 import BeautifulSoup
import re

In [52]:
rss_item = rss_feed['items'][0] # selecting the 1st element
rss_item

{'title': 'Week 17-24 July 2023',
 'link': 'https://pythoninvest.com/tpost/yk09rupzv1-week-17-24-july-2023',
 'pubDate': 'Mon, 24 Jul 2023 20:00:00 +0300',
 'author': 'Ivan Brigida',
 'category': 'FinNews',
 'description': 'Market summary for the week 17-24 July',
 'enclosure': {'url': 'https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg',
  'type': 'image/svg+xml'}}

In [53]:
html_content_str = rss_item['content']
html_content_str[:250]

'<header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndicati'

In [54]:
soup = BeautifulSoup(html_content_str, 'html.parser')

In [21]:
soup.find('strong', )

<strong>DISCLAIMER</strong>

In [22]:
soup.find('strong', string=re.compile(r'.*NEWS SUMMARY')).text.split()[0]

'INDIVIDUAL'

In [23]:
# type
summary_type = soup.find('strong', string=re.compile(r'.*NEWS SUMMARY')).text.split()[0]
summary_type

'INDIVIDUAL'

In [24]:
test_summary_type = 'MARKET'

In [25]:
soup.find('strong', string=re.compile(r'.*NEWS SUMMARY'))

<strong>INDIVIDUAL NEWS SUMMARY</strong>

In [26]:
soup.find('strong', string=re.compile(r'Start date.*End date'))

<strong>Start date for the articles: 2023-07-17; End date for the articles: ' '2023-07-24 '</strong>

In [27]:
date_text = soup.find('strong', string=re.compile(r'Start date.*End date')).text
start_date, end_date = re.findall(r'\d{4}-\d{2}-\d{2}', date_text)
start_date, end_date

('2023-07-17', '2023-07-24')

In [28]:
!pip list | grep beautiful

beautifulsoup4            4.12.3


In [29]:
"""
ticker heading
"""
soup.find('strong', string=re.compile(r'NEWS SUMMARY for.*'))

<strong><em>NEWS SUMMARY for ('TSLA', 42), which changed on 2.43% last trading day: </em></strong>

In [30]:
ticker_heading_strong_tag = soup.find('strong', string=re.compile(r'NEWS SUMMARY for.*'))
text_content = ticker_heading_strong_tag.text

In [31]:
pattern = r"NEWS SUMMARY for \('(\w+)', (\d+)\), which changed on ([-\d.]+)% last trading day:"
match = re.search(pattern, text_content)

In [32]:
match

<re.Match object; span=(0, 71), match="NEWS SUMMARY for ('TSLA', 42), which changed on 2>

In [33]:
if match:
    ticker = match.group(1)
    news_count = int(match.group(2))
    growth = float(match.group(3))
        
    print({
        'ticker': ticker,
        'news_count': count,
        'growth': percentage
    })

NameError: name 'count' is not defined

In [None]:
start_date, end_date, summary_type,ticker, news_count, growth,

In [None]:
model # only present when the type is MARKET.
text # how to get the text for the news? it doesn't have any common structure to work with.

In [55]:
html_content_str[:300]

'<header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845'

In [56]:
soup = BeautifulSoup(html_content_str, 'html.parser')
data_list = []

1. Check the prettified bversion of the html data
2. get the individual components one by one
3. use the `find_all_next()` and `find_next()`
4. then match the regex if needed to get the info as required
    1. re-use the code below to understnad what needs to be done.

In [57]:
# IN THE HTML FORMAT
print(soup.prettify())

<header>
 <h1>
  Week 17-24 July 2023
 </h1>
</header>
<figure>
 <img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/>
</figure>
<div class="t-redactor__embedcode">
 <script async="" crossorigin="anonymous" src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845276189467216">
 </script>
</div>
<div class="t-redactor__text">
 <strong>
  DISCLAIMER
 </strong>
 <br/>
 The content provided below was generated by AI (OpenAI's ChatGPT) using titles and descriptions from a selection of 5000 published financial news articles. The information presented should be used for informational purposes only and does not constitute financial advice or investment recommendations. We advise readers to conduct their own research and consult with a qualified financial advisor before making any investment decisions. The AI-generated content may not reflect the most current market conditions or developments and should be considered

In [72]:
# EXtracting the Individual News
for strong_tag in soup.find_all('strong'):
    print(f'----------START OF {strong_tag}---------------------')
    print(f"strong_tag\t\t\t {strong_tag}")
    print(f"strong_tag.em\t\t\t {strong_tag.em}")
    print(f"strong_tag.next_sibling\t\t {strong_tag.next_sibling}")
    print(f"strong_tag.next_sibling.next_sibling\t\t {strong_tag.next_sibling.next_sibling}") # this gets the summary text for individual news
    print(f"strong_tag.next_sibling.next_sibling.next_sibling\t\t {strong_tag.next_sibling.next_sibling.next_sibling}") # extracts the text for the multiple tickers
    # the problem now is that we are not able to extract all the market news under the strong tag.
    # the solution would be to use the div class tag for extracting the market news that is required.
        # we can use the code ''' for div in soup.find_all('div', class_='t-redactor__text'): ''' and modify the same to get the required results
    print('----------END OF ONE STRONG TAG-------------------------\n\n')

----------START OF <strong>DISCLAIMER</strong>---------------------
strong_tag			 <strong>DISCLAIMER</strong>
strong_tag.em			 None
strong_tag.next_sibling		 <br/>
strong_tag.next_sibling.next_sibling		 The content provided below was generated by AI (OpenAI's ChatGPT) using titles and descriptions from a selection of 5000 published financial news articles. The information presented should be used for informational purposes only and does not constitute financial advice or investment recommendations. We advise readers to conduct their own research and consult with a qualified financial advisor before making any investment decisions. The AI-generated content may not reflect the most current market conditions or developments and should be considered as a general summary of the selected news articles.
strong_tag.next_sibling.next_sibling.next_sibling		 <br/>
----------END OF ONE STRONG TAG-------------------------


----------START OF <strong>INDIVIDUAL NEWS SUMMARY</strong>----------------

In [63]:
for strong_tag in soup.find_all('strong'):
    em_tag = strong_tag.find('em')
    if em_tag:
        print('----------START OF ONE STRONG TAG---------------------')
        print(em_tag.text.strip())
        print('----------END OF ONE STRONG TAG-------------------------\n\n')
    
    # Check for the next sibling that is an <em> tag
    next_em = strong_tag.find_next_sibling('em')
    if next_em:
        print('----------START OF FOLLOWING EM TAG---------------------')
        print(next_em.text.strip())
        print('----------END OF FOLLOWING EM TAG-------------------------\n\n')

----------START OF FOLLOWING EM TAG---------------------
Tesla (TSLA) had a mixed week, with billionaire investor Chamath Palihapitiya believing the company is on the verge of a major breakthrough, while investor David Trainer argues that Tesla is overvalued. Tesla's second-quarter earnings report showed a rise in sales but highlighted concerns about profitability, causing a drop in the stock price. Microsoft (MSFT) partnered with Birlasoft to establish a Generative AI Centre of Excellence, with Morgan Stanley maintaining an Overweight rating on Microsoft. Warren Buffett's investment in Apple (AAPL) has resulted in Berkshire Hathaway's stake in the company ballooning to nearly $180 billion, but concerns remain around weak demand and a potential delay in the next iPhone launch. Amazon (AMZN) is planning to provide high-speed internet worldwide through Project Kuiper, and its stock may be positively impacted. Netflix (NFLX) reported mixed quarterly results, with a beat in subscriber esti

In [52]:
for div in soup.find_all('div', class_='t-redactor__text'):
    # if 'DISCLAIMER' in div.text: # The DISCLIAMER and individual news are in the same div tag in september version
    #         continue
    data = {
            "start_date": "",
            "end_date": "",
            "type": "",
            "model": "",
            "ticker": "",
            "count_news": "",
            "growth": "",
            "text": ""
        }
    # Extract summary type and dates
    print(div.text)
    summary_match = re.search(r'(INDIVIDUAL NEWS SUMMARY|MARKET NEWS SUMMARY)', div.text) # old XML file format
    # summary_match = re.search(r'(INDIVIDUAL TICKERS NEWS SUMMARY|MARKET NEWS SUMMARY)', div.text) # new XML file format
    if summary_match:
        data['type'] = summary_match.group(1)

    date_match = re.search(r'Start date for the articles: (\d{4}-\d{2}-\d{2}); End date for the articles: (\d{4}-\d{2}-\d{2})', div.text)
    if date_match:
        data['start_date'] = date_match.group(1)
        data['end_date'] = date_match.group(2)

    # Extract model information
    model_match = re.search(r'\[(GPT3\.5 model|GPT4 model) (\d+ (day|week)) summary\]', div.text)
    if model_match:
        data['model'] = f"{model_match.group(1)} {model_match.group(2)}"

    # Extract individual news summaries
    for strong_tag in div.find_all('strong'):
        news_match = re.search(r"NEWS SUMMARY for \('(\w+)', (\d+)\), which changed on ([-\d.]+)% last trading day:", strong_tag.text)
        if news_match:
            data['ticker'] = news_match.group(1)
            data['count_news'] = news_match.group(2)
            data['growth'] = news_match.group(3)
            data['text'] = strong_tag.next_sibling.strip()
            print(data)
            data_list.append(data.copy())  # Add a copy of the dictionary to the list
        elif 'MARKET NEWS SUMMARY' in strong_tag.text:
            market_match = re.search(r"MARKET NEWS SUMMARY \('multiple_tickers', (\d+)\)", strong_tag.text)
            if market_match:
                data['ticker'] = 'multiple_tickers'
                data['count_news'] = market_match.group(1)
                data['text'] = strong_tag.find_next(string=True).strip()
                data_list.append(data.copy())  # Add a copy of the dictionary to the list

DISCLAIMERThe content provided below was generated by AI (OpenAI's ChatGPT) using titles and descriptions from a selection of 5000 published financial news articles. The information presented should be used for informational purposes only and does not constitute financial advice or investment recommendations. We advise readers to conduct their own research and consult with a qualified financial advisor before making any investment decisions. The AI-generated content may not reflect the most current market conditions or developments and should be considered as a general summary of the selected news articles.***************************************************************INDIVIDUAL NEWS SUMMARYStart date for the articles: 2023-07-17; End date for the articles: ' '2023-07-24 '***************************************************************TL;DR : Tesla (TSLA) had a mixed week, with billionaire investor Chamath Palihapitiya believing the company is on the verge of a major breakthrough, while

TypeError: 'NoneType' object is not callable

In [24]:
data_list

[{'start_date': '',
  'end_date': '',
  'type': 'MARKET NEWS SUMMARY',
  'model': '',
  'ticker': 'multiple_tickers',
  'count_news': '261',
  'growth': '',
  'text': "MARKET NEWS SUMMARY ('multiple_tickers', 261)"},
 {'start_date': '',
  'end_date': '',
  'type': 'MARKET NEWS SUMMARY',
  'model': '',
  'ticker': 'multiple_tickers',
  'count_news': '2108',
  'growth': '',
  'text': "MARKET NEWS SUMMARY ('multiple_tickers', 2108)"}]

# Ollama - overview

In [33]:
from openai import OpenAI

In [34]:
# OpenAI client setup with increased timeout
# Read on Ollama - OpenAI compatibility: https://github.com/ollama/ollama/blob/main/docs/openai.md
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
    timeout = 5 * 60.0  # Increase timeout to 3*60 seconds
)

MAX_RETRIES = 3
RETRY_DELAY = 5
BATCH_SIZE = 1

In [35]:
def llm(prompt, model, temperature=0.0, ):
    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=model,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}]
            )
            return response
        except Exception as e:
            if attempt < MAX_RETRIES - 1:
                print(f"Attempt {attempt + 1} failed. Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                raise e

In [36]:
!ollama list

NAME                                ID              SIZE      MODIFIED   
phi3.5:3.8b-mini-instruct-q3_K_M    0d7837063f04    2.0 GB    5 days ago    
phi3.5:latest                       61819fb370a3    2.2 GB    7 days ago    
qwen2.5:1.5b                        65ec06548149    986 MB    7 days ago    
llama3.2:latest                     a80c4f17acd5    2.0 GB    7 days ago    


In [37]:
model = "llama3.2:latest"
# model = "qwen2.5:1.5b"
# model = "phi3.5:latest"
# model = "phi3.5:3.8b-mini-instruct-q3_K_M"

In [86]:
# example calling a local LLM via OpenAI compatibility
res = llm(prompt = "Tell me a joke",model=model)

In [87]:
type(res)

openai.types.chat.chat_completion.ChatCompletion

In [88]:
print(res.choices[0].message.content)

Sure, here'm with one: Why don't secrets work in an orchestra? Because everyone plays their own tune! (This is based on the pun playing off "conductor" and "secret conductors.")

Remember that humor can be subjective; what I find funny might not resonate with someone else. But hopefully, this little jest brings a smile to your face as it does mine when shared among friends!


In [38]:
ind_prompt_template = """Expert Web Scraper.
HTML Content: {html_str}

Extract text AS IT IS from given HTML:
- Date ranges
- mentioned ticker 
- news count
- growth percentage
- news for the ticker

Format:
{{
  "content": [
    {{
      "type": "individual",
      "start_date": <start date for articles>,
      "end_date": <end date for articles>,
      "ticker": <ticker symbol from news>,
      "count": <articles count from news>,
      "growth": <growth %>
      "text": <news for the ticker from html>,
    }},
    // repeat for all news
  ]
}}

Constraints:
Return JSON only. 
"""

In [39]:
ind_test_html = """<header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845276189467216"\n     crossorigin="anonymous"></script></div><div class="t-redactor__text"><strong>DISCLAIMER</strong><br />The content provided below was generated by AI (OpenAI\'s ChatGPT) using titles and descriptions from a selection of 5000 published financial news articles. The information presented should be used for informational purposes only and does not constitute financial advice or investment recommendations. We advise readers to conduct their own research and consult with a qualified financial advisor before making any investment decisions. The AI-generated content may not reflect the most current market conditions or developments and should be considered as a general summary of the selected news articles.<br /><br />***************************************************************<br /><strong>INDIVIDUAL NEWS SUMMARY</strong><br /><strong>Start date for the articles: 2023-07-17; End date for the articles: \' \'2023-07-24 \'</strong><br />***************************************************************<br /><strong><em>TL;DR :</em></strong><em> Tesla (TSLA) had a mixed week, with billionaire investor Chamath Palihapitiya believing the company is on the verge of a major breakthrough, while investor David Trainer argues that Tesla is overvalued. Tesla\'s second-quarter earnings report showed a rise in sales but highlighted concerns about profitability, causing a drop in the stock price. Microsoft (MSFT) partnered with Birlasoft to establish a Generative AI Centre of Excellence, with Morgan Stanley maintaining an Overweight rating on Microsoft. Warren Buffett\'s investment in Apple (AAPL) has resulted in Berkshire Hathaway\'s stake in the company ballooning to nearly $180 billion, but concerns remain around weak demand and a potential delay in the next iPhone launch. Amazon (AMZN) is planning to provide high-speed internet worldwide through Project Kuiper, and its stock may be positively impacted. Netflix (NFLX) reported mixed quarterly results, with a beat in subscriber estimates but missed revenue expectations. Verizon (VZ) partnered with the VA Palo Alto Health Care System to create the first full spectrum 5G hospital. Carvana (CVNA) announced positive quarterly results, a favorable debt deal, and a debt restructuring deal with bondholders, resulting in a stock surge. The overall stock market rally is expected to continue as long as there is no sign of a recession, but there are concerns about the US economy\'s state and a possible recession by February. Various headlines also covered topics such as cryptocurrency, technology, politics, and the world economy.</em><br /><br /><strong><em>NEWS SUMMARY for (\'TSLA\', 42), which changed on 2.43% last trading day: </em></strong><br />Billionaire investor Chamath Palihapitiya believes that Tesla is experiencing its "iPhone moment" and is on the verge of a major breakthrough. Meanwhile, investor David Trainer argues that Tesla is overvalued by over 1,000% due to its "disconnected" fundamentals. Tesla\'s second-quarter earnings report showed a rise in sales but also highlighted concerns about profitability and margins. Tesla\'s stock price dropped around 9% in response to the report. Despite mixed opinions, some analysts remain bullish on Tesla\'s future growth potential.<br /><br /><br /><strong><em>NEWS SUMMARY for (\'MSFT\', 11), which changed on 0.06% last trading day:</em></strong> <br />Indian company Birlasoft has partnered with Microsoft to establish a Generative AI Centre of Excellence. The aim is to accelerate value creation and foster innovation in the adoption of Generative AI, providing enterprise solutions across industries. The center will bring together Birlasoft\'s industry expertise and Microsoft Azure OpenAI Service to facilitate research, training, and collaboration. Morgan Stanley analyst Keith Weiss maintains an Overweight rating on Microsoft and believes the company has strong potential in the Generative AI space. <br /><br /><strong><em>NEWS SUMMARY for (\'AAPL\', 11), which changed on 0.51% last trading day:</em> </strong><br />Warren Buffett\'s investment in Apple has resulted in Berkshire Hathaway\'s stake in the company ballooning to nearly $180 billion, with the stock price surging nearly 50%. The rise in Apple\'s stock price is partly attributed to excitement around artificial intelligence. However, there are concerns of weak demand and a potential delay in the launch of the next iPhone, which could result in lower iPhone sales. Despite this, Morgan Stanley is bullish on India\'s growth potential for Apple and expects strong revenue and user growth in the country. <br /><br /><strong><em>NEWS SUMMARY for (\'AMZN\', 11), which changed on -0.94% last trading day: </em></strong><br />Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company\'s stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including "The Lost Flowers of Alice Hart" and "Cocaine Bear." A Truist analyst predicts robust revenue for Amazon, driven by strong e-commerce demand and cost optimization. However, Amazon is facing challenges regarding warehouse safety as hearings on OSHA violations begin. <br /><br /><strong><em>NEWS SUMMARY for (\'NFLX\', 19), which changed on -0.23% last trading day: </em></strong><br />Netflix\'s stock has been fluctuating after the company reported mixed quarterly results. While it beat subscriber estimates with 5.9 million paid net adds, its Q2 revenue of $8.19 billion missed expectations. However, some analysts remain optimistic about Netflix\'s future growth and expect it to rebound. The company is also rolling out paid sharing to more countries and has dropped its cheapest streaming plan without ads in the US and UK to push users towards its ad-supported service. Despite these developments, some caution that the stock may be overvalued. <br /><br /><strong><em>NEWS SUMMARY for (\'VZ\', 13), which changed on 1.39% last trading day: </em></strong><br />Verizon Business and the VA Palo Alto Health Care System have partnered to deploy a full spectrum private 5G network, creating the first full spectrum 5G hospital. This network will enable unprecedented care for veterans and improve healthcare services. Verizon Communications will report second-quarter 2023 earnings on July 25. Verizon is also continuing to upgrade its network in various locations, including Kentucky, Cleveland, and Daytona. Despite a recent plunge in stock value, Verizon\'s high dividend yield and potential for capital appreciation make it an attractive investment. <br /><br /><strong><em>NEWS SUMMARY for (\'CVNA\', 21), which changed on -2.46% last trading day: </em></strong><br />Carvana, an online used car dealer, announced major updates that could have significant implications for investors. The company reported solid quarterly results, a favorable debt deal, and newly enabled access to equity capital, reducing liquidity risks. Despite a recent downgrade from RBC Capital due to concerns over margins and unit economics, other analysts have maintained positive outlooks for the stock. Carvana\'s stock soared nearly 50% after its second-quarter earnings report beat expectations. The company also announced a debt restructuring deal with bondholders, expected to improve liquidity. <br /><br /><strong><em>NEWS SUMMARY for (\'DJIA\', 8), which changed on -0.58% last trading day: </em></strong><br />The stock market rally is expected to continue as long as there is no sign of a recession, according to Steve Eisman of Neuberger Berman. However, experts caution that transportation stocks should not be overestimated as an indicator of the overall economy. US bank lending has fallen slightly, while housing starts and retail sales have also shown weakness. Overall, the US economy is in a precarious state with concerns of a possible recession by February. <br /><br /><strong><em>NEWS SUMMARY for (\'SPY\', 7), which changed on 0.44% last trading day: </em></strong><br />The news includes various headlines related to the stock market, cryptocurrency, technology, politics, and the world economy. Some notable points include Cathie Wood\'s flagship ETF exiting Alibaba and other stocks, Elon Musk\'s Tesla reportedly removing Bitcoin as a payment option, and a prediction by Tom Lee that the S&amp;P 500 could hit 5,000 in 5 months. Other headlines discuss investor sentiment improving, regulatory actions in the cryptocurrency sector, and political developments in the US and world politics.<br /><br /></div>"""

In [42]:
prompt = ind_prompt_template.format(html_str=ind_test_html) # format(name_in_strin = variable_name)

# print(prompt)

Expert Web Scraper.
HTML Content: <header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845276189467216"
     crossorigin="anonymous"></script></div><div class="t-redactor__text"><strong>DISCLAIMER</strong><br />The content provided below was generated by AI (OpenAI's ChatGPT) using titles and descriptions from a selection of 5000 published financial news articles. The information presented should be used for informational purposes only and does not constitute financial advice or investment recommendations. We advise readers to conduct their own research and consult with a qualified financial advisor before making any investment decisions. The AI-generated content may not reflect the most current market conditions or developments and should be con

In [43]:
len(prompt)

9514

In [26]:
%%time
#Let's try it locally
scraped_data = llm(prompt = prompt, model=model)

CPU times: user 31.7 ms, sys: 12.2 ms, total: 43.9 ms
Wall time: 3min 9s


In [27]:
type(scraped_data)

openai.types.chat.chat_completion.ChatCompletion

In [36]:
# Result: 
scraped_data.choices[0].message.content

'{\n  "content": [\n    {\n      "type": "individual",\n      "start_date": null,\n      "end_date": null,\n      "ticker": "AMZN",\n      "count": 11,\n      "growth": "-0.94%",\n      "text": "Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company\'s stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including \\"The Lost Flowers of Alice Hart\\" and \\"Cocaine Bear.\\" A Truist analyst predicts robust revenue for Amazon, driven by strong e-commerce demand and cost optimization. However, Amazon is facing challenges regarding warehouse safety as hearings on OSHA violations begin."\n    },\n    {\n      "type": "individual",\n      "start_date": null,\n      "end_date": null,\n      "ticker": "NFLX",\n      "count": 19,\n      "growth": "-0.23%",\n      "text": "Netfl

In [29]:
print(scraped_data.choices[0].message.content)

{
  "content": [
    {
      "type": "individual",
      "start_date": null,
      "end_date": null,
      "ticker": "AMZN",
      "count": 11,
      "growth": "-0.94%",
      "text": "Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company's stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including \"The Lost Flowers of Alice Hart\" and \"Cocaine Bear.\" A Truist analyst predicts robust revenue for Amazon, driven by strong e-commerce demand and cost optimization. However, Amazon is facing challenges regarding warehouse safety as hearings on OSHA violations begin."
    },
    {
      "type": "individual",
      "start_date": null,
      "end_date": null,
      "ticker": "NFLX",
      "count": 19,
      "growth": "-0.23%",
      "text": "Netflix's stock has been fluc

In [47]:
def save_as_json_file(filename, json_string):
    json_data = json.loads(json_string)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)
        print(f"{filename} save in the current dir")

In [48]:
filename = 'indi_news_response.json'
json_string = scraped_data.choices[0].message.content

In [49]:
save_as_json_file(filename, json_string)

indi_news_response.json save in the current dir


Observations for individual news extractions:

Correctly working:
1. the data is extracted in the required format
2. the tickers are also extracted properly.
3. the sentences for the respective tickers are also extracted

NOT WORKING Properly:
1. the start and end dates are not extracted
2. all the tickers are not extracted like MSFT and AMZN, only some.
    1. maybe the prompt needed to be changed.
    2. 

## Market_News


In [44]:
market_prompt_template="""Act as an Expert Web Scraper.
HTML Content: {html_str}
HTML Content format:
[<model_name> <period> summary] MARKET NEWS SUMMARY ('multiple_tickers', <news_count> ) -- i.e. <news_count> news summary for the last 24 hours before <end_date> UTC time:
****************************
<news_summary>

Extract text AS IT IS from given HTML:
- model name
- period for summary 
- news count
- market news summary


Output JSON format:
{{
  "content": [
    {{
      "type": "market_"+<period>,
      "end_date": <end_date>,
      "start_date": <24 hours before end_date>,
      "ticker": "multiple_tickers",
      "count": <news_count>,
      "model": <model_name>
      "text": <news_summary>,
    }},
  ]
}}

Constraints:
Just return the JSON Output only.
"""

In [45]:
mark_1day__test_html = """<div class="t-redactor__text">***************************************************************<br /><strong>[</strong><strong style="color: rgb(120, 110, 11);">GPT3.5 model 1 day summary</strong><strong>] MARKET NEWS SUMMARY ('multiple_tickers', 84) -- i.e. 84 news summary for the last 24 hours before 07/10/2024 20:35 UTC time:</strong><br />***************************************************************<br /><br />Market expectations:<br />- Vanguard ETFs such as VIG and VYM are seen as providing stability in volatile markets, likely indicating a cautious sentiment among investors.<br />- The rise in Chinese stocks following government stimulus measures might have shaped expectations of potential entry points for investors, but technical indicators signaling overbought conditions suggest a more cautious outlook.<br />- The class action lawsuit against AMMO, Inc. (POWW) might have dampened market sentiment toward the company, impacting expectations for its future performance.<br /><br />Big trends:<br />- The global fertilizers market, commercial lawn mower market, and network attached storage market are projected to grow significantly, indicating a growing demand for these products and services.<br />- The U.S. multiomics market and the global in-vitro diagnostics market are expected to grow, reflecting a trend in the increasing prevalence of chronic and infectious diseases, as well as technological advancements in the healthcare industry.<br />- The growing interest in artificial intelligence (AI) is evident from the expected significant growth in the AI in diagnostics market and the European Generative AI market.<br />- The electric boats market is anticipated to grow rapidly as a result of increasing environmental regulations and technological advancements.<br /><br />Sentiment:<br />- There are varying sentiments towards specific stocks and industries, with some, like Rent the Runway, Rivian, and Exelixis, receiving analyst price targets suggesting significant upside, and others, like Trump Media and AMMO, Inc., facing class action lawsuits leading to increased investor caution.<br />- Concerns about inflationary pressure and potential impacts of Hurricane Milton on insurance stocks suggest a more cautious sentiment towards these sectors.<br />- Analysts are bullish on high-yield dividend stocks such as Chevron, Kraft Heinz, and Truist Financial amid dropping interest rates, indicating a positive sentiment toward these companies.</div>"""

In [46]:
market_prompt = market_prompt_template.format(html_str=mark_1day__test_html) # format(name_in_strin = variable_name)

In [47]:
print(market_prompt)

Act as an Expert Web Scraper.
HTML Content: <div class="t-redactor__text">***************************************************************<br /><strong>[</strong><strong style="color: rgb(120, 110, 11);">GPT3.5 model 1 day summary</strong><strong>] MARKET NEWS SUMMARY ('multiple_tickers', 84) -- i.e. 84 news summary for the last 24 hours before 07/10/2024 20:35 UTC time:</strong><br />***************************************************************<br /><br />Market expectations:<br />- Vanguard ETFs such as VIG and VYM are seen as providing stability in volatile markets, likely indicating a cautious sentiment among investors.<br />- The rise in Chinese stocks following government stimulus measures might have shaped expectations of potential entry points for investors, but technical indicators signaling overbought conditions suggest a more cautious outlook.<br />- The class action lawsuit against AMMO, Inc. (POWW) might have dampened market sentiment toward the company, impacting expecta

In [48]:
len(market_prompt)

3216

In [49]:
model

'llama3.2:latest'

In [95]:
%%time
#Let's try it locally
market_scraped_data = llm(prompt = market_prompt, model=model)

CPU times: user 17.1 ms, sys: 9.65 ms, total: 26.8 ms
Wall time: 1min 57s


In [96]:
# Result: 
market_scraped_data.choices[0].message.content

'{\n  "content": [\n    {\n      "type": "market_1 day",\n      "end_date": "07/10/2024 20:35 UTC",\n      "start_date": "07/09/2024 16:35 UTC",\n      "ticker": "multiple_tickers",\n      "count": 84,\n      "model": "GPT3.5 model",\n      "text": "***************************************************************<br /><strong>[</strong><strong style=\\"color: rgb(120, 110, 11);\\">GPT3.5 model 1 day summary</strong><strong>] MARKET NEWS SUMMARY (\'multiple_tickers\', 84) -- i.e. 84 news summary for the last 24 hours before 07/10/2024 20:35 UTC time:</strong><br />***************************************************************<br /><br />Market expectations:<br />- Vanguard ETFs such as VIG and VYM are seen as providing stability in volatile markets, likely indicating a cautious sentiment among investors.<br />- The rise in Chinese stocks following government stimulus measures might have shaped expectations of potential entry points for investors, but technical indicators signaling overb

In [97]:
print(market_scraped_data.choices[0].message.content)

{
  "content": [
    {
      "type": "market_1 day",
      "end_date": "07/10/2024 20:35 UTC",
      "start_date": "07/09/2024 16:35 UTC",
      "ticker": "multiple_tickers",
      "count": 84,
      "model": "GPT3.5 model",
      "text": "***************************************************************<br /><strong>[</strong><strong style=\"color: rgb(120, 110, 11);\">GPT3.5 model 1 day summary</strong><strong>] MARKET NEWS SUMMARY ('multiple_tickers', 84) -- i.e. 84 news summary for the last 24 hours before 07/10/2024 20:35 UTC time:</strong><br />***************************************************************<br /><br />Market expectations:<br />- Vanguard ETFs such as VIG and VYM are seen as providing stability in volatile markets, likely indicating a cautious sentiment among investors.<br />- The rise in Chinese stocks following government stimulus measures might have shaped expectations of potential entry points for investors, but technical indicators signaling overbought conditio

### Saving File

In [98]:
filename = 'market_1d_3.5_news_response.json'
json_string = market_scraped_data.choices[0].message.content

save_as_json_file(filename, json_string)

market_1d_3.5_news_response.json save in the current dir


## Combining into one content element

In [53]:
ind_news_json_fn = "indi_news_response.json"
market_news_json_fn = "market_1d_3.5_news_response.json"

In [55]:
with open(ind_news_json_fn, 'r') as f1, open(market_news_json_fn, 'r') as f2:
    ind_news_json = json.load(f1)
    market_news_json = json.load(f2)

In [59]:
type(ind_news_json), type(market_news_json)

(dict, dict)

In [63]:
ind_news_json, market_news_json

({'content': [{'type': 'individual',
    'start_date': None,
    'end_date': None,
    'ticker': 'AMZN',
    'count': 11,
    'growth': '-0.94%',
    'text': 'Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company\'s stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including "The Lost Flowers of Alice Hart" and "Cocaine Bear." A Truist analyst predicts robust revenue for Amazon, driven by strong e-commerce demand and cost optimization. However, Amazon is facing challenges regarding warehouse safety as hearings on OSHA violations begin.'},
   {'type': 'individual',
    'start_date': None,
    'end_date': None,
    'ticker': 'NFLX',
    'count': 19,
    'growth': '-0.23%',
    'text': "Netflix's stock has been fluctuating after the company reported mixed quarterly res

In [64]:
dict_a = {'content':[
    {'type': 'individual',
   'start_date': None,
   'end_date': None,
   'ticker': 'SPY',
   'count': 7,
   'growth': '0.44%',
   'text': 'texta'}
]}
dict_b = {'content':[
    {'type': 'market_1 day',
    'end_date': '07/10/2024 20:35 UTC',
    'start_date': '07/09/2024 16:35 UTC',
    'ticker': 'multiple_tickers',
    'count': 84,
    'model': 'GPT3.5 model',
    'text': 'textb'}
]}

In [65]:
combined_content = dict_a['content'] + dict_b['content']

In [66]:
combined_dict = {'content': combined_content}

In [67]:
combined_dict

{'content': [{'type': 'individual',
   'start_date': None,
   'end_date': None,
   'ticker': 'SPY',
   'count': 7,
   'growth': '0.44%',
   'text': 'texta'},
  {'type': 'market_1 day',
   'end_date': '07/10/2024 20:35 UTC',
   'start_date': '07/09/2024 16:35 UTC',
   'ticker': 'multiple_tickers',
   'count': 84,
   'model': 'GPT3.5 model',
   'text': 'textb'}]}

In [69]:
combined_content = {'content':ind_news_json['content'] + market_news_json['content']}
combined_content

{'content': [{'type': 'individual',
   'start_date': None,
   'end_date': None,
   'ticker': 'AMZN',
   'count': 11,
   'growth': '-0.94%',
   'text': 'Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company\'s stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including "The Lost Flowers of Alice Hart" and "Cocaine Bear." A Truist analyst predicts robust revenue for Amazon, driven by strong e-commerce demand and cost optimization. However, Amazon is facing challenges regarding warehouse safety as hearings on OSHA violations begin.'},
  {'type': 'individual',
   'start_date': None,
   'end_date': None,
   'ticker': 'NFLX',
   'count': 19,
   'growth': '-0.23%',
   'text': "Netflix's stock has been fluctuating after the company reported mixed quarterly results. While it

In [70]:
type(combined_content)

dict

In [71]:
type(rss_feed)

dict

In [73]:
new_rss_feed = rss_feed.copy()

In [77]:
new_rss_feed['items'][0]['content'] #0 should be replaced with i for looping through all the data in the items key
# the above would be the input to the llm



In [78]:
new_rss_feed['items'][0]['content'] = combined_content['content']
new_rss_feed

{'meta': {'title': 'FinNews',
  'link': 'https://pythoninvest.com',
  'description': 'Weekly News Digest with the help of ChatGPT',
  'language': 'en'},
 'items': [{'title': 'Week 17-24 July 2023',
   'link': 'https://pythoninvest.com/tpost/yk09rupzv1-week-17-24-july-2023',
   'pubDate': 'Mon, 24 Jul 2023 20:00:00 +0300',
   'author': 'Ivan Brigida',
   'category': 'FinNews',
   'description': 'Market summary for the week 17-24 July',
   'content': [{'type': 'individual',
     'start_date': None,
     'end_date': None,
     'ticker': 'AMZN',
     'count': 11,
     'growth': '-0.94%',
     'text': 'Amazon is planning to provide high-speed internet worldwide through its Project Kuiper. This initiative may have a positive impact on the company\'s stock. Users on Zacks.com have been closely following Amazon, indicating strong interest in the stock. Amazon Prime Video and Freevee will have new content available in August 2023, including "The Lost Flowers of Alice Hart" and "Cocaine Bear." A

## Data Frame

# Modular methods

In [10]:
import feedparser
import json
import os
import xml.etree.ElementTree as ET

In [17]:
# If the RSS feed format would be same , then the input to the method would be an element of the python dict `feed`. 
# So input to this method would be an element which is of the type `dict`.
def extract_turbo_content(entry_element):
    """
    Extracts turbo:content from the raw XML
    """
    try:
        if entry_element:  
            content = entry_element.get('turbo_content')  
        return content if content else None

    except (AttributeError, KeyError, TypeError) as e:
        # AttributeError: If entry_element doesn't support dictionary operations
        # KeyError: If 'turbo_content' doesn't exist
        # TypeError: If entry_element is not of the expected type
        print(f"Error extracting turbo content: {str(e)}")

In [19]:
def parse_rss_to_json(feed_url, output_file_path):
    # Parse the RSS feed
    feed = feedparser.parse(feed_url)

    # Structure the feed data into JSON format
    rss_feed = {
        "meta": {
            "title": feed.feed.title,
            "link": feed.feed.link,
            "description": feed.feed.description,
            "language": feed.feed.language
        },
        "items": []
    }

    # Loop through each item in the feed and add it to the JSON
    for entry in feed.entries:
        # Extract turbo content if available
        turbo_content = extract_turbo_content(entry)
 
        item = {
            "title": entry.title,
            "link": entry.link,
            "pubDate": entry.published,
            "author": entry.author if "author" in entry else None,
            "category": entry.get("category", None),
            "description": entry.description,
            "content": turbo_content,  # Here we add the content field from turbo:content
            "enclosure": {
                "url": entry.enclosures[0].href,
                "type": entry.enclosures[0].type
            } if entry.enclosures else None
        }
        rss_feed["items"].append(item)

    # Convert the structured feed data to JSON string
    rss_feed_json = json.dumps(rss_feed, indent=4)

    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save the JSON string to a file
    with open(output_file_path, 'w') as json_file:
        json_file.write(rss_feed_json)

    print(f"RSS feed data saved to {output_file_path}")

In [20]:
if __name__ == "__main__":
    # RSS feed URL and output file path
    feed_url = 'https://pythoninvest.com/rss-feed-612566707351.xml'  # Fin news RSS
    output_file_path = '../data/input_news_feed.json' # folder relative to the current notebook
    
    # Parse and save the RSS feed to a JSON file
    parse_rss_to_json(feed_url, output_file_path)

RSS feed data saved to ../data/input_news_feed.json


# Improvments

1. Trying to make the HTML content produced in the upstream more uniform in nature. This will help the extracting the values more easier.
2. 