In [3]:
import logging
import time
from datetime import datetime, timedelta
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

In [4]:
class NewsArticle:
    """Plain record describing a single scraped news item.

    The constructor stores its five arguments unchanged on the instance;
    no validation or conversion is performed.
    """

    def __init__(self, title: str, content: str, date: str, source: str, url: str):
        # Headline and body text.
        self.title, self.content = title, content
        # Date string and the label of the outlet the item came from.
        self.date, self.source = date, source
        # Link associated with the item.
        self.url = url

In [6]:
class FinancialNewsScraper:
    """Collect company-related headlines from news listing pages and score their tone.

    Each entry in ``self.sources`` maps a source name to a listing-page URL.
    A source is only scraped when the class defines a matching
    ``_parse_<source>`` method; sources without one are skipped with a warning
    instead of failing on every request.

    NOTE(review): the listing URLs do not vary with the requested date, so the
    ``date`` stored on each article is the loop date at scrape time, not a
    publication date parsed from the page. Confirm whether real publication
    dates are needed.
    """

    # Seconds to pause after each listing-page request (polite crawling).
    REQUEST_DELAY_SECONDS = 2
    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT_SECONDS = 10
    # Upper bound on tokens sent to the model; FinBERT is a BERT model, whose
    # inputs are limited to 512 tokens.
    MAX_MODEL_TOKENS = 512
    # Base used to resolve the (usually relative) article links on Reuters.
    REUTERS_BASE_URL = 'https://reuters.com'

    def __init__(self):
        """Set request headers, the source table, and load the sentiment model.

        The ``pipeline(...)`` call loads ``yiyanghkust/finbert-tone`` eagerly,
        so constructing the scraper raises if the model cannot be loaded.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.sources = {
            'reuters': 'https://www.reuters.com/markets/companies',
            'marketwatch': 'https://www.marketwatch.com/markets',
            'investing': 'https://www.investing.com/news/stock-market-news'
        }
        self.sentiment_analyzer = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")

    def _parser_for(self, source: str):
        """Return the bound ``_parse_<source>`` method, or ``None`` if the class has none."""
        parser = getattr(self, f"_parse_{source}", None)
        return parser if callable(parser) else None

    def scrape_news(self, company: str, start_date: str, end_date: str) -> List[NewsArticle]:
        """Scrape every supported source once per day in the inclusive date range.

        Args:
            company: Name matched case-insensitively against article titles.
            start_date: First day, formatted ``YYYY-MM-DD``.
            end_date: Last day (inclusive), formatted ``YYYY-MM-DD``.

        Returns:
            Articles in discovery order, at most one per distinct ``url``.
            An empty list when ``start_date`` is after ``end_date``.

        Raises:
            ValueError: If either date string does not match ``YYYY-MM-DD``.
        """
        articles = []
        seen_urls = set()
        current_date = datetime.strptime(start_date, '%Y-%m-%d')
        last_date = datetime.strptime(end_date, '%Y-%m-%d')

        # Decide up front which sources can be parsed, so unsupported ones cost
        # neither an HTTP request nor a delay on every day of the range.
        active_sources = {}
        for source, url in self.sources.items():
            if self._parser_for(source) is None:
                logging.warning("No parser implemented for source '%s'; skipping it", source)
            else:
                active_sources[source] = url

        while current_date <= last_date:
            for source, url in active_sources.items():
                try:
                    found = self._scrape_source(source, url, company, current_date)
                except Exception as e:
                    logging.error("Error scraping %s: %s", source, e)
                    found = []
                # The same listing page is fetched for every day, so an article
                # seen on an earlier iteration would otherwise be returned again
                # under a different date. Keep only the first occurrence.
                for article in found:
                    if article.url in seen_urls:
                        continue
                    seen_urls.add(article.url)
                    articles.append(article)
                if self.REQUEST_DELAY_SECONDS > 0:
                    time.sleep(self.REQUEST_DELAY_SECONDS)  # Polite delay between sources
            current_date += timedelta(days=1)
        return articles

    def _scrape_source(self, source: str, url: str, company: str, date: datetime) -> List[NewsArticle]:
        """Fetch one listing page and hand it to the parser for ``source``.

        Returns an empty list (after logging) when the source has no parser,
        the request fails or times out, the server answers with an HTTP error
        status, or parsing raises.
        """
        parser = self._parser_for(source)
        if parser is None:
            logging.warning("No parser implemented for source '%s'; skipping it", source)
            return []
        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_SECONDS)
            # Do not parse 4xx/5xx error pages as if they were news listings.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return list(parser(soup, company, date))
        except Exception as e:
            logging.error("Error in _scrape_source for %s: %s", source, e)
            return []

    def _parse_reuters(self, soup: BeautifulSoup, company: str, date: datetime) -> List[NewsArticle]:
        """Build a ``NewsArticle`` for each ``<article>`` whose ``<h3>`` title mentions ``company``.

        Tags without an ``<h3>`` heading or without a linked ``<a href>`` are
        skipped quietly. The article URL is resolved against
        ``REUTERS_BASE_URL`` so both relative and absolute hrefs work, and the
        resolved URL is what gets stored on the article.
        """
        articles = []
        needle = company.lower()
        date_label = date.strftime('%Y-%m-%d')
        for article in soup.find_all('article'):
            try:
                heading = article.find('h3')
                if heading is None:
                    continue
                title = heading.text.strip()
                if needle not in title.lower():
                    continue
                anchor = article.find('a')
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    continue
                full_url = urljoin(self.REUTERS_BASE_URL, href)
                article_content = self._get_article_content(full_url)
                articles.append(NewsArticle(title, article_content, date_label, 'Reuters', full_url))
            except Exception as e:
                logging.error("Error parsing Reuters article: %s", e)
        return articles

    def _get_article_content(self, url: str) -> str:
        """Download ``url`` and return the text of all its ``<p>`` tags joined by spaces.

        Returns an empty string (after logging) when the request fails, times
        out, or the server answers with an HTTP error status.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return ' '.join(p.text.strip() for p in soup.find_all('p'))
        except Exception as e:
            logging.error("Error getting article content: %s", e)
            return ""

    def analyze_sentiment(self, articles: List[NewsArticle]) -> List[dict]:
        """Run the sentiment model on each article and return one dict per article.

        The article body is scored; when the body is empty or whitespace (as it
        is after a failed content download) the title is scored instead.
        Input is truncated to ``MAX_MODEL_TOKENS`` tokens so long articles do
        not exceed the model's input limit.

        Returns:
            A list of dicts with keys ``title``, ``content``, ``date``,
            ``source``, ``url``, ``sentiment`` (label) and ``score``.
        """
        results = []
        for article in articles:
            has_body = bool(article.content and article.content.strip())
            text = article.content if has_body else article.title
            sentiment = self.sentiment_analyzer(
                text, truncation=True, max_length=self.MAX_MODEL_TOKENS
            )
            top = sentiment[0]
            results.append({
                'title': article.title,
                'content': article.content,
                'date': article.date,
                'source': article.source,
                'url': article.url,
                'sentiment': top['label'],
                'score': top['score']
            })
        return results

# Example usage: build the scraper, collect 'Reliance' headlines for
# 2023-01-01 through 2023-01-07 (inclusive), then score every collected article.
scraper = FinancialNewsScraper()
articles = scraper.scrape_news(
    company='Reliance',
    start_date='2023-01-01',
    end_date='2023-01-07',
)
sentiment_results = scraper.analyze_sentiment(articles=articles)







pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]



tf_model.h5:   0%|          | 0.00/439M [00:00<?, ?B/s]



pytorch_model.bin:  72%|#######1  | 315M/439M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/439M [00:00<?, ?B/s]

ValueError: Could not load model yiyanghkust/finbert-tone with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForSequenceClassification'>, <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>, <class 'transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification'>). See the original errors:

while loading with AutoModelForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\pipelines\base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\models\auto\auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\modeling_utils.py", line 3805, in from_pretrained
    resolved_archive_file = cached_file(
                            ^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\utils\_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 860, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1009, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1543, in _download_to_tmp_and_move
    http_get(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 455, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device

while loading with TFAutoModelForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\pipelines\base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\models\auto\auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\modeling_tf_utils.py", line 2843, in from_pretrained
    resolved_archive_file = cached_file(
                            ^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\utils\_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 860, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1009, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1543, in _download_to_tmp_and_move
    http_get(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 455, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device

while loading with BertForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\pipelines\base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\modeling_utils.py", line 3805, in from_pretrained
    resolved_archive_file = cached_file(
                            ^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\utils\_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 860, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1009, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1543, in _download_to_tmp_and_move
    http_get(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 455, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device

while loading with TFBertForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\pipelines\base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\modeling_tf_utils.py", line 2843, in from_pretrained
    resolved_archive_file = cached_file(
                            ^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\utils\_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 860, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1009, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 1543, in _download_to_tmp_and_move
    http_get(
  File "C:\Users\OM\AppData\Roaming\Python\Python311\site-packages\huggingface_hub\file_download.py", line 455, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device


