In [1]:
import json
import os
import string

import requests
from bs4 import BeautifulSoup

def get_news_articles(query):
    """Fetch title, URL and publication date of news articles matching ``query``.

    The NewsAPI key is read from the ``NEWSAPI_KEY`` environment variable so
    the credential is never stored in the notebook. It is sent in the
    ``X-Api-Key`` header rather than the query string, which keeps it out of
    the URLs that ``requests`` includes in its error messages.

    Args:
        query: Search phrase sent as the ``q`` parameter of the
            ``/v2/everything`` endpoint.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``publishedAt`` and
        ``content`` (``content`` is always ``None`` here; it is a placeholder
        to be filled in later). Entries without a URL are skipped. An empty
        list is returned when the key is not configured, the request fails,
        or the response body is not valid JSON.
    """
    api_key = os.environ.get('NEWSAPI_KEY')
    if not api_key:
        print("Failed to retrieve news articles. Error: NEWSAPI_KEY environment variable is not set.")
        return []

    url = 'https://newsapi.org/v2/everything'
    parameters = {
        'q': query,
        'pageSize': 10,
    }
    headers = {'X-Api-Key': api_key}

    try:
        # Same timeout as get_web_page_content so a stalled API cannot hang the run.
        response = requests.get(url, params=parameters, headers=headers, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Failed to retrieve news articles. Error: {e}")
        return []

    # .get() tolerates missing fields; "or []" tolerates "articles": null.
    articles_data = [{
        'title': article.get('title'),
        'url': article.get('url'),
        'publishedAt': article.get('publishedAt'),
        'content': None  # Placeholder for content
    } for article in (payload.get('articles') or []) if article.get('url')]
    print(articles_data)
    return articles_data

def get_web_page_content(url):
    """Download the page at ``url`` and return its body as text.

    Any ``requests`` failure (connection problem, timeout after 10 seconds,
    non-2xx status) is reported on stdout and results in ``None``.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        html = page.text
    except requests.RequestException as fetch_error:
        print(f"Failed to retrieve web page. Error: {fetch_error}")
        html = None
    return html

def parse_article_content(html_content):
    """Return the text of every ``<p>`` element in ``html_content``.

    Each paragraph's text is stripped of surrounding whitespace and the
    paragraphs are joined with newlines, in document order.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    stripped_paragraphs = [node.text.strip() for node in document.find_all('p')]
    return '\n'.join(stripped_paragraphs)

def sanitize_filename(title):
    """Turn an article title into a safe file-name stem.

    Only ASCII letters, digits, spaces and the characters ``-_.()`` are kept;
    spaces are then replaced by underscores.

    Args:
        title: The article title. May be ``None`` or empty.

    Returns:
        The sanitized stem, or ``'untitled'`` when the title is missing or
        contains no permitted characters (so callers never end up with an
        extension-only file name such as ``.json``).
    """
    if not title:
        return 'untitled'
    valid_chars = frozenset("-_.() " + string.ascii_letters + string.digits)
    filename = ''.join(c for c in title if c in valid_chars)
    filename = filename.replace(' ', '_')  # Replace spaces with underscores
    return filename or 'untitled'

def save_article_json(article_data):
    """Write ``article_data`` to ``<sanitized title>.json`` in the working directory.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped. Success or failure is reported on stdout; I/O
    errors are not propagated.
    """
    target_path = f"{sanitize_filename(article_data['title'])}.json"
    try:
        with open(target_path, mode='w', encoding='utf-8') as out_handle:
            json.dump(article_data, out_handle, indent=4, ensure_ascii=False)
        print(f"Article data saved to '{target_path}'")
    except OSError as write_error:  # IOError is an alias of OSError in Python 3
        print(f"Failed to save article data. Error: {write_error}")

def process_articles(query):
    """Fetch the articles for ``query``, scrape each page and save it as JSON.

    For every article the page at its URL is downloaded and its paragraph
    text stored under ``'content'``; when the download yields nothing, a fixed
    failure message is stored instead. Each article dict is then written to
    its own JSON file.
    """
    found_articles = get_news_articles(query)
    if not found_articles:
        print("No articles found or failed to retrieve articles.")
        return

    for entry in found_articles:
        page_html = get_web_page_content(entry['url'])
        entry['content'] = (
            parse_article_content(page_html)
            if page_html
            else "Failed to retrieve article content"
        )
        save_article_json(entry)

if __name__ == "__main__":
    # Run the full fetch -> scrape -> save flow for a sample query.
    # (`string` is imported with the other modules at the top of the cell,
    # so helper functions no longer rely on this guard having executed.)
    process_articles("bitcoin")

[{'title': 'What’s Behind the Bitcoin Price Surge? Vibes, Mostly', 'url': 'https://www.wired.com/story/bitcoin-price-record-economics/', 'publishedAt': '2024-03-12T18:13:56Z', 'content': None}]
Article data saved to 'Whats_Behind_the_Bitcoin_Price_Surge_Vibes_Mostly.json'


In [None]:
{
	"brk_gen": {
		"brxName": "Summaraize",
		"brxId": "c87d8422-6554-4ffb-ba1e-50146a67c53a",
		"dependantBrxIds": {},
		"description": "hackathon",
		"prompt": {
			"prompt": {
				"system": "\n### input\nbrx{{image_var}}\n###\n "
			}
		},
		"processParams": {
			"processType": 7
		}
	},
	"brxFieldData": {
		"schemaFields": {
			"image_var": {
				"fieldValueDataType": "string"
			}
		},
		"brxName": "Summaraize",
		"brxId": "c87d8422-6554-4ffb-ba1e-50146a67c53a"
	}
}

In [2]:
from transformers import pipeline

# Build the t5-small summarization pipeline once at module level;
# summarize_text() below reuses this single instance on every call.
# NOTE(review): in the recorded run the `transformers` import above failed with a
# protobuf "Descriptors cannot be created directly" error. The error text itself
# suggests downgrading protobuf to 3.20.x or lower, or setting
# PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python, as workarounds.
summarizer = pipeline("summarization", model="t5-small")

def summarize_text(text, max_length=100):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline (t5-small).

    Args:
        text: The text to be summarized. Input longer than the model accepts
            is truncated (``truncation=True``).
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline. This is a limit in model tokens, not words
            (default 100).

    Returns:
        The summary string, or ``""`` when ``text`` is ``None``, empty or
        whitespace-only (the model is not invoked in that case).
    """
    # Nothing to summarize: skip the model call instead of feeding it empty input.
    if not text or not text.strip():
        return ""
    summary = summarizer(text, max_length=max_length, truncation=True)
    return summary[0]["summary_text"]

# The article file was written as UTF-8 with non-ASCII characters left unescaped,
# so it must be read back as UTF-8 rather than with the platform default encoding.
with open("Whats_Behind_the_Bitcoin_Price_Surge_Vibes_Mostly.json", "r", encoding="utf-8") as f:
    captions_data = json.load(f)  # the saved article dict: title, url, publishedAt, content
summary = summarize_text(captions_data['content'])

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates