In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time

import os

# Parameters (this will be replaced by Flask)
category = 'business'  # Example default value

# Define the API key and endpoint.
# SECURITY: credentials should not live in source control. The key is read
# from the NEWS_API_KEY environment variable when set; the literal below is
# kept only as a backward-compatible fallback and should be rotated/removed.
API_KEY = os.environ.get('NEWS_API_KEY') or 'f77f6d51bb194c3fb05be0d5db7f8971'
ENDPOINT = 'https://newsapi.org/v2/top-headlines'

def fetch_and_summarize_articles(category, api_key):
    """
    Fetch top headlines for a category and collect text and sources for one.

    Queries the News API endpoint (``ENDPOINT``) for the category, looks up
    one additional source URL per headline via ``find_sources``, and returns
    the first collected headline with its text and sources.

    Args:
      category: The news category to fetch articles for.
      api_key: The API key for the News API.

    Returns:
      A tuple ``(output, error)``:
        - On success: ``(dict, None)`` where the dict has the keys
          'headline', 'combined_text' and 'sources'.
        - On failure: ``(None, str)`` where the string describes the error.
    """
    # Domains tried, in order, when looking for an additional source.
    preferred_sources = (
        "nytimes.com", "bbc.com", "theguardian.com", "reuters.com",
        "cnn.com", "foxnews.com", "news.google.com", "dailymail.co.uk",
        "usatoday.com", "indiatimes.com", "news18.com", "forbes.com",
        "apnews.com", "people.com", "india.com", "nbcnews.com",
        "nypost.com", "hindustantimes.com", "washingtonpost.com",
    )
    max_headlines = 4  # Stop after collecting this many distinct headlines

    params = {
        'category': category,
        'pageSize': 100,
        'apiKey': api_key,
    }

    # Report transport problems through the (None, error) contract instead of
    # letting them escape; the timeout prevents hanging forever.
    try:
        response = requests.get(ENDPOINT, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        return None, f"Failed to fetch articles: {e}"

    try:
        data = response.json()
    except ValueError:
        return None, (
            "Failed to fetch articles: invalid JSON response "
            f"(HTTP {response.status_code})"
        )
    if not isinstance(data, dict):
        data = {}

    if response.status_code != 200 or data.get('status') != 'ok':
        return None, f"Failed to fetch articles: {data.get('message', 'Unknown error')}"

    filtered_headlines = {}

    for article in data.get('articles') or []:
        headline = article.get('title')
        # Skip untitled articles and headlines already collected *before*
        # doing any (slow, rate-limited) web searches for them.
        if not headline or headline in filtered_headlines:
            continue

        # Start with the source reported by the article itself, if any.
        source_name = (article.get('source') or {}).get('name')
        sources = [source_name] if source_name else []

        # Search for one additional source; find_sources handles its own
        # request errors and returns [] on failure.
        for source in preferred_sources:
            time.sleep(2)  # Delay between searches to avoid rate limiting
            additional_sources = find_sources(headline, source)
            if additional_sources:
                sources.extend(additional_sources[:1])
                break  # One additional source per headline is enough

        filtered_headlines[headline] = {
            'articles': [article],
            'sources': sources,
        }

        if len(filtered_headlines) >= max_headlines:
            break

    if not filtered_headlines:
        return None, "No headlines found with articles from preferred sources."

    selected_headline = next(iter(filtered_headlines))
    selected = filtered_headlines[selected_headline]
    selected_articles = selected['articles'][:3]

    # 'content' and 'description' may both be missing or None; drop empty
    # entries so the join cannot fail on a non-string.
    news_texts = [
        article.get('content') or article.get('description') or ''
        for article in selected_articles
    ]
    combined_text = " ".join(text for text in news_texts if text)

    output = {
        "headline": selected_headline,
        "combined_text": combined_text,
        "sources": selected['sources'],
    }

    return output, None

def _unwrap_search_redirect(url):
    """Return the destination of a Yahoo redirect link, or url unchanged."""
    from urllib.parse import unquote, urlparse

    try:
        host = (urlparse(url).hostname or '').lower()
    except ValueError:
        return url
    if not host.endswith('search.yahoo.com'):
        return url
    # NOTE(review): assumes Yahoo wraps results as ".../RU=<encoded url>/RK=..."
    # — confirm against live result pages.
    match = re.search(r'/RU=([^/]+)/', url)
    return unquote(match.group(1)) if match else url


def _url_belongs_to(url, domain):
    """Return True if url's host is domain or one of its subdomains."""
    from urllib.parse import urlparse

    try:
        host = (urlparse(url).hostname or '').lower()
    except ValueError:
        return False
    domain = domain.lower()
    return host == domain or host.endswith('.' + domain)


def find_sources(headline, source):
    """
    Finds additional sources for the given headline using Yahoo Search.

    Args:
        headline: The headline to search for.
        source: The specific news source (domain) to search.

    Returns:
        A list of up to one source URL hosted on ``source``, or an empty list
        if none is found or the search request fails.
    """
    search_query = f'"{headline}" site:{source}'

    try:
        time.sleep(1)  # Introduce a delay to avoid rate limit
        # Pass the query via params so quotes, spaces, '&', '#', etc. in the
        # headline are URL-encoded; the timeout prevents hanging forever.
        response = requests.get(
            "https://search.yahoo.com/search",
            params={'p': search_query},
            timeout=10,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching search results for {source} from Yahoo: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Return the first link that really points at the requested site. The
    # page also contains Yahoo's own navigation links (e.g. www.yahoo.com),
    # which must not be reported as a news source.
    for result in soup.find_all('a', href=re.compile(r'^https?://')):
        url = _unwrap_search_redirect(result['href'])
        if _url_belongs_to(url, source):
            return [url]

    return []

# Run the fetch for the configured category and report the outcome:
# the JSON-encoded result on success, or the error message on failure.
output, error = fetch_and_summarize_articles(category, API_KEY)

if not error:
  print(json.dumps(output))
else:
  # An error message was returned instead of a result
  print(f"Error: {error}")

{"headline": "Amazon protest expands in N.Y. as union hopes to disrupt holiday deliveries - The Washington Post", "combined_text": "NEW YORK Some Amazon workers led by the Teamsters union said they walked off the job in Staten Island early Saturday morning, joining the third day of a strike aimed at disrupting holiday deliveries \u2026 [+405 chars]", "sources": ["The Washington Post", "https://www.yahoo.com"]}
