In [None]:
!pip install newspaper3k
!pip install beautifulsoup4
!pip install lxml_html_clean


In [97]:
from dotenv import load_dotenv

load_dotenv(override=True)

True

### GET ALL ARTICLES

In [121]:
from newspaper import Article
from bs4 import BeautifulSoup
import requests
import re

from newspaper import Article

def extract_article_content(url):
    """
    Extract just the main article content from a news website.
    
    Args:
        url (str): URL of the news article
        
    Returns:
        dict: Article title, text content, and publish date
    """
    try:
        # Initialize Article object
        article = Article(url)
        article.download()
        article.parse()
        
        # Get the main content
        content = {
            'title': article.title,
            'text': article.text,
            'publish_date': article.publish_date,
        }
        
        # Clean the text content
        content['text'] = re.sub(r'\n+', '\n', content['text'])  # Remove extra newlines
        content['text'] = re.sub(r'\s+', ' ', content['text'])   # Remove extra whitespace
        
        return content['text']
        
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        return None

In [122]:
from openai import OpenAI
import json
from typing import List, Dict, Optional

        
client = OpenAI()

def validate_article(content: Dict) -> Dict:
    """
    Validate if the content is a real article using ChatGPT.
    
    Args:
        content (dict): Dictionary containing article content
        
    Returns:
        dict: Original content with validation results added
    """

    print(content)
    # Prepare the prompt
    system_prompt = """Analyze the following text and determine if it's a real article or just website notices (like cookies, privacy policy, etc.).
    You need to base yourself on the full content of the article.
    Respond with a JSON object containing:
    1. "is_article": boolean (true if it's a real article)
    2. "confidence": float (0-1)
    3. "reason": string (brief explanation)
    Only respond with the JSON object, no other text."""

    user_prompt = f"""Title: {content.get('title', 'No title')}
    Content: {content.get('full_content', '')}"""

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1  # Low temperature for more consistent results
        )
        
        # Parse the response
        validation_result = json.loads(response.choices[0].message.content)
        
        # Add validation results to the original content
        content.update({
            'is_valid_article': validation_result['is_article'],
            'validation_confidence': validation_result['confidence'],
            'validation_reason': validation_result['reason']
        })
        
    except Exception as e:
        content.update({
            'is_valid_article': False,
            'validation_confidence': 0.0,
            'validation_reason': f"Error during validation: {str(e)}"
        })
        
    return content

def batch_validate_articles(articles: List[Dict], 
                            confidence_threshold: float = 0.8) -> Dict[str, List[Dict]]:
    """
    Validate multiple articles and separate them into valid and invalid.
    
    Args:
        articles (list): List of article dictionaries
        confidence_threshold (float): Minimum confidence to consider valid
        
    Returns:
        dict: Contains 'valid_articles' and 'invalid_articles' lists
    """
    valid_articles = []
    invalid_articles = []
    
    for article in articles:
        print(article)
        validated_article = validate_article(article)
        
        if (validated_article['is_valid_article'] and 
            validated_article['validation_confidence'] >= confidence_threshold):
            valid_articles.append(validated_article)
        else:
            invalid_articles.append(validated_article)
            
    return {
        'valid_articles': valid_articles,
        'invalid_articles': invalid_articles
    }

#### RSS

In [133]:
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional


def scrape_rss_feed(url: str,limit: Optional[int] = None) -> List[Dict]:
    """
    Scrape RSS feed and extract links using jjina reader API
    
    Args:
        url (str): URL of the RSS feed
        
    Returns:
        List[Dict]: List of articles with their links and metadata
    """
    try:
        # Fetch the RSS feed
        response = requests.get(url)
        response.raise_for_status()
        
        # Parse XML content
        root = ET.fromstring(response.content)
        
        # Find all item elements
        items = root.findall(".//item")

        if limit is not None:
            items = items[:limit]
        
        # Extract article information
        articles = []
        for item in items:
            article = {
                'title': item.find('title').text if item.find('title') is not None else '',
                'link': item.find('link').text if item.find('link') is not None else '',
                #'full_content': jjina_reader(item.find('link').text) if item.find('link') is not None else '',
                'description': item.find('description').text if item.find('description') is not None else '',
                'pubDate': item.find('pubDate').text if item.find('pubDate') is not None else '',
                'creator': item.find('.//{http://purl.org/dc/elements/1.1/}creator').text 
                          if item.find('.//{http://purl.org/dc/elements/1.1/}creator') is not None else ''
            }
            articles.append(article)
            
        return articles
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching RSS feed: {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []

rss_url = "https://rss.app/feeds/MLuDKqkwFtd2tuMr.xml"
rss_articles = scrape_rss_feed(rss_url,limit=2)
for article in rss_articles:
    article_content = extract_article_content(article['link'])
    article['full_content'] = article_content
    article["type"] = "rss"

In [None]:
rss_results = batch_validate_articles(rss_articles)
    
# Print results
print(f"Found {len(rss_results['valid_articles'])} valid articles")
print(f"Found {len(rss_results['invalid_articles'])} invalid articles")

{'title': 'Das nächste Ampel-Projekt für Deutschland in Gefahr: Europas Hoffnungsträger ist insolvent', 'link': 'https://www.merkur.de/wirtschaft/das-naechste-ampel-projekt-fuer-deutschland-in-gefahr-europas-hoffnungstraeger-ist-insolvent-zr-93426508.html', 'description': '<div><img src="https://www.merkur.de/assets/images/36/277/36277472-northvolt-chef-peter-carlsson-tritt-zurueck-2Bfe.jpg" style="width: 100%;" /><div>Der Hersteller von Batterien aus Schweden, Northvolt, plant in Deutschland den Bau einer riesigen Fabrik. Jetzt ist das Unternehmen jedoch zahlungsunfähig. Welche Auswirkungen hat das auf das Bauprojekt?</div></div>', 'pubDate': 'Sat, 23 Nov 2024 09:11:00 GMT', 'creator': 'Von:', 'full_content': 'Das nächste Ampel-Projekt für Deutschland in Gefahr: Europas Hoffnungsträger ist insolvent Von: Amy Walker Drucken Teilen Der Hersteller von Batterien aus Schweden, Northvolt, plant in Deutschland den Bau einer riesigen Fabrik. Jetzt ist das Unternehmen jedoch zahlungsunfähig. W

#### BING NEWS

In [123]:
import requests
from datetime import datetime
import json
import os

def get_news(search_term=None, market='en-US', count=10):
    api_key = os.getenv("BING_API_KEY")
    """
    Get news using Bing News API
    
    Parameters:
    - api_key: Your Bing API key
    - search_term: What to search for (optional)
    - market: Market code (default 'en-US')
    - count: Number of results (default 10)
    """
    
    # Base endpoint for news search
    base_url = "https://api.bing.microsoft.com/v7.0/news"
    
    # If search term provided, use /search endpoint
    if search_term:
        base_url += "/search"
    
    # Request headers
    headers = {
        'Ocp-Apim-Subscription-Key': api_key,
        'Accept': 'application/json'
    }
    
    # Request parameters
    params = {
        'mkt': market,
        'count': count,
        'freshness': 'Day'  # Can be Day, Week, or Month
    }
    
    # Add search term if provided
    if search_term:
        params['q'] = search_term
    
    try:
        # Make the request
        response = requests.get(base_url, headers=headers, params=params)
        response.raise_for_status()  # Raise exception for bad status codes
        
        # Parse the JSON response
        news_data = response.json()
        
        # Extract and format the news articles
        articles = []
        for article in news_data.get('value', []):
            articles.append({
                'title': article.get('name'),
                'description': article.get('description'),
                'url': article.get('url'),
                'published': article.get('datePublished'),
                'source': article.get('provider', [{}])[0].get('name'),
                'category': article.get('category', 'Uncategorized')
            })
        
        return articles
        
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        return None

bing_news = get_news(search_term="electric vehicles")
if bing_news:
    print("\nAI News Search Results:")
    for article in bing_news:
        print(f"\nTitle: {article['title']}")
        print(f"Source: {article['source']}")
        print(f"Description: {article['description']}")
        print(f"URL: {article['url']}")


AI News Search Results:

Title: Hyundai, Kia recall over 208,000 electric vehicles to fix problem that can cause loss of power
Source: The Associated Press - Business News on MSN.com
Description: Hyundai and Kia are recalling over 208,000 electric vehicles to fix a pesky problem that can cause loss of drive power, increasing the risk of a crash
URL: https://www.msn.com/en-us/autos/news/hyundai-kia-recall-over-208-000-electric-vehicles-to-fix-problem-that-can-cause-loss-of-power/ar-AA1uzwzV

Title: How Utah's electric vehicle charging prices compare to the rest of the US
Source: KSL
Description: Utah's gas prices are a slightly higher than the national average, but new data shows it has some of the lowest electric vehicle recharging prices.
URL: https://www.ksl.com/article/51195478/how-utahs-electric-vehicle-charging-prices-compare-to-the-rest-of-the-us

Title: Hyundai, Kia Recall Over 208,000 Electric Vehicles Over Loss of Power Issue
Source: Newsweek on MSN.com
Description: Hyundai a

In [126]:
for article in bing_news:
    article_content = extract_article_content(article['url'])
    article['full_content'] = article_content
    article["type"] = "bing_news"

print(len(bing_news))
bing_news = [article for article in bing_news if article['full_content'] != '' and article['full_content'] is not None]

print(len(bing_news))

Error processing https://www.drivespark.com/best-electric-bikes/: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.drivespark.com/best-electric-bikes/ on URL https://www.drivespark.com/best-electric-bikes/
9
4


In [None]:
bing_news_results = batch_validate_articles(bing_news)
    
# Print results
print(f"Found {len(bing_news_results['valid_articles'])} valid articles")
print(f"Found {len(bing_news_results['invalid_articles'])} invalid articles")

""" # Print details of valid articles
for article in bing_news_results['valid_articles']:
    print(f"\nValid Article: {article['title']}")
    print(f"Confidence: {article['validation_confidence']}")
    print(f"Reason: {article['validation_reason']}") """

{'title': "How Utah's electric vehicle charging prices compare to the rest of the US", 'description': "Utah's gas prices are a slightly higher than the national average, but new data shows it has some of the lowest electric vehicle recharging prices.", 'url': 'https://www.ksl.com/article/51195478/how-utahs-electric-vehicle-charging-prices-compare-to-the-rest-of-the-us', 'published': '2024-11-22T13:36:00.0000000Z', 'source': 'KSL', 'category': 'Uncategorized', 'full_content': 'Estimated read time: 2-3 minutes SALT LAKE CITY — At $3.11 per gallon, Utah\'s average cost for regular gasoline is one of the more expensive in the U.S. a week before Thanksgiving, about a nickel higher than the national average, according to AAA. However, with an average of about $0.29 per kilowatt-hour, the organization says the Beehive State is home to some of the cheapest electric vehicle prices in the nation right now. AAA announced Thursday that it will now track electric vehicle charging costs as part of i

In [139]:
bing_news_results["valid_articles"]

[{'title': "How Utah's electric vehicle charging prices compare to the rest of the US",
  'description': "Utah's gas prices are a slightly higher than the national average, but new data shows it has some of the lowest electric vehicle recharging prices.",
  'url': 'https://www.ksl.com/article/51195478/how-utahs-electric-vehicle-charging-prices-compare-to-the-rest-of-the-us',
  'published': '2024-11-22T13:36:00.0000000Z',
  'source': 'KSL',
  'category': 'Uncategorized',
  'full_content': 'Estimated read time: 2-3 minutes SALT LAKE CITY — At $3.11 per gallon, Utah\'s average cost for regular gasoline is one of the more expensive in the U.S. a week before Thanksgiving, about a nickel higher than the national average, according to AAA. However, with an average of about $0.29 per kilowatt-hour, the organization says the Beehive State is home to some of the cheapest electric vehicle prices in the nation right now. AAA announced Thursday that it will now track electric vehicle charging costs

#### NEWSPAPER API

In [92]:
import os
import requests
from datetime import datetime
from typing import List, Dict

def fetch_news(query="electric vehicles", n=3) -> List[Dict]:
    api_key = os.getenv("NEWS_API_KEY")
    base_url = "https://newsapi.org/v2/everything"
    
    params = {
        "q": query,
        "language": "en",
        "sortBy": "popularity",
        "pageSize": n
    }
    
    headers = {
        "X-Api-Key": api_key
    }
    
    try:
        response = requests.get(base_url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        
        if data["status"] == "ok":
            print(f"Found {data['totalResults']} articles\n")
            for article in data["articles"]:
                print(f"Title: {article['title']}")
                print(f"Url: {article['url']}")
        
        else:
            print("No articles found")

        return data["articles"]
            
    except requests.exceptions.RequestException as e:
        print(f"Error fetching news: {e}")
        return []

In [93]:
newsapi_articles = fetch_news(n=10)
print(newsapi_articles)
# filter if title is [Removed]
newsapi_articles = [article for article in newsapi_articles if "[Removed]" not in article["title"]]
# these news are from the newsapi so add "type" key to each article

for article in newsapi_articles:
    print(article["source"])
    print(article["url"])
    article_content = extract_article_content(article['url'])
    article['full_content'] = article_content
    article["type"] = "newsapi"

Found 4606 articles

Title: Kia unveils two new EV designs, including a nifty modular van
Url: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_555241a9-ca46-4418-917e-7c1b84787db0
Title: [Removed]
Url: https://removed.com
Title: Ford's free replacement for its Telsa Supercharger adapter starts shipping today
Url: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_9cf53f52-6872-4c7a-923b-192cfa4e25f4
Title: [Removed]
Url: https://removed.com
Title: [Removed]
Url: https://removed.com
Title: The 40 Black Friday tech deals worth shopping from Amazon, Walmart, Apple, Anker and others
Url: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_b68b9c5c-07ab-4b44-bfe2-40dec39c65aa
Title: Trump Won. What Will Happen to Electric Vehicles?
Url: https://www.wired.com/story/trump-won-what-happens-to-electric-vehicles-now/
Title: This election will decide what kind of car you’ll buy
Url: https://www.theverge.com/24279434/presidential-election-trump-bid

KeyboardInterrupt: 

#### AGGREGATE BING NEWS API AND RSS

In [140]:
def parse_date(date_str: str) -> datetime:
    """
    Parse different date formats into datetime object
    """
    try:
        # Try ISO format (from Bing News API)
        date_obj = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        return date_obj.strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, AttributeError):
        try:
            # Try RSS format
            date_obj = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S %Z')
            return date_obj.strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, AttributeError):
            return None
        
def normalize_article(article: Dict, source: str) -> Dict:
    """
    Normalize article data from different sources into a consistent format
    
    Args:
        article (Dict): Raw article data
        source (str): Source of the article ('newsapi' or 'rss')
        
    Returns:
        Dict: Normalized article data
    """
    if source == 'bing':
        return {
            'title': article.get('title'),
            'description': article.get('description'),
            'url': article.get('url'),
            'image_url': None,  # Bing News API doesn't provide image URL in basic response
            'published_date': parse_date(article.get('published')),
            'source': article.get('source'),
            'author': None,  # Bing News API doesn't provide author in basic response
            'content': article.get('full_content'),
            'category': article.get('category'),
            'data_source': 'bing'
        }
    elif source == 'rss':
        return {
            'title': article.get('title'),
            'description': article.get('description'),
            'url': article.get('link'),
            'image_url': None,
            'published_date': parse_date(article.get('pubDate')),
            'source': article.get('creator'),
            'author': article.get('author'),
            'content': article.get('full_content'),
            'category': None,  # RSS typically doesn't include category
            'data_source': 'rss'
        }
    
    return None

# When processing your articles:
normalized_articles = []

# For NewsAPI articles
for article in bing_news_results["valid_articles"]:
    normalized = normalize_article(article, 'bing')
    if normalized:
        normalized_articles.append(normalized)

# For RSS articles
for article in rss_results['valid_articles']:
    normalized = normalize_article(article, 'rss')
    if normalized:
        normalized_articles.append(normalized)

In [149]:
import pandas as pd
df = pd.DataFrame(normalized_articles)

# Sort by date if needed
df = df.sort_values('published_date', ascending=False).reset_index(drop=True)
df.to_csv("data/news_articles.csv", index=False)

df

In [147]:
df

Unnamed: 0,title,description,url,image_url,published_date,source,author,content,category,data_source
0,Das nächste Ampel-Projekt für Deutschland in G...,"<div><img src=""https://www.merkur.de/assets/im...",https://www.merkur.de/wirtschaft/das-naechste-...,,2024-11-23 09:11:00,Von:,,Das nächste Ampel-Projekt für Deutschland in G...,,rss
1,"Hyundai, Kia recall more than 208K electric ve...","(AP Photo/Gene J. Puskar, File) DETROIT (AP) —...",https://www.ironmountaindailynews.com/news/202...,,2024-11-23 00:00:00,The Iron Mountain Daily News,,DETROIT (AP) — Hyundai and Kia are recalling o...,Uncategorized,bing
2,Ferrari has hydrogen and Lamborghini is full e...,"It’s been a well-kept secret, but it seems the...",https://www.riazor.org/news/hydrogen-lamborghi...,,2024-11-22 22:03:00,riazor,,"It’s been a well-kept secret, but it seems the...",Uncategorized,bing
3,Best Auto Loans and Financing of December 2024,"A car is an expensive purchase, but choosing t...",https://www.cnbc.com/select/best-car-loans/,,2024-11-22 18:36:00,CNBC,,Compare offers to find the best auto loan Best...,Uncategorized,bing
4,How Utah's electric vehicle charging prices co...,Utah's gas prices are a slightly higher than t...,https://www.ksl.com/article/51195478/how-utahs...,,2024-11-22 13:36:00,KSL,,Estimated read time: 2-3 minutes SALT LAKE CIT...,Uncategorized,bing


In [1]:
from news_collector.bingnews import get_bing_news
bing_news = get_bing_news()

Fetching Bing news for 'electric vehicles'... for the market de-DE, cc DE, sortBy Relevance
{'title': 'Is the Philippines EV car market ready for takeoff?', 'description': "For the first time in the Philippines, an electric vehicle is being sold via live selling. It's just one of the reasons why the EV car market may soon finally take off here.", 'url': 'https://www.rappler.com/business/philippines-electric-vehicle-car-market-ready-takeoff/', 'published': '2024-11-23T10:18:00.0000000Z', 'source': 'Rappler', 'category': 'Business', 'full_content': "This is AI generated summarization, which may have errors. For context, always refer to the full article. For the first time in the Philippines, an electric vehicle is being sold via live selling. It's just one of the reasons why the EV car market may soon finally take off here. MANILA, Philippines – For the first time in Philippine history, Vietnamese cars are now being sold in the country, a sign of how our once-backward, war-torn neighbor 

In [2]:
from news_collector.rss import get_rss_articles

rss_news = get_rss_articles()

Fetching RSS feed from https://rss.app/feeds/MLuDKqkwFtd2tuMr.xml...
{'title': 'Sommerreifen im Winter? Diese Strafen drohen Autofahrern', 'link': 'https://www.stern.de/auto/sommerreifen-im-winter--das-muessen-autofahrer-jetzt-wissen--7786260.html', 'description': '<div><img src="https://image.stern.de/7786256/t/kC/v4/w1440/r1.7778/-/autoreifen-im-winter.jpg" style="width: 100%;" /><div>\u200bWinterreifen sorgen bei Kälte, Eis und Schnee für mehr Sicherheit auf den Straßen. Doch was passiert, wenn man im Winter mit Sommerreifen unterwegs ist?</div></div>', 'pubDate': 'Sat, 23 Nov 2024 12:08:00 GMT', 'creator': 'STERN.de', 'full_content': 'Winterreifen sorgen bei Kälte, Eis und Schnee für mehr Sicherheit auf den Straßen. Doch was passiert, wenn man im Winter mit Sommerreifen unterwegs ist? Grundsätzlich besteht in Deutschland keine Winterreifenpflicht. Jedoch ist die Nutzung von Sommerreifen bei winterlichen Straßenverhältnissen verboten – das gilt auch für im Ausland zugelassene Fahrze

In [2]:
import pandas as pd
df = pd.concat([pd.DataFrame(bing_news), pd.DataFrame(rss_news)], ignore_index=True)
# Sort by date if needed
df = df.sort_values('published_date', ascending=False).reset_index(drop=True)
df

NameError: name 'rss_news' is not defined