# News Headlines Data Collection

This notebook demonstrates how to collect news headlines from RSS feeds for the topic classification project.

**Important**: Always respect website terms of service and implement appropriate delays between requests to avoid overloading servers.

In [None]:
import os
import time
import requests
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import xml.etree.ElementTree as ET
from urllib.parse import urljoin
import feedparser
import warnings
warnings.filterwarnings('ignore')

# Add project src to path
# NOTE(review): '../src' is relative, so it resolves against the current
# working directory — presumably the notebook is run from a directory that is
# a sibling of src/; confirm.
import sys
sys.path.append('../src')

# Project-local helpers, importable because of the path entry added above
from utils import clean_text, setup_logging

# Logger used by the data collection functions defined in this notebook
logger = setup_logging()

## RSS Feed URLs

Define RSS feed URLs for different news sources and topics. These are publicly available RSS feeds that can be used for educational purposes.

In [None]:
# Feed catalogue: source name -> {topic name -> RSS feed URL}
# NOTE(review): the feeds.reuters.com endpoints may no longer be served —
# verify each URL before relying on it.
RSS_FEEDS = {
    'bbc': {
        'politics': 'http://feeds.bbci.co.uk/news/politics/rss.xml',
        'technology': 'http://feeds.bbci.co.uk/news/technology/rss.xml',
        'business': 'http://feeds.bbci.co.uk/news/business/rss.xml',
        'sport': 'http://feeds.bbci.co.uk/sport/rss.xml',
        'health': 'http://feeds.bbci.co.uk/news/health/rss.xml'
    },
    'reuters': {
        'politics': 'https://feeds.reuters.com/reuters/politicsNews',
        'technology': 'https://feeds.reuters.com/reuters/technologyNews',
        'business': 'https://feeds.reuters.com/reuters/businessNews',
        'sport': 'https://feeds.reuters.com/reuters/sportsNews',
        'health': 'https://feeds.reuters.com/reuters/healthNews'
    }
}

# Print one header per source followed by its topic/URL pairs
print("Available RSS Feeds:")
for feed_source, feed_topics in RSS_FEEDS.items():
    topic_lines = [f"  {name}: {feed_url}" for name, feed_url in feed_topics.items()]
    print(f"\n{feed_source.upper()}:", *topic_lines, sep="\n")

## Data Collection Functions

In [None]:
def fetch_rss_feed(url: str, timeout: int = 10) -> Optional[feedparser.FeedParserDict]:
    """
    Fetch and parse an RSS feed.

    The feed is downloaded with ``requests`` so that the identifying
    User-Agent header and the ``timeout`` are actually applied to the HTTP
    request; the response body is then handed to ``feedparser`` for parsing.

    Args:
        url (str): RSS feed URL.
        timeout (int): Request timeout in seconds.

    Returns:
        Optional[feedparser.FeedParserDict]: Parsed feed, or None if the
        download failed (network error, timeout, HTTP error status) or
        parsing raised.
    """
    # Identify this client to the feed operator
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; NewsClassifier/1.0; Educational Use)'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of parsing an error page
        response.raise_for_status()

        # Parse the downloaded bytes rather than letting feedparser re-fetch the URL
        feed = feedparser.parse(response.content)

        # bozo marks a malformed feed; the result is still returned, so only warn
        if feed.bozo:
            logger.warning(f"Warning: Feed parsing issues for {url}: {feed.bozo_exception}")

        return feed

    except Exception as e:
        # Boundary handler: callers check for None instead of catching exceptions
        logger.error(f"Error fetching feed {url}: {str(e)}")
        return None


def extract_headlines_from_feed(
    feed: feedparser.FeedParserDict,
    topic: str,
    source: str,
    max_items: Optional[int] = None
) -> List[Dict[str, str]]:
    """
    Extract headlines from a parsed RSS feed.

    Entries without a title, or whose cleaned title is 5 characters or
    shorter, are skipped.

    Args:
        feed (feedparser.FeedParserDict): Parsed RSS feed.
        topic (str): Topic category stored with every headline.
        source (str): News source name stored with every headline.
        max_items (int, optional): Maximum number of feed entries to examine.
            None means no limit; 0 or a negative value yields no entries.

    Returns:
        List[Dict[str, str]]: One dict per kept headline with the keys
        'headline', 'cleaned_headline', 'topic', 'source', 'description',
        'link', 'published' and 'collected_at'.
    """
    headlines = []

    # Compare against None explicitly: a truthiness test would treat
    # max_items=0 as "no limit". Clamp negatives so the slice cannot
    # silently drop entries from the end of the list instead.
    if max_items is None:
        items = feed.entries
    else:
        items = feed.entries[:max(max_items, 0)]

    for entry in items:
        # Extract headline (title); "or ''" guards against an explicit None value
        headline = (entry.get('title') or '').strip()

        if not headline:
            continue

        # Extract additional information
        description = (entry.get('summary') or '').strip()
        link = entry.get('link', '')
        published = entry.get('published', '')

        # Clean headline
        cleaned_headline = clean_text(headline)

        if len(cleaned_headline) > 5:  # Filter out very short headlines
            headlines.append({
                'headline': headline,
                'cleaned_headline': cleaned_headline,
                'topic': topic,
                'source': source,
                'description': description,
                'link': link,
                'published': published,
                'collected_at': datetime.now().isoformat()
            })

    return headlines


def collect_headlines_from_feeds(
    rss_feeds: Dict[str, Dict[str, str]],
    max_per_topic: int = 100,
    delay_between_requests: float = 1.0
) -> pd.DataFrame:
    """
    Collect headlines from multiple RSS feeds.

    Feeds are fetched one at a time. A pause of ``delay_between_requests``
    seconds is inserted between every pair of consecutive requests, including
    after a failed fetch; there is no pause before the first request or after
    the last one.

    Args:
        rss_feeds (Dict[str, Dict[str, str]]): Mapping of source name to a
            mapping of topic name to RSS feed URL.
        max_per_topic (int): Maximum headlines per topic and source.
        delay_between_requests (float): Delay between requests in seconds.

    Returns:
        pd.DataFrame: DataFrame containing collected headlines. When nothing
        was collected the frame is empty but still has the headline columns.
    """
    # Column layout of the dicts built by extract_headlines_from_feed; used so
    # an empty result still exposes the columns later cells index into.
    expected_columns = [
        'headline', 'cleaned_headline', 'topic', 'source',
        'description', 'link', 'published', 'collected_at'
    ]

    all_headlines = []
    is_first_request = True

    for source, topics in rss_feeds.items():
        logger.info(f"Collecting headlines from {source.upper()}")

        for topic, url in topics.items():
            # Be respectful - pause before every request except the first, so
            # the delay also applies after a failed fetch
            if not is_first_request:
                time.sleep(delay_between_requests)
            is_first_request = False

            logger.info(f"  Fetching {topic} headlines...")

            # Fetch feed
            feed = fetch_rss_feed(url)

            if feed is None:
                logger.warning(f"  Failed to fetch {topic} from {source}")
                continue

            # Extract headlines
            headlines = extract_headlines_from_feed(
                feed, topic, source, max_items=max_per_topic
            )

            all_headlines.extend(headlines)
            logger.info(f"  Collected {len(headlines)} {topic} headlines from {source}")

    # Create DataFrame; fall back to an explicitly-columned empty frame
    if all_headlines:
        df = pd.DataFrame(all_headlines)
    else:
        df = pd.DataFrame(columns=expected_columns)

    logger.info(f"Total headlines collected: {len(df)}")

    return df


# Confirmation message only; the functions themselves are exercised in later cells
print("Functions defined successfully!")

## Test RSS Feed Access

Let's test accessing one RSS feed to make sure everything works.

In [None]:
# Smoke test: fetch a single feed (BBC Technology) and show what came back
test_url = RSS_FEEDS['bbc']['technology']
print(f"Testing RSS feed: {test_url}")

test_feed = fetch_rss_feed(test_url)

if not test_feed:
    print("Failed to fetch test feed")
else:
    # Feed-level metadata
    feed_meta = test_feed.feed
    print(f"\nFeed title: {feed_meta.get('title', 'N/A')}")
    print(f"Feed description: {feed_meta.get('description', 'N/A')}")
    print(f"Number of entries: {len(test_feed.entries)}")

    # First five entry titles, numbered from 1
    print("\nSample headlines:")
    for position, item in enumerate(test_feed.entries[:5], start=1):
        print(f"{position}. {item.get('title', 'No title')}")

## Collect Full Dataset

Now let's collect headlines from all configured RSS feeds. 

**Note**: This will make multiple requests to news websites. To stay respectful of those servers:
- We include delays between requests
- We use appropriate user agent strings
- We only collect what we need for educational purposes

In [None]:
# Settings that drive the collection run below
CONFIG = {
    'max_headlines_per_topic': 150,  # Collect a bit more than needed
    'delay_between_requests': 2.0,   # 2 second delay between requests
    'target_topics': ['politics', 'technology', 'business', 'sport']  # Select 4 topics
}

# Echo the settings, one per line
print("Configuration:", *(f"  {key}: {value}" for key, value in CONFIG.items()), sep="\n")

# Rough duration estimate: one delay per (source, target topic) request;
# time spent on the requests themselves is not included
request_count = len(RSS_FEEDS) * len(CONFIG['target_topics'])
estimated_minutes = request_count * CONFIG['delay_between_requests'] / 60

print("\nStarting data collection...")
print(f"This will take approximately {estimated_minutes:.1f} minutes")

In [None]:
# Keep, for every source, only the feeds whose topic is listed in
# CONFIG['target_topics'] (in that list's order)
filtered_feeds = {
    source: {
        topic: topics[topic]
        for topic in CONFIG['target_topics']
        if topic in topics
    }
    for source, topics in RSS_FEEDS.items()
}

# Show what will actually be requested
print("Filtered RSS feeds:")
for source, topics in filtered_feeds.items():
    feed_lines = [f"  {topic}: {url}" for topic, url in topics.items()]
    print(f"\n{source.upper()}:", *feed_lines, sep="\n")

In [None]:
# Run the collection over the filtered feeds and time it
collection_kwargs = {
    'rss_feeds': filtered_feeds,
    'max_per_topic': CONFIG['max_headlines_per_topic'],
    'delay_between_requests': CONFIG['delay_between_requests'],
}

start_time = time.time()
df_collected = collect_headlines_from_feeds(**collection_kwargs)
end_time = time.time()

# Wall-clock duration of the whole run, including the inter-request delays
duration = end_time - start_time

print(f"\nData collection completed in {duration:.1f} seconds")
print(f"Total headlines collected: {len(df_collected)}")

## Data Analysis and Cleaning

In [None]:
# Summarise what was collected: size, columns, and per-topic/per-source counts
print("Dataset Overview:")
print(f"Total samples: {len(df_collected)}")
print(f"Columns: {df_collected.columns.tolist()}")

# Headline count for each topic
topic_counts = df_collected['topic'].value_counts()
print("\nSamples per topic:", topic_counts, sep="\n")

# Headline count for each source
source_counts = df_collected['source'].value_counts()
print("\nSamples per source:", source_counts, sep="\n")

# Topic (rows) by source (columns) contingency table
crosstab = pd.crosstab(df_collected['topic'], df_collected['source'])
print("\nTopic-Source distribution:", crosstab, sep="\n")

In [None]:
# Print the first three headlines of every topic, in order of first appearance
print("Sample headlines by topic:")
for topic in df_collected['topic'].unique():
    print(f"\n{topic.upper()}:")
    in_topic = df_collected['topic'] == topic
    sample_headlines = df_collected.loc[in_topic, 'headline'].head(3)
    for rank, text in enumerate(sample_headlines, start=1):
        print(f"  {rank}. {text}")

In [None]:
# Clean and prepare the dataset:
#   1. drop duplicate headlines, 2. drop headlines outside 3-50 words,
#   3. down-sample every topic to the same size.
print("Cleaning dataset...")

# Fail early with a clear message; otherwise an empty collection surfaces
# later as a KeyError or a NaN sample size
if df_collected.empty:
    raise ValueError(
        "No headlines were collected; check that the RSS feeds are reachable "
        "before cleaning."
    )

# Remove duplicates based on cleaned headline
initial_count = len(df_collected)
df_cleaned = df_collected.drop_duplicates(subset=['cleaned_headline'], keep='first')
print(f"Removed {initial_count - len(df_cleaned)} duplicate headlines")

# Remove very short or very long headlines (word count of the cleaned text,
# both bounds inclusive)
headline_lengths = df_cleaned['cleaned_headline'].str.split().str.len()
df_cleaned = df_cleaned[headline_lengths.between(3, 50)]
print(f"Removed headlines outside 3-50 word range. Remaining: {len(df_cleaned)}")

# With nothing left, value_counts().min() would be NaN and sampling would fail
if df_cleaned.empty:
    raise ValueError(
        "No headlines left after cleaning; nothing to balance."
    )

# Balance the dataset - take equal samples from each topic
min_samples = int(df_cleaned['topic'].value_counts().min())
target_samples_per_topic = min(min_samples, 500)  # Max 500 per topic

print(f"\nBalancing dataset to {target_samples_per_topic} samples per topic...")

# Sample each topic independently with a fixed seed so reruns are reproducible
balanced_dfs = [
    df_cleaned[df_cleaned['topic'] == topic].sample(
        n=target_samples_per_topic,
        random_state=42
    )
    for topic in df_cleaned['topic'].unique()
]

df_final = pd.concat(balanced_dfs, ignore_index=True)

print(f"Final dataset size: {len(df_final)}")
print("\nFinal topic distribution:")
print(df_final['topic'].value_counts())

## Save the Dataset

In [None]:
# Make sure both output directories exist
for output_dir in ('../data/raw', '../data/processed'):
    os.makedirs(output_dir, exist_ok=True)

# Output locations
raw_file = '../data/raw/headlines_raw.csv'
processed_file = '../data/processed/headlines.csv'
full_processed_file = '../data/processed/headlines_full.csv'

# Everything that was collected, before cleaning
df_collected.to_csv(raw_file, index=False)
print(f"Raw data saved to: {raw_file}")

# Training file: only the headline text and its topic label
df_training = df_final[['headline', 'topic']].copy()
df_training.to_csv(processed_file, index=False)
print(f"Processed data saved to: {processed_file}")

# Balanced dataset with all metadata columns retained
df_final.to_csv(full_processed_file, index=False)
print(f"Full processed data saved to: {full_processed_file}")

## Dataset Statistics

In [None]:
# Report summary statistics for the training dataset
print("Final Dataset Statistics:")
print("=" * 40)

topic_column = df_training['topic']
print(f"Total samples: {len(df_training)}")
print(f"Number of topics: {topic_column.nunique()}")
print(f"Topics: {sorted(topic_column.unique())}")

# Words per headline, counted on the original (uncleaned) headline text
headline_words = df_training['headline'].str.split().str.len()
mean_words = headline_words.mean()
median_words = headline_words.median()
fewest_words = headline_words.min()
most_words = headline_words.max()

print("\nHeadline length statistics:")
print(f"  Mean: {mean_words:.1f} words")
print(f"  Median: {median_words:.1f} words")
print(f"  Min: {fewest_words} words")
print(f"  Max: {most_words} words")

# Vocabulary estimate: distinct lowercase whitespace-separated tokens
# (punctuation is not stripped)
all_words = ' '.join(df_training['headline']).lower().split()
unique_words = set(all_words)
print(f"\nEstimated vocabulary size: {len(unique_words)} unique words")

print("\nDataset is ready for training!")
print("You can now run the training script with: python src/train.py")

## Alternative: Create Sample Data

If you cannot access RSS feeds, you can create sample data for testing:

In [None]:
# Uncomment and run this cell if you want to create sample data instead

# from utils import create_sample_data

# print("Creating sample data for testing...")
# create_sample_data(
#     output_path='../data/processed/headlines.csv',
#     num_samples_per_topic=200
# )
# print("Sample data created successfully!")