In [60]:
# Core libraries for scraping and data handling
import requests                     # Send HTTP requests to websites
from bs4 import BeautifulSoup       # Parse HTML and extract content
import pandas as pd                 # Create and manipulate data tables
from datetime import datetime       # Handle date/time formats

# Libraries for keyword extraction and NLP
from textblob import TextBlob       # For basic natural language processing
import nltk                         # Required by textblob and rake-nltk
from rake_nltk import Rake          # Extract key phrases from text

In [61]:
# --- REQUEST NPR NEWS PAGE ---

# NPR news homepage we'll scrape headlines from
url = "https://www.npr.org/sections/news/"

# Grab the raw HTML of the page
response = requests.get(url)

# Simple check to make sure the request actually worked
if response.status_code != 200:
    raise Exception(f"Failed to fetch NPR page: {response.status_code}")

# --- PARSE HTML ---

# Parse the page content so we can search through the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Pull out all article blocks (each should contain a headline)
articles = soup.find_all('article')

In [62]:
# --- EXTRACT DATA FROM EACH ARTICLE ---

headline_data = []

for article in articles:
    # Get the title text from the <h2> tag
    title_tag = article.find('h2', class_='title')
    title = title_tag.text.strip() if title_tag else None

    # Extract the article URL from the <a> tag inside the title
    link_tag = title_tag.find('a') if title_tag else None
    url = link_tag['href'] if link_tag else None

    # Try to grab the published date (not always present)
    date_tag = article.find('time')
    if date_tag and date_tag.has_attr('datetime'):
        date_published = date_tag['datetime']
    else:
        date_published = None

    # Skip articles that are missing key info
    if not title or not url:
        continue

    # Save the clean, structured data
    headline_data.append({
        'title': title,
        'url': url,
        'date_published': date_published
    })

In [63]:
# --- BUILD FINAL DATAFRAME ---

# Turn our list of scraped headlines into a structured DataFrame
df_npr = pd.DataFrame(headline_data)

# Take a quick look at the first few entries
print("üìã NPR News Headlines Preview:")
print(df_npr.head(10))

üìã NPR News Headlines Preview:
                                                                              title  \
0       This art exhibition is 'divisive' or 'eye-opening' ‚Äî it depends who you ask   
1                              Tesla's quarterly sales fall 13%. Experts blame Musk   
2  Legal scholar sees immigrant arrests as a 'struggle for the soul of the country'   
3                     See the moment Sen. Cory Booker broke Strom Thurmond's record   
4            5 takeaways from Tuesday's elections, including bad news for Elon Musk   
5        Federal judge drops corruption case against New York City Mayor Eric Adams   
6               How do 'torpedo bats' work? We asked baseball physicists to explain   
7    How reciprocal tariffs could affect you. And, Maryland man mistakenly deported   
8              Latest Alzheimer's lab tests focus on memory loss, not brain plaques   
9               Today is Trump's 'Liberation Day.' What does that mean for tariffs?   

       

In [64]:
# --- IMPORT FOR RSS PARSING ---
import feedparser  # Lightweight library for parsing RSS feeds

# BBC's RSS feed for top stories (you can swap in other sections too)
bbc_rss_url = "https://feeds.bbci.co.uk/news/rss.xml"

# Parse the feed into a usable Python object
feed = feedparser.parse(bbc_rss_url)


In [65]:
# --- PARSE RSS FEED INTO STRUCTURED DATA ---

headline_data_bbc = []

# Loop through each story in the BBC RSS feed
for entry in feed.entries:
    title = entry.title
    url = entry.link
    date_published = entry.published if 'published' in entry else None

    # Save the key info we care about
    headline_data_bbc.append({
        'title': title,
        'url': url,
        'date_published': date_published
    })

# Convert the list into a DataFrame so we can work with it
df_bbc = pd.DataFrame(headline_data_bbc)

In [66]:
# --- DISPLAY BBC HEADLINES PREVIEW ---

# Quick look at what we pulled from the BBC RSS feed
print("üìã BBC RSS Headlines Preview:")
print(df_bbc.head(10))

üìã BBC RSS Headlines Preview:
                                                                                     title  \
0  Inside Mandalay: BBC finds huge devastation and little help for Myanmar quake survivors   
1                                 Three big unknowns ahead of Trump's tariffs announcement   
2                         Survivor challenges Israeli account of attack on Gaza paramedics   
3                  Man charged with 64 offences as part of investigation into funeral home   
4                      Heathrow warned by airlines about power supply days before shutdown   
5                                  Women to continue having babies later in life, says ONS   
6                                         Hollywood remembers 'wonderful' actor Val Kilmer   
7                                              Tesla sales plunge after Elon Musk backlash   
8          Private school parents must pay fair share, court hearing VAT challenge is told   
9                   Muse can

In [67]:
# --- DEFINE CATEGORY KEYWORDS ---

# Simple keyword map to help classify each headline into a topic
category_keywords = {
    "politics": ["election", "biden", "trump", "congress", "senate", "vote", "president", "law", "policy"],
    "economy": ["economy", "inflation", "stock", "market", "finance", "bank", "job", "unemployment"],
    "sports": ["football", "nba", "mlb", "soccer", "olympics", "athlete", "game", "match", "team"],
    "technology": ["tech", "ai", "artificial intelligence", "robot", "software", "startup", "app", "cyber"],
    "world": ["russia", "ukraine", "china", "india", "global", "foreign", "diplomacy", "embassy", "war"],
    "health": ["covid", "vaccine", "hospital", "health", "medicine", "doctor", "disease", "virus"]
}

In [68]:
# --- FUNCTION TO CATEGORIZE HEADLINE BY KEYWORDS ---

def categorize_headline(title):
    title_lower = title.lower()

    # Check if any category keywords appear in the headline
    for category, keywords in category_keywords.items():
        if any(keyword in title_lower for keyword in keywords):
            return category

    # If nothing matches, drop it in the 'other' bucket
    return "other"

In [23]:
# --- APPLY TO NPR ---
df_npr['category'] = df_npr['title'].apply(categorize_headline)

# --- APPLY TO BBC ---
df_bbc['category'] = df_bbc['title'].apply(categorize_headline)

In [24]:
# --- PREVIEW CATEGORY RESULTS ---
print("üß† NPR Category Breakdown:")
print(df_npr['category'].value_counts())

print("\nüß† BBC Category Breakdown:")
print(df_bbc['category'].value_counts())

üß† NPR Category Breakdown:
category
other         9
technology    8
politics      6
world         1
Name: count, dtype: int64

üß† BBC Category Breakdown:
category
other         24
technology     5
politics       4
sports         2
world          2
Name: count, dtype: int64


In [69]:
# --- RAKE-BASED HEADLINE SUMMARIZER ---

def summarize_with_rake(df):
    summaries = {}
    rake = Rake()  # Pulls key phrases using built-in stopwords

    for category in df['category'].unique():
        # Grab all titles in this category
        headlines = df[df['category'] == category]['title'].tolist()
        if not headlines:
            continue

        # Merge all headlines into a single string for analysis
        combined_text = " ".join(headlines)

        # Run RAKE to extract top-ranked phrases
        rake.extract_keywords_from_text(combined_text)
        phrases = rake.get_ranked_phrases_with_scores()

        # Grab the top 3‚Äì5 keyword phrases
        top_phrases = [phrase for score, phrase in phrases[:5]]

        # Build the summary sentence
        summary = f"Top {category} headlines focus on: " + ", ".join(top_phrases) + "."
        summaries[category] = summary

    return summaries

In [53]:
print("üì° NPR RAKE Summary:")
npr_rake = summarize_with_rake(df_npr)
for cat, summary in npr_rake.items():
    print(f"\nüóÇÔ∏è {cat.upper()}:\nüìù {summary}")

print("\n\nüì° BBC RAKE Summary:")
bbc_rake = summarize_with_rake(df_bbc)
for cat, summary in bbc_rake.items():
    print(f"\nüóÇÔ∏è {cat.upper()}:\nüìù {summary}")

üì° NPR RAKE Summary:

üóÇÔ∏è OTHER:
üìù Top other headlines focus on: great jasmine mooney left ice detention, espionage revelations hhs layoffs hit meals, cory booker broke strom thurmond, ousted myanmar leader speaks, maryland man mistakenly deported.

üóÇÔ∏è POLITICS:
üìù Top politics headlines focus on: needed fema aid violates, flip wisconsin supreme court, elections giving republicans cause, danish prime minister heads, liberation day .'.

üóÇÔ∏è TECHNOLOGY:
üìù Top technology headlines focus on: new york city mayor eric adams, 65 gop retains two house seats, medicaid funds judge orders white house, federal judge drops corruption case, unaccompanied migrant children val kilmer.

üóÇÔ∏è WORLD:
üìù Top world headlines focus on: global electric vehicle boom, china.


üì° BBC RAKE Summary:

üóÇÔ∏è OTHER:
üìù Top other headlines focus on: myanmar quake survivors survivor challenges israeli account, tax charges us prosecutors seek death penalty, river thames muse cancel tu

In [57]:
# --- CONVERT SUMMARY DICTIONARY TO DATAFRAME ---
def format_summaries_to_df(source_name, summary_dict):
    return pd.DataFrame([
        {"source": source_name, "category": category, "summary": summary}
        for category, summary in summary_dict.items()
    ])

df_summary_npr = format_summaries_to_df("NPR", npr_rake)
df_summary_bbc = format_summaries_to_df("BBC", bbc_rake)
df_all_summaries = pd.concat([df_summary_npr, df_summary_bbc], ignore_index=True)

pd.set_option('display.max_colwidth', None)
df_all_summaries

Unnamed: 0,source,category,summary
0,NPR,other,"Top other headlines focus on: great jasmine mooney left ice detention, espionage revelations hhs layoffs hit meals, cory booker broke strom thurmond, ousted myanmar leader speaks, maryland man mistakenly deported."
1,NPR,politics,"Top politics headlines focus on: needed fema aid violates, flip wisconsin supreme court, elections giving republicans cause, danish prime minister heads, liberation day .'."
2,NPR,technology,"Top technology headlines focus on: new york city mayor eric adams, 65 gop retains two house seats, medicaid funds judge orders white house, federal judge drops corruption case, unaccompanied migrant children val kilmer."
3,NPR,world,"Top world headlines focus on: global electric vehicle boom, china."
4,BBC,other,"Top other headlines focus on: myanmar quake survivors survivor challenges israeli account, tax charges us prosecutors seek death penalty, river thames muse cancel turkish gig, postecoglou hopes pochettino gets spurs return, mangione au revoir marine le pen."
5,BBC,politics,"Top politics headlines focus on: three big unknowns ahead, reshape global economy, liberation day, tariffs announcement, tariffs analysis."
6,BBC,technology,"Top technology headlines focus on: uk awaits tariff decision bbc news app, shutdown chris mason, power supply days, jones says heart, club scene changing."
7,BBC,sports,"Top sports headlines focus on: roblox nintendo announces switch 2 release date, parents allowed, block children, new games, games."
8,BBC,world,"Top world headlines focus on: chinese victims tell bbc, stop people touching, serial rapist dublin, molly malone statue, get stewards."


In [59]:
# --- SAVE CSV FOR DATA ANALYSIS USERS ---
df_all_summaries.to_csv("news_summaries.csv", index=False)
print("‚úÖ news_summaries.csv saved in project root.")

# Optional: enable download if in Colab
try:
    from google.colab import files
    files.download("news_summaries.csv")
except:
    pass  # Ignore if not in Colab


‚úÖ news_summaries.csv saved in project root.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>