<a href="https://colab.research.google.com/github/KiranManecode/MLProjects/blob/main/web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install feedparser langdetect

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: langdetect, sgmllib3k
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=02a79278ed3312861f612af0b55e4c4e5f7b1c8315cb1416ba1f381b7465f289
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca

In [7]:
# ------------------------- Imports -------------------------

import sqlite3               # For SQLite database
from datetime import datetime, timezone  # For timestamping articles (UTC)

import feedparser            # For parsing RSS feeds
from langdetect import detect, LangDetectException  # For language detection

# ------------------------- RSS Feed Configuration -------------------------

# List of international news sources with RSS feed URLs
RSS_FEEDS = [
    {"country": "USA", "source": "CNN", "url": "http://rss.cnn.com/rss/edition.rss"},
    {"country": "UK", "source": "BBC", "url": "http://feeds.bbci.co.uk/news/rss.xml"},
    {"country": "Japan", "source": "NHK", "url": "https://www3.nhk.or.jp/rss/news/cat0.xml"},
    {"country": "India", "source": "Times of India", "url": "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms"},
    {"country": "Canada", "source": "The Globe And Mail", "url": "https://www.theglobeandmail.com/rss/canada/"},
    {"country": "Australia", "source": "ABC News", "url": "https://www.abc.net.au/news/feed/51120/rss.xml"},
    {"country": "Germany", "source": "DW News", "url": "https://rss.dw.com/rdf/rss-en-all"},
    {"country": "France", "source": "France24", "url": "https://www.france24.com/en/rss"},
    {"country": "Russia", "source": "RT", "url": "https://www.rt.com/rss/news/"},
    {"country": "China", "source": "Xinhua", "url": "http://www.xinhuanet.com/english/rss/worldrss.xml"},
    {"country": "Singapore", "source": "CNA", "url": "https://www.channelnewsasia.com/rssfeeds"},
    {"country": "Malaysia", "source": "The Star", "url": "https://www.thestar.com.my/rss/editors-choice"},
    {"country": "South Korea", "source": "Korea Times", "url": "https://www.koreatimes.co.kr/www/rss/rss.xml"},
    {"country": "Brazil", "source": "Globo", "url": "https://g1.globo.com/rss/g1/"},
    {"country": "South Africa", "source": "News24", "url": "https://www.news24.com/rss"},
    {"country": "UAE", "source": "Gulf News", "url": "https://gulfnews.com/rss?generator=true"},
    {"country": "Qatar", "source": "Al Jazeera", "url": "https://www.aljazeera.com/xml/rss/all.xml"},
    {"country": "Italy", "source": "ANSA", "url": "https://www.ansa.it/sito/ansait_rss.xml"},
    {"country": "Spain", "source": "El País", "url": "https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada"},
    {"country": "Indonesia", "source": "Jakarta Post", "url": "https://www.thejakartapost.com/rss"}
]

# SQLite database file name
DB_FILE = "news_articles.db"

# ------------------------- Language Detection -------------------------

def detect_language(text):
    """
    Detect the language of the given text using langdetect.

    Args:
        text: The text to classify (the caller passes an article title or
            summary).

    Returns:
        The language code returned by langdetect's detect(), or 'unknown'
        when text is not a string, is empty or whitespace-only, or detection
        fails.
    """
    # Blank text has nothing to classify, and non-string values (e.g. None)
    # are not valid input for detect() and would not be covered by the
    # LangDetectException handler below, so short-circuit both cases.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# ------------------------- RSS Feed Parsing -------------------------

def parse_feed(feed_info):
    """
    Parse a single RSS feed and return a list of articles with metadata.

    Args:
        feed_info: Dict with "source", "country" and "url" keys (one item of
            RSS_FEEDS).

    Returns:
        A list of dicts with the keys title, published, source, country,
        summary, url, fetched_at and language. Entries without a link are
        skipped. An empty list is returned when the feed yields nothing or
        any error occurs; the problem is printed rather than raised.
    """
    print(f"Fetching: {feed_info['source']} ({feed_info['country']})")
    try:
        feed = feedparser.parse(feed_info["url"])

        # feedparser flags download/parse problems via "bozo" instead of
        # raising, so without this check a dead feed is indistinguishable
        # from an empty one. Feeds that are flagged but still produced
        # entries are processed as usual.
        if getattr(feed, "bozo", False) and not feed.entries:
            reason = getattr(feed, "bozo_exception", "unknown error")
            print(f"Warning: no entries from {feed_info['url']}: {reason}")
            return []

        # One timestamp per fetch. Kept as a naive UTC ISO string, i.e. the
        # same format datetime.utcnow().isoformat() produced, without using
        # the deprecated utcnow().
        fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        articles = []
        for entry in feed.entries:
            url = _entry_text(entry, "link")
            if not url:
                # The articles table de-duplicates on url (UNIQUE column with
                # INSERT OR IGNORE); link-less entries would all collide on
                # the empty string, so they cannot be stored meaningfully.
                continue
            title = _entry_text(entry, "title")
            summary = _entry_text(entry, "summary")
            articles.append({
                "title": title,
                "published": _entry_text(entry, "published"),
                "source": feed_info["source"],
                "country": feed_info["country"],
                "summary": summary,
                "url": url,
                "fetched_at": fetched_at,
                # Prefer the title; fall back to the summary when it is empty.
                "language": detect_language(title or summary),
            })
        return articles
    except Exception as e:
        # Best effort: one broken feed must not abort the whole run.
        print(f"Error parsing {feed_info['url']}: {e}")
        return []


def _entry_text(entry, key):
    """Return entry[key] stripped of whitespace, or "" if missing or not a string."""
    value = entry.get(key)
    return value.strip() if isinstance(value, str) else ""

# ------------------------- Database Initialization -------------------------

def init_db(conn):
    """
    Make sure the `articles` table exists on the given SQLite connection.

    The statement uses IF NOT EXISTS, so calling this on a database that
    already has the table leaves it untouched. The change is committed
    before returning.
    """
    create_articles_sql = '''
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            published TEXT,
            source TEXT,
            country TEXT,
            summary TEXT,
            url TEXT UNIQUE,
            fetched_at TEXT,
            language TEXT
        )
    '''
    conn.execute(create_articles_sql)
    conn.commit()

# ------------------------- Save Articles to Database -------------------------

def save_to_db(articles, conn):
    """
    Write the collected articles to the `articles` table.

    Rows are added with INSERT OR IGNORE, so an article that conflicts with
    an existing row is skipped. A failure on one article is printed and the
    loop moves on to the next. Everything is committed once at the end and
    the number of rows actually inserted is printed.
    """
    columns = (
        "title", "published", "source", "country",
        "summary", "url", "fetched_at", "language",
    )
    insert_sql = '''
                INSERT OR IGNORE INTO articles
                (title, published, source, country, summary, url, fetched_at, language)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            '''
    cur = conn.cursor()
    inserted = 0
    for item in articles:
        try:
            row = tuple(item[name] for name in columns)
            cur.execute(insert_sql, row)
            # rowcount is 0 when the insert was ignored
            inserted += cur.rowcount
        except Exception as err:
            print(f"Error saving article: {err}")
    conn.commit()
    print(f"Inserted {inserted} new articles.")

# ------------------------- Main Script Entry Point -------------------------

def main():
    """
    Main function to fetch all RSS feeds, parse articles, and save them to the database.

    Opens DB_FILE, makes sure the articles table exists, collects the
    articles of every feed in RSS_FEEDS and stores them in one batch. The
    connection is closed even if a step raises; the exception then
    propagates to the caller.
    """
    print("Starting news RSS scraping with SQLite + language detection...\n")

    # Connect to the SQLite database (creates file if not exists)
    conn = sqlite3.connect(DB_FILE)
    try:
        # Create the articles table
        init_db(conn)

        # Parse all RSS feeds and collect articles
        all_articles = []
        for feed in RSS_FEEDS:
            all_articles.extend(parse_feed(feed))

        # Save the articles into the database
        save_to_db(all_articles, conn)
    finally:
        # Always release the connection, including on failure
        conn.close()

    print("\nAll done!")

# ------------------------- Run the Script -------------------------

# In a notebook kernel __name__ is "__main__", so executing this cell starts
# the scrape. The guard keeps main() from running if this code is imported as
# a module instead.
if __name__ == "__main__":
    main()


Starting news RSS scraping with SQLite + language detection...

Fetching: CNN (USA)
Fetching: BBC (UK)
Fetching: NHK (Japan)
Fetching: Times of India (India)
Fetching: The Globe And Mail (Canada)
Fetching: ABC News (Australia)
Fetching: DW News (Germany)
Fetching: France24 (France)
Fetching: RT (Russia)
Fetching: Xinhua (China)
Fetching: CNA (Singapore)
Fetching: The Star (Malaysia)
Fetching: Korea Times (South Korea)
Fetching: Globo (Brazil)
Fetching: News24 (South Africa)
Fetching: Gulf News (UAE)
Fetching: Al Jazeera (Qatar)
Fetching: ANSA (Italy)
Fetching: El País (Spain)
Fetching: Jakarta Post (Indonesia)
Inserted 0 new articles.

All done!


In [8]:
# Offer the database file as a browser download. The google.colab package is
# only importable on Colab runtimes, so report where the file is instead of
# failing with ImportError elsewhere.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; news_articles.db is in the working directory.")
else:
    files.download("news_articles.db")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import pandas as pd

# Load all articles into a pandas DataFrame. DB_FILE is the same file main()
# wrote to; try/finally closes the connection even if the query fails
# (e.g. the table does not exist yet).
conn = sqlite3.connect(DB_FILE)
try:
    df = pd.read_sql_query("SELECT * FROM articles", conn)
finally:
    conn.close()

# Show the first 10 rows. A bare last expression uses the notebook's rich
# table rendering instead of wrapped plain text.
df.head(10)

   id                                              title  \
0   1        Trump pleads not guilty to 34 felony counts   
1   2  Haberman reveals why Trump attacked judge and ...   
2   3  What to know about the Trump indictment on the...   
3   4  READ: Trump indictment related to hush money p...   
4   5  Russian authorities detain suspect over St. Pe...   
5   6  Video shows moment of deadly explosion at cafe...   
6   7  Wall Street Journal reporter Evan Gershkovich ...   
7   8  Suspected Chinese spy balloon was able to tran...   
8   9  Beijing promised to 'fight back' over Taiwan l...   
9  10  'Scary, cold, hungry and lonely': Volunteer so...   

                       published source country  \
0                                   CNN     USA   
1  Wed, 05 Apr 2023 13:30:09 GMT    CNN     USA   
2  Wed, 05 Apr 2023 22:50:06 GMT    CNN     USA   
3                                   CNN     USA   
4                                   CNN     USA   
5  Tue, 04 Apr 2023 01:08:44 GMT 

In [11]:
import sqlite3
import pandas as pd

# Connect to the SQLite database and query it; the connection is closed even
# if the query raises.
conn = sqlite3.connect("news_articles.db")
try:
    df = pd.read_sql_query("SELECT * FROM articles", conn)
finally:
    conn.close()

# Print the preview as a clean table. option_context applies the display
# settings to this block only, so later cells keep pandas' defaults.
with pd.option_context(
    "display.max_rows", 100,               # Adjust max rows shown
    "display.max_columns", None,           # Show all columns
    "display.width", None,                 # Don't truncate wide tables
    "display.colheader_justify", "left",
):
    print(df.head(10))  # Or print(df.to_string()) for full display


   id title                                               \
0   1        Trump pleads not guilty to 34 felony counts   
1   2  Haberman reveals why Trump attacked judge and ...   
2   3  What to know about the Trump indictment on the...   
3   4  READ: Trump indictment related to hush money p...   
4   5  Russian authorities detain suspect over St. Pe...   
5   6  Video shows moment of deadly explosion at cafe...   
6   7  Wall Street Journal reporter Evan Gershkovich ...   
7   8  Suspected Chinese spy balloon was able to tran...   
8   9  Beijing promised to 'fight back' over Taiwan l...   
9  10  'Scary, cold, hungry and lonely': Volunteer so...   

  published                      source country  \
0                                 CNN    USA      
1  Wed, 05 Apr 2023 13:30:09 GMT  CNN    USA      
2  Wed, 05 Apr 2023 22:50:06 GMT  CNN    USA      
3                                 CNN    USA      
4                                 CNN    USA      
5  Tue, 04 Apr 2023 01:08:44 GMT 

In [12]:
pip install tabulate



In [14]:
from tabulate import tabulate

# Render up to the first 700 rows of df as a grid-style text table, using the
# DataFrame's column names as headers.
preview_rows = df.head(700)
grid_text = tabulate(preview_rows, headers="keys", tablefmt="grid")
print(grid_text)

+-----+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+----------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------