In [1]:
%pip install nltk googlenews newspaper3k requests lxml_html_clean 
%pip install lxml beautifulsoup4

Collecting googlenews
  Downloading GoogleNews-1.6.15-py3-none-any.whl.metadata (4.5 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting dateparser (from googlenews)
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━

In [2]:
import logging
import csv
import re
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime
import nltk
from GoogleNews import GoogleNews
from newspaper import Article, Config

# The next cell verifies that the NLTK 'punkt' tokenizer data is installed.
# It is deliberately not downloaded at runtime: install it during environment setup with nltk.download('punkt').


In [3]:

# Fail fast when the NLTK 'punkt' tokenizer data is not installed, instead of
# failing later in the middle of a scraping run.
# (presumably required by newspaper's Article.nlp() — confirm against the
# installed newspaper/NLTK versions)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    raise LookupError("The NLTK 'punkt' tokenizer is missing. Please install it using nltk.download('punkt') during setup.")

# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() silently does nothing when a handler exists, which
# is the case when this cell is re-run in the same kernel or when the hosting
# environment pre-installs one — the format/level below would then be ignored.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(name)s: %(message)s',
    level=logging.DEBUG,
    force=True,
)
logger = logging.getLogger('NewsScraper')

# Browser-like User-Agent string shared by the requests session and newspaper.
USER_AGENT = ' '.join((
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)',
    'AppleWebKit/537.36 (KHTML, like Gecko)',
    'Chrome/50.0.2661.102 Safari/537.36',
))

# Shared HTTP session: every request carries USER_AGENT and goes through an
# adapter that retries idempotent methods on rate-limit / server-error codes.
session = requests.Session()
session.headers['User-Agent'] = USER_AGENT

retry_strategy = Retry(
    total=3,                                      # at most three retries per request
    backoff_factor=0.5,                           # controls the delay between attempts
    status_forcelist=[429, 500, 502, 503, 504],   # HTTP statuses that trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"],   # only retry methods without side effects
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount('http://', adapter)
session.mount('https://', adapter)

# newspaper configuration: same User-Agent, 10 second request timeout.
np_config = Config()
np_config.request_timeout = 10
np_config.browser_user_agent = USER_AGENT

# Tunable constants used by the scraper cells.
DEFAULT_MAX_PAGES = 5            # page count used when the configured value is not numeric
CSV_MAX_SIZE = 5 * 1024 ** 2     # on-disk size in bytes (5 MiB) at which the CSV writer rotates
BATCH_SIZE = 10                  # rows buffered in memory before a batch is written to disk

# Column names, in output order, for every CSV file that is produced.
FIELDNAMES = [
    'Title',
    'Slug',
    'Excerpt',
    'Content',
    'Article',
    'Summary',
    'Image Featured',
    'Format',
    'Date',
    'Categories',
    'Tags',
]

# Category name -> keywords that select it. Order matters: assign_category()
# returns the first category with a matching keyword, and the keys are also
# iterated as search terms by the driver cell.
# NOTE(review): 'Travel-  Life Style' (two spaces) shares 'travel' with the
# earlier 'Life Style' entry, so only 'vacation' can reach it — confirm the
# spelling and the overlap are intentional.
CATEGORY_KEYWORDS = {
    'Entertainment': ['movie', 'music', 'show'],
    'Sports': ['sport', 'game', 'match'],
    'Business and Finance': ['market', 'finance', 'business'],
    'Health and Foods': ['health', 'food', 'wellness'],
    'Life Style': ['lifestyle', 'travel'],
    'Politics': ['politics', 'election', 'government'],
    'World News': ['world', 'international'],
    'Technology': ['tech', 'software', 'internet'],
    'Travel-  Life Style': ['travel', 'vacation'],
    'Flower': ['flower', 'floral'],
    'Fashion': ['fashion', 'runway'],
    'Uncategorized': [],
}

In [4]:
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Accented letters are folded to their base letter ("Café" -> "cafe")
    rather than being discarded. Every remaining run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    removed.

    Args:
        text: The text to convert (for example an article title).

    Returns:
        The slug; an empty string when ``text`` contains no ASCII letters or
        digits after folding.
    """
    import unicodedata  # local import: only this function needs it

    # NFKD splits "é" into "e" + a combining accent; dropping only the
    # combining marks keeps the base letter while leaving other punctuation
    # (dashes, quotes, ...) in place to act as word separators below.
    decomposed = unicodedata.normalize('NFKD', text)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # The character class with "+" already collapses consecutive separators
    # into one hyphen, so no second pass is needed.
    slug = re.sub(r"[^a-z0-9]+", '-', folded.lower()).strip('-')
    logger.debug(f"Slugified '{text}' to '{slug}'")
    return slug

def assign_category(title: str, tags: list) -> str:
    """Pick a category for an article from its title and tags.

    The title and tags are joined into one lowercase string and each entry of
    CATEGORY_KEYWORDS is checked in dictionary order; the first category that
    has any keyword occurring as a substring of that string wins.

    Args:
        title: Article title.
        tags: Keyword strings extracted for the article (may be empty).

    Returns:
        The matching category name, or 'Uncategorized' when nothing matches.
    """
    haystack = ' '.join((title, ' '.join(tags))).lower()
    matched = next(
        (
            name
            for name, keywords in CATEGORY_KEYWORDS.items()
            if any(keyword in haystack for keyword in keywords)
        ),
        None,
    )
    if matched is None:
        logger.debug(f"Assigned default category 'Uncategorized' for title '{title}'")
        return 'Uncategorized'
    logger.debug(f"Assigned category '{matched}' for title '{title}'")
    return matched

class RotatingCSVWriter:
    """Buffered CSV writer that rolls over to a new numbered file when full.

    Files are named ``<base_name>_<index>.csv`` with the index starting at 1,
    and each one starts with a header row built from FIELDNAMES. Rows are
    buffered in memory and written in batches of BATCH_SIZE; after each batch
    the on-disk size is compared with CSV_MAX_SIZE and, once reached, the next
    file is opened.

    The writer can be used as a context manager so the remaining buffered rows
    are written and the file is closed even when the body raises.
    """

    def __init__(self, base_name: str):
        """Open ``<base_name>_1.csv`` for writing (truncating an existing file)."""
        self.base = base_name
        self.index = 1
        self.buffer = []
        self._open()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def _open(self):
        """Close the current file, if any, and open the file for ``self.index``.

        This method must not flush the buffer itself: rotation is triggered
        from ``_flush`` and calling back into it would re-check the size of
        the old, still oversized file and recurse without end.
        """
        current = getattr(self, 'file', None)
        if current is not None and not current.closed:
            logger.debug(f"Closing file {current.name}")
            current.close()
        fname = f"{self.base}_{self.index}.csv"
        logger.debug(f"Opening CSV file: {fname}")
        self.file = open(fname, 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=FIELDNAMES)
        self.writer.writeheader()

    def write(self, row: dict):
        """Buffer one row; a full batch is written to disk immediately."""
        logger.debug(f"Buffering row with Title: {row.get('Title')} to {self.file.name}")
        self.buffer.append(row)
        if len(self.buffer) >= BATCH_SIZE:
            self._flush()

    def _write_buffer(self):
        """Write all buffered rows to the current file and empty the buffer."""
        logger.debug(f"Flushing {len(self.buffer)} rows to disk")
        self.writer.writerows(self.buffer)
        self.file.flush()  # make the size check below see the new rows
        self.buffer = []

    def _flush(self):
        """Write the buffered rows, then rotate if the file reached the limit."""
        self._write_buffer()
        size = os.path.getsize(self.file.name)
        logger.debug(f"Current file size: {size} bytes")
        if size >= CSV_MAX_SIZE:
            logger.debug(f"File size exceeded {CSV_MAX_SIZE}, rotating file")
            self.index += 1
            self._open()

    def close(self):
        """Write any remaining rows and close the current file.

        Safe to call more than once. No rotation happens here, so closing
        never leaves behind an extra file that contains only a header.
        """
        if self.file.closed:
            return
        logger.debug(f"Closing final file {self.file.name}")
        if self.buffer:
            self._write_buffer()
        self.file.close()

def _extract_article(link: str, html: str, title: str) -> dict:
    """Best-effort extraction of article fields from already-downloaded HTML.

    Returns a dict with keys 'text', 'summary', 'image', 'tags' and 'date';
    any field that could not be extracted keeps its empty default. Parsing
    and NLP are attempted separately so that a failure of the NLP step does
    not throw away text that was parsed successfully.
    """
    fields = {'text': '', 'summary': '', 'image': '', 'tags': [], 'date': ''}
    try:
        # A fresh Article is built for every URL: the object is stateful and
        # is not designed to be re-pointed at a different URL after creation.
        article = Article(link, config=np_config)
        article.download(input_html=html)  # reuse the prefetched HTML, no second request
        article.parse()
        fields['text'] = article.text
        fields['image'] = article.top_image or ''
        if article.publish_date:
            fields['date'] = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
    except Exception as e:
        logger.warning(f"Article parsing failed for '{title}': {e}")
        return fields
    try:
        article.nlp()
        fields['summary'] = article.summary
        fields['tags'] = list(article.keywords or [])
        logger.debug(f"Parsed article '{title}' with {len(fields['tags'])} tags")
    except Exception as e:
        logger.warning(f"Article NLP failed for '{title}': {e}")
    return fields


def fetch_articles_for_category(category: str, year: int, pages: int, writer: RotatingCSVWriter):
    """Search Google News for ``category`` within ``year`` and write one CSV row per article.

    For every result with a title not seen before in this call, the linked
    page is downloaded through the shared ``session``, parsed with newspaper,
    categorised with ``assign_category`` and handed to ``writer``. Results
    whose page cannot be downloaded are skipped; results whose page cannot be
    parsed are still written, using the search-result excerpt and date.

    Args:
        category: Search term; also used in log messages.
        year: Calendar year used for the ``after:``/``before:`` query bounds.
        pages: Maximum number of result pages to request.
        writer: Destination for the rows.
    """
    logger.debug(f"Fetching category '{category}' for year {year}")
    googlenews = GoogleNews(lang='en', region='US')
    query = f"{category} after:{year}-01-01 before:{year+1}-01-01"
    logger.debug(f"GoogleNews query: {query}")
    googlenews.search(query)
    seen = set()
    for p in range(1, pages + 1):
        if p > 1:
            logger.debug(f"Fetching GoogleNews page {p} for category '{category}'")
            googlenews.getpage(p)
        results = googlenews.results()
        logger.debug(f"Retrieved {len(results)} results on page {p}")
        new_titles = 0
        for res in results:
            title = (res.get('title') or '').strip()
            if not title or title in seen:
                continue
            seen.add(title)
            new_titles += 1
            logger.debug(f"Processing result title: {title}")
            # Drop everything after the first '&' to strip tracking
            # parameters from the result link.
            link = (res.get('link') or '').split('&')[0]
            logger.debug(f"Using link: {link}")
            excerpt = (res.get('desc') or '').strip()
            try:
                resp = session.get(link, timeout=10)
                resp.raise_for_status()
                html = resp.text
                logger.debug(f"Prefetched HTML for {link}")
            except Exception as e:
                # Best effort: an unreachable article must not abort the run.
                logger.warning(f"Failed to prefetch {link}: {e}")
                continue
            parsed = _extract_article(link, html, title)
            tags = parsed['tags']
            row = {
                'Title': title,
                'Slug': slugify(title),
                'Excerpt': excerpt,
                'Content': excerpt,
                'Article': parsed['text'],
                'Summary': parsed['summary'],
                'Image Featured': parsed['image'],
                'Format': 'standard',
                # Prefer the parsed publish date, else the search-result date.
                'Date': parsed['date'] or (res.get('date') or '').strip(),
                'Categories': assign_category(title, tags),
                'Tags': '|'.join(tags),
            }
            writer.write(row)
        if new_titles == 0:
            # results() appears to return everything accumulated since
            # search() (getpage() adds to it), so an empty list cannot signal
            # the last page. A page that contributes no unseen title means the
            # results are exhausted or the request failed (e.g. rate limiting),
            # so stop instead of requesting the remaining pages.
            logger.debug(f"No new results on page {p} for category '{category}', stopping")
            break
    googlenews.clear()


In [5]:
# Run configuration.
year = 2025
# Page limit as text (e.g. as it would arrive from user input); anything that
# is not a plain non-negative integer falls back to DEFAULT_MAX_PAGES.
# NOTE(review): 10000 pages per category is a very high ceiling and the
# recorded run of this cell hit HTTP 429 rate limiting — consider lowering it.
pages_setting = "10000"
pages = int(pages_setting) if pages_setting.isdigit() else DEFAULT_MAX_PAGES
logger.debug(f"Starting main with year={year}, pages={pages}")

writer = RotatingCSVWriter(f"news_{year}")
try:
    # NOTE(review): this also searches for the 'Uncategorized' fallback key —
    # confirm that is intended.
    for cat in CATEGORY_KEYWORDS:
        fetch_articles_for_category(cat, year, pages, writer)
finally:
    # Always write the buffered rows and close the file, even when a category
    # fails or the run is interrupted, so collected articles are not lost.
    writer.close()
logger.debug("All categories processed, exiting main")


HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
HTTP Error 429: Too Many Requests
