V1

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from newspaper import Article
import nltk
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

# -------------------------------------------------------
# NLTK download
# -------------------------------------------------------
# Make sure the Punkt tokenizer data that sent_tokenize relies on is present
# locally; it is downloaded only when the lookup below fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Resource not found locally: fetch it once.
    nltk.download('punkt')

# -------------------------------------------------------
# KEYWORD LISTS
# -------------------------------------------------------
# Lower-case phrases; a sentence containing any phrase of a list counts as one
# hit for that list's category in categorize_article().
GBV_KEYWORDS = [
    'gender based violence',
    'gbv',
    'domestic violence',
    'sexual harassment',
    'women abuse',
    'femicide',
]
CYBERBULLYING_KEYWORDS = [
    'cyber bullying',
    'online harassment',
    'cyberbullying',
    'internet trolling',
    'social media abuse',
]
SCAMS_KEYWORDS = [
    'scam',
    'fraud',
    'online scam',
    'phishing',
    'cyber fraud',
    'investment scam',
]

# -------------------------------------------------------
# CATEGORIZATION
# -------------------------------------------------------
def categorize_article(text):
    """Label an article as "GBV", "Cyberbullying", "Scams" or "Other".

    The text is split into sentences; each sentence that contains at least
    one keyword of a category (case-insensitive substring match) adds one
    point to that category. The category with the most points wins, with ties
    going to the earlier category in the order GBV, Cyberbullying, Scams.
    Empty text, or text with no keyword hits, yields "Other".
    """
    if not text:
        return "Other"

    keyword_table = (
        ("GBV", GBV_KEYWORDS),
        ("Cyberbullying", CYBERBULLYING_KEYWORDS),
        ("Scams", SCAMS_KEYWORDS),
    )
    lowered = [sentence.lower() for sentence in sent_tokenize(text)]

    # One point per sentence that mentions any keyword of the category.
    tally = {
        label: sum(
            1 for sentence in lowered
            if any(keyword in sentence for keyword in keywords)
        )
        for label, keywords in keyword_table
    }

    winner = max(tally, key=tally.get)
    if tally[winner] > 0:
        return winner
    return "Other"

# -------------------------------------------------------
# LINK EXTRACTION
# -------------------------------------------------------
def extract_article_links(html, base_url, max_links=5):
    """Collect up to ``max_links`` same-host links that look like articles.

    Args:
        html: Page markup to scan.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it and only links on its host are kept.
        max_links: Maximum number of distinct links to return.

    Returns:
        A list of absolute URLs without duplicates, in the order they were
        first encountered (selectors are tried in the order listed below).
    """
    soup = BeautifulSoup(html, "html.parser")
    site_host = urlparse(base_url).netloc  # loop-invariant, parse once
    # A dict keeps insertion order, so the result is reproducible between
    # runs (a plain set would return the links in arbitrary order).
    found = {}

    selectors = [
        'article a[href]',
        'h2 a[href]', 'h3 a[href]',
        '.story a[href]', '.news-item a[href]',
        'a[href*="/news/"]', 'a[href*="/article/"]', 'a[href*="/story/"]'
    ]

    for selector in selectors:
        # Only the first 2 * max_links matches of each selector are examined.
        for elem in soup.select(selector)[:max_links * 2]:
            href = elem.get("href")
            if not href:
                continue

            full = urljoin(base_url, href)
            if urlparse(full).netloc != site_host:
                continue  # off-site link (or a scheme without a host)

            found[full] = None
            if len(found) >= max_links:
                return list(found)

    return list(found)

# -------------------------------------------------------
# DATE CHECK
# -------------------------------------------------------
def is_recent(date, days=30):
    """Return True when ``date`` is at most ``days`` whole days in the past.

    Args:
        date: A ``datetime`` (naive or timezone-aware) or a falsy value when
            the publication date is unknown.
        days: Maximum accepted age in whole days.

    Returns:
        True for unknown dates and for dates whose age in whole days does not
        exceed ``days``; False otherwise.
    """
    if not date:
        return True
    # Build "now" in the same timezone flavour as ``date``: subtracting an
    # aware datetime from a naive one raises TypeError. For a naive ``date``
    # tzinfo is None and this is plain datetime.now().
    now = datetime.now(tz=date.tzinfo)
    return (now - date).days <= days

# -------------------------------------------------------
# MAIN SCRAPER
# -------------------------------------------------------
def main():
    """Scrape the configured news sites and save the results to a CSV file.

    Each site's front page is rendered in headless Chrome, candidate article
    links are pulled out with ``extract_article_links``, and every article is
    downloaded and parsed with ``Article``, filtered with ``is_recent`` and
    labelled with ``categorize_article``. Collected rows are written to
    ``articles_from_sites.csv``; nothing is written when no article was
    collected.
    """
    site_urls = [
        'https://www.the-star.co.ke/',
        'https://www.tuko.co.ke/'
    ]

    max_articles = 5
    data = []

    # ---------------------------------------------------
    # Selenium Setup (FIXED)
    # ---------------------------------------------------
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("start-maximized")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    # ---------------------------------------------------
    # Scraping Loop
    # ---------------------------------------------------
    try:
        for site in site_urls:
            print(f"\nüîµ Loading site: {site}")

            try:
                driver.get(site)
                time.sleep(3)  # give the page time to render

                html = driver.page_source
                article_links = extract_article_links(html, site, max_articles)

                print(f" ‚Üí Found {len(article_links)} article links")

                for url in article_links:
                    try:
                        print(f"   üì∞ Processing: {url}")

                        article = Article(url)
                        article.download()
                        article.parse()

                        if not is_recent(article.publish_date):
                            print("     ‚è≥ Old article ‚Üí Skipped")
                            continue

                        text = article.text
                        category = categorize_article(text)

                        data.append({
                            "site_url": site,
                            "article_url": url,
                            "title": article.title,
                            "publish_date": article.publish_date.strftime("%Y-%m-%d") if article.publish_date else "Unknown",
                            "category": category,
                            "summary_snippet": text[:200] + "..." if len(text) > 200 else text
                        })

                        time.sleep(1)  # pause between article requests

                    except Exception as e:
                        # Best effort: one bad article must not stop the site.
                        print(f"     ‚ùå Article parsing error: {e}")

            except Exception as e:
                # Best effort: one bad site must not stop the run.
                print(f"‚ùå Site loading error: {e}")
    finally:
        # Always shut the browser down, including on KeyboardInterrupt or an
        # unexpected error, so no headless Chrome process is left behind.
        driver.quit()

    # ---------------------------------------------------
    # SAVE RESULTS
    # ---------------------------------------------------
    if data:
        df = pd.DataFrame(data)
        df.to_csv("articles_from_sites.csv", index=False, encoding="utf-8")
        print(f"\n‚úÖ Saved {len(data)} articles to articles_from_sites.csv")
        print(df.head())
    else:
        print("\n‚ö† No articles scraped.")

# Run the scraper as soon as this cell/module is executed (no __main__ guard).
main()



üîµ Loading site: https://www.the-star.co.ke/
 ‚Üí Found 5 article links
   üì∞ Processing: https://www.the-star.co.ke/video/2025-11-17-ps-bitok-kjsea-results-to-be-released-by-december-11
   üì∞ Processing: https://www.the-star.co.ke/opinion/star-blogs/2025-11-16-odm-must-find-its-ideological-soul-to-survive-beyond-raila
   üì∞ Processing: https://www.the-star.co.ke/news/infographics/2025-11-17-kenyas-acute-food-insecurity-outlook
   üì∞ Processing: https://www.the-star.co.ke/business/kenya/2025-11-17-senate-taxpayers-association-to-address-gaps-in-agriculture-governance
   üì∞ Processing: https://www.the-star.co.ke/video

üîµ Loading site: https://www.tuko.co.ke/
 ‚Üí Found 0 article links

‚úÖ Saved 5 articles to articles_from_sites.csv
                      site_url  \
0  https://www.the-star.co.ke/   
1  https://www.the-star.co.ke/   
2  https://www.the-star.co.ke/   
3  https://www.the-star.co.ke/   
4  https://www.the-star.co.ke/   

                                     

V 2- Database

In [1]:
# =====================================================================
# Enhanced Article Scraper with PostgreSQL, Elasticsearch, and Celery
# =====================================================================
# This script adapts the original scraper to:
# - Use PostgreSQL (via SQLAlchemy) for structured storage.
# - Use Elasticsearch for full-text search indexing.
# - Use Celery with Redis for asynchronous tasking (scraping sites in background tasks).
#
# Assumptions:
# - PostgreSQL DB: Create a database named 'articles_db' (or update DATABASE_URL). Ensure server is running on localhost:5432.
# - Elasticsearch: Running on localhost:9200 (update ES_URL if needed).
# - Redis: Running on localhost:6379 (Celery broker).
# - In Jupyter Notebook (PyCharm): Run cells sequentially. First setup cell (including NLTK download),
#   then start Celery worker (in terminal: celery -A scraper worker --loglevel=info),
#   then queue tasks.
#
# Usage in Jupyter:
# 1. Run setup (imports, models, Celery app) - this creates tables if DB connected.
# 2. NLTK download happens automatically in setup.
# 3. In a separate terminal in PyCharm: Activate env and run `celery -A your_notebook_module worker --loglevel=info` (save as .py or use module name).
# 4. Queue tasks: results = main()
# 5. Monitor worker logs; query DB/ES as needed.
#
# Fixed Issues:
# - SQLAlchemy 2.0+ deprecation: Updated declarative_base import.
# - DB Model: publish_date as String (matches stored format; avoids parsing errors for "Unknown").
# - ES Search: Fixed bool query structure for category filtering.
# - Added NLTK download back.
# - Minor: publish_date reaches the task as a "%Y-%m-%d" string or "Unknown"; save_to_db() replaces "Unknown" with None before inserting.
# - Tested non-external parts (e.g., categorization logic) mentally; full run requires local services.
# =====================================================================

# -------------------------------------------------------
# IMPORTS
# -------------------------------------------------------
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from newspaper import Article
import nltk
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

# New imports for DB, ES, Celery
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base, sessionmaker  # Fixed: Use orm.declarative_base for SQLAlchemy 2.0+
from elasticsearch import Elasticsearch
from celery import Celery

# -------------------------------------------------------
# NLTK DOWNLOAD (added back)
# -------------------------------------------------------
# Make sure the Punkt tokenizer data that sent_tokenize relies on is present
# locally; it is downloaded only when the lookup below fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Resource not found locally: fetch it once.
    nltk.download('punkt')

# -------------------------------------------------------
# CONFIGURATION
# -------------------------------------------------------
# Database (PostgreSQL) - UPDATE WITH YOUR ACTUAL CREDS!
# NOTE(review): the password is embedded in source; presumably a placeholder -
# confirm real credentials are supplied another way before sharing this file.
DATABASE_URL = "postgresql://postgres:your_password@localhost/articles_db"  # e.g., postgres:pass123@localhost/articles_db
# Engine and session factory used by save_to_db() and query_db().
engine = create_engine(DATABASE_URL)
# Sessions neither autocommit nor autoflush; callers commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base that the ORM model in this file inherits from.
Base = declarative_base()

# Elasticsearch
ES_URL = "http://localhost:9200"
# Single client instance used by index_to_es() and search_es().
es_client = Elasticsearch([ES_URL])

# Celery (Redis broker)
# The same Redis database (0) serves as both broker and result backend.
celery_app = Celery('scraper', broker='redis://localhost:6379/0', backend='redis://localhost:6379/0')
# Task arguments and results are exchanged as JSON only.
celery_app.conf.update(
    task_serializer='json',
    accept_content=['json'],
    result_serializer='json',
    timezone='Africa/Nairobi',
    enable_utc=True,
)

# -------------------------------------------------------
# DATABASE MODEL (publish_date as String to match stored format)
# -------------------------------------------------------
# NOTE(review): this class reuses the name `Article`, which this cell also
# imports from `newspaper`. From this point on the bare name refers to this
# ORM model, so code that needs the newspaper class must import it under a
# different name.
class Article(Base):
    """ORM model for one scraped article row in the ``articles`` table."""

    __tablename__ = "articles"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, index=True)
    # Site the article was found on.
    site_url = Column(String, index=True)
    # Unique per article: inserting the same URL twice violates the constraint.
    article_url = Column(String, unique=True, index=True)
    title = Column(String)
    publish_date = Column(String(50))  # Changed to String for "%Y-%m-%d" or "Unknown"
    # Category label for the article.
    category = Column(String)
    # Short excerpt of the article text.
    summary_snippet = Column(Text)
    # Filled in with datetime.utcnow() when no value is supplied.
    created_at = Column(DateTime, default=datetime.utcnow)

# Create tables (run once) - Will fail if DB not connected/running!
# Executed at module top level, so merely running/importing this code needs a
# reachable database; create_all targets every model registered on Base.
Base.metadata.create_all(bind=engine)
print("‚úÖ Tables created (or already exist).")

# -------------------------------------------------------
# KEYWORD LISTS (unchanged)
# -------------------------------------------------------
# Lower-case phrases; a sentence containing any phrase of a list counts as one
# hit for that list's category in categorize_article().
GBV_KEYWORDS = [
    'gender based violence',
    'gbv',
    'domestic violence',
    'sexual harassment',
    'women abuse',
    'femicide',
]
CYBERBULLYING_KEYWORDS = [
    'cyber bullying',
    'online harassment',
    'cyberbullying',
    'internet trolling',
    'social media abuse',
]
SCAMS_KEYWORDS = [
    'scam',
    'fraud',
    'online scam',
    'phishing',
    'cyber fraud',
    'investment scam',
]

# -------------------------------------------------------
# CATEGORIZATION (unchanged)
# -------------------------------------------------------
def categorize_article(text):
    """Label an article as "GBV", "Cyberbullying", "Scams" or "Other".

    Every sentence that contains at least one keyword of a category
    (case-insensitive substring match) scores one point for that category.
    The highest-scoring category is returned; ties go to the earlier category
    in the order GBV, Cyberbullying, Scams. Empty text, or text without any
    keyword hit, yields "Other".
    """
    if not text:
        return "Other"

    groups = {
        "GBV": GBV_KEYWORDS,
        "Cyberbullying": CYBERBULLYING_KEYWORDS,
        "Scams": SCAMS_KEYWORDS,
    }
    hits = dict.fromkeys(groups, 0)

    for raw_sentence in sent_tokenize(text):
        sentence = raw_sentence.lower()
        for label, keywords in groups.items():
            if any(keyword in sentence for keyword in keywords):
                hits[label] += 1

    top_label = max(hits, key=hits.get)
    return "Other" if hits[top_label] <= 0 else top_label

# -------------------------------------------------------
# LINK EXTRACTION (unchanged)
# -------------------------------------------------------
def extract_article_links(html, base_url, max_links=5):
    """Collect up to ``max_links`` same-host links that look like articles.

    Args:
        html: Page markup to scan.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it and only links on its host are kept.
        max_links: Maximum number of distinct links to return.

    Returns:
        A list of absolute URLs without duplicates, in the order they were
        first encountered (selectors are tried in the order listed below).
    """
    soup = BeautifulSoup(html, "html.parser")
    site_host = urlparse(base_url).netloc  # loop-invariant, parse once
    # A dict keeps insertion order, so the result is reproducible between
    # runs (a plain set would return the links in arbitrary order).
    found = {}
    selectors = [
        'article a[href]', 'h2 a[href]', 'h3 a[href]', '.story a[href]', '.news-item a[href]',
        'a[href*="/news/"]', 'a[href*="/article/"]', 'a[href*="/story/"]'
    ]
    for selector in selectors:
        # Only the first 2 * max_links matches of each selector are examined.
        for elem in soup.select(selector)[:max_links * 2]:
            href = elem.get("href")
            if not href:
                continue
            full = urljoin(base_url, href)
            if urlparse(full).netloc == site_host:
                found[full] = None
            if len(found) >= max_links:
                return list(found)
    return list(found)

# -------------------------------------------------------
# DATE CHECK (unchanged)
# -------------------------------------------------------
def is_recent(date, days=30):
    """Return True when ``date`` is at most ``days`` whole days in the past.

    Args:
        date: A ``datetime`` (naive or timezone-aware) or a falsy value when
            the publication date is unknown.
        days: Maximum accepted age in whole days.

    Returns:
        True for unknown dates and for dates whose age in whole days does not
        exceed ``days``; False otherwise.
    """
    if not date:
        return True
    # Build "now" in the same timezone flavour as ``date``: subtracting an
    # aware datetime from a naive one raises TypeError. For a naive ``date``
    # tzinfo is None and this is plain datetime.now().
    now = datetime.now(tz=date.tzinfo)
    return (now - date).days <= days

# -------------------------------------------------------
# SAVE TO POSTGRESQL
# -------------------------------------------------------
def save_to_db(article_data):
    """Insert one article row and return its primary key.

    Args:
        article_data: Mapping of ``Article`` column names to values. It is
            modified in place: a ``publish_date`` of "Unknown" is replaced
            with None before the row is built.

    Returns:
        The id of the new row, or None when the insert failed (the
        transaction is rolled back and the error is printed).
    """
    session = SessionLocal()
    try:
        # Store a missing date as None rather than the "Unknown" placeholder.
        date_is_unknown = article_data["publish_date"] == "Unknown"
        if date_is_unknown:
            article_data["publish_date"] = None

        row = Article(**article_data)
        session.add(row)
        session.commit()
        session.refresh(row)
        new_id = row.id
    except Exception as err:
        session.rollback()
        print(f"‚ùå DB Save error: {err}")
        new_id = None
    finally:
        session.close()
    return new_id

# -------------------------------------------------------
# INDEX TO ELASTICSEARCH
# -------------------------------------------------------
def index_to_es(article_data, db_id):
    """Index one article in the "articles" Elasticsearch index.

    Args:
        article_data: Mapping with the keys site_url, article_url, title,
            publish_date, category and summary_snippet.
        db_id: Value stored in the document's "id" field and used as the
            Elasticsearch document id.

    Indexing failures are printed and not re-raised.
    """
    copied_fields = (
        "site_url",
        "article_url",
        "title",
        "publish_date",
        "category",
        "summary_snippet",
    )
    payload = {"id": db_id}
    payload.update((field, article_data[field]) for field in copied_fields)
    payload["created_at"] = datetime.utcnow().isoformat()

    try:
        es_client.index(index="articles", id=db_id, body=payload)
        print(f"‚úÖ Indexed to ES: {article_data['title'][:50]}...")
    except Exception as err:
        print(f"‚ùå ES Index error: {err}")

# -------------------------------------------------------
# CELERY TASK: SCRAPE SINGLE SITE
# -------------------------------------------------------
@celery_app.task(bind=True)
def scrape_site_task(self, site_url, max_articles=5):
    """Celery task: scrape one site, store its articles and index them.

    Args:
        site_url: Front-page URL of the site to scrape.
        max_articles: Maximum number of article links to process.

    Returns:
        ``{"site": site_url, "processed": n}`` where ``n`` counts the
        articles that were saved to the database.

    Raises:
        Whatever ``self.retry`` raises when loading the site or extracting
        its links fails; the retry is scheduled 60 seconds later.
    """
    # The module-level name `Article` is the SQLAlchemy model in this file
    # (it shadows newspaper's class of the same name), so calling
    # `Article(url)` here failed with a TypeError for every article. Import
    # the newspaper class under an unambiguous local alias instead.
    from newspaper import Article as NewsArticle

    print(f"üîµ Task started: Scraping {site_url}")

    # Selenium Setup
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    try:
        driver.get(site_url)
        time.sleep(3)  # give the page time to render
        html = driver.page_source
        article_links = extract_article_links(html, site_url, max_articles)
        print(f" ‚Üí Found {len(article_links)} article links")

        processed_count = 0
        for url in article_links:
            try:
                print(f" üì∞ Processing: {url}")
                article = NewsArticle(url)
                article.download()
                article.parse()

                if not is_recent(article.publish_date):
                    print(" ‚è≥ Old article ‚Üí Skipped")
                    continue

                text = article.text
                category = categorize_article(text)

                publish_date_str = article.publish_date.strftime("%Y-%m-%d") if article.publish_date else "Unknown"
                article_data = {
                    "site_url": site_url,
                    "article_url": url,
                    "title": article.title,
                    "publish_date": publish_date_str,
                    "category": category,
                    "summary_snippet": text[:200] + "..." if len(text) > 200 else text
                }

                # Save to PG
                db_id = save_to_db(article_data)
                if db_id:
                    # Index to ES only for rows that were actually stored.
                    index_to_es(article_data, db_id)
                    processed_count += 1

                time.sleep(1)  # pause between article requests

            except Exception as e:
                # Best effort: one bad article must not fail the whole task.
                print(f" ‚ùå Article parsing error: {e}")
                continue

        print(f"‚úÖ Task completed: Processed {processed_count} articles from {site_url}")
        return {"site": site_url, "processed": processed_count}

    except Exception as e:
        print(f"‚ùå Site loading error: {e}")
        raise self.retry(countdown=60)  # Retry after 1 min on failure
    finally:
        # Runs on success, failure and retry alike.
        driver.quit()

# -------------------------------------------------------
# MAIN (for queuing tasks - run in Jupyter)
# -------------------------------------------------------
def main():
    """Queue one ``scrape_site_task`` per configured site.

    Returns:
        The list of objects returned by ``scrape_site_task.delay``, one per
        site, in the same order as the sites.
    """
    site_urls = [
        'https://www.the-star.co.ke/',
        'https://www.tuko.co.ke/'
    ]
    max_articles = 5

    # Hand every site to the Celery worker; nothing is scraped in this process.
    results = [scrape_site_task.delay(site, max_articles) for site in site_urls]

    print(f"üöÄ Queued {len(results)} tasks. Check Celery worker logs for progress.")
    return results

# -------------------------------------------------------
# QUERY EXAMPLES (for Jupyter) - Fixed ES search structure
# -------------------------------------------------------
def query_db(category=None):
    """Load articles from PostgreSQL into a DataFrame.

    Args:
        category: When truthy, only rows whose category equals this value
            are returned; otherwise all rows are returned.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for the query.
    """
    session = SessionLocal()
    try:
        selection = session.query(Article)
        if category:
            selection = selection.filter(Article.category == category)
        return pd.read_sql(selection.statement, engine)
    finally:
        session.close()

def search_es(query_str, category=None):
    """Search articles in Elasticsearch.

    Args:
        query_str: Text matched against the ``title`` and
            ``summary_snippet`` fields.
        category: Optional category label (for example "Scams"); when truthy,
            results are restricted to documents of that category.

    Returns:
        A list with the ``_source`` of every hit, or an empty list when the
        search fails (the error is printed).
    """
    multi_match = {"multi_match": {"query": query_str, "fields": ["title", "summary_snippet"]}}

    if category:
        # Nothing in this file defines an index mapping, so `category` is
        # presumably mapped dynamically as analyzed text. An exact `term`
        # filter with a capitalised value such as "Scams" then never matches
        # the lower-cased indexed token. An analyzed `match` works for that
        # mapping and for a keyword mapping; operator "and" requires every
        # word of the category label to be present.
        category_filter = {"match": {"category": {"query": category, "operator": "and"}}}
        search_body = {
            "query": {
                "bool": {
                    "must": [multi_match],
                    "filter": [category_filter]
                }
            }
        }
    else:
        search_body = {"query": multi_match}

    try:
        res = es_client.search(index="articles", body=search_body)
        return [hit["_source"] for hit in res["hits"]["hits"]]
    except Exception as e:
        print(f"‚ùå ES Search error: {e}")
        return []

# Example usage in Jupyter:
# results = main()  # Queues tasks
# df = query_db("GBV")  # After tasks complete
# es_results = search_es("scam", "Scams")
# print(df.head() if not df.empty else "No data yet.")

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

V3

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from newspaper import Article
import nltk
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from transformers import pipeline
import torch
import concurrent.futures
from functools import lru_cache

# -------------------------------------------------------
# NLTK download
# -------------------------------------------------------
# Make sure the Punkt tokenizer data that sent_tokenize relies on is present
# locally; it is downloaded only when the lookup below fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Resource not found locally: fetch it once.
    nltk.download('punkt')

# -------------------------------------------------------
# KEYWORD LISTS (original for categorization)
# -------------------------------------------------------
# Lower-case phrases; a sentence containing any phrase of a list counts as one
# hit for that list's category in categorize_article().
GBV_KEYWORDS = [
    'gender based violence',
    'gbv',
    'domestic violence',
    'sexual harassment',
    'women abuse',
    'femicide',
]
CYBERBULLYING_KEYWORDS = [
    'cyber bullying',
    'online harassment',
    'cyberbullying',
    'internet trolling',
    'social media abuse',
]
SCAMS_KEYWORDS = [
    'scam',
    'fraud',
    'online scam',
    'phishing',
    'cyber fraud',
    'investment scam',
]

# -------------------------------------------------------
# CATEGORIZATION (original keyword-based)
# -------------------------------------------------------
def categorize_article(text):
    """Label an article as "GBV", "Cyberbullying", "Scams" or "Other".

    Each sentence containing at least one keyword of a category
    (case-insensitive substring match) adds one point to that category. The
    category with the most points is returned, ties going to the earlier one
    in the order GBV, Cyberbullying, Scams. Empty text, or text without any
    keyword hit, yields "Other".
    """
    if not text:
        return "Other"

    gbv_hits = cyber_hits = scam_hits = 0
    for sentence in (raw.lower() for raw in sent_tokenize(text)):
        if any(keyword in sentence for keyword in GBV_KEYWORDS):
            gbv_hits += 1
        if any(keyword in sentence for keyword in CYBERBULLYING_KEYWORDS):
            cyber_hits += 1
        if any(keyword in sentence for keyword in SCAMS_KEYWORDS):
            scam_hits += 1

    totals = {"GBV": gbv_hits, "Cyberbullying": cyber_hits, "Scams": scam_hits}
    leader = max(totals, key=totals.get)
    if totals[leader] > 0:
        return leader
    return "Other"

# -------------------------------------------------------
# LINK EXTRACTION (original, cached)
# -------------------------------------------------------
@lru_cache(maxsize=10)
def _extract_article_links_cached(html, base_url, max_links):
    """Cached worker for extract_article_links; returns an immutable tuple."""
    soup = BeautifulSoup(html, "html.parser")
    site_host = urlparse(base_url).netloc  # loop-invariant, parse once
    # A dict keeps insertion order, so the result is reproducible between
    # runs (a plain set would return the links in arbitrary order).
    found = {}
    selectors = [
        'article a[href]', 'h2 a[href]', 'h3 a[href]', '.story a[href]', '.news-item a[href]',
        'a[href*="/news/"]', 'a[href*="/article/"]', 'a[href*="/story/"]'
    ]
    for selector in selectors:
        # Only the first 2 * max_links matches of each selector are examined.
        for elem in soup.select(selector)[:max_links * 2]:
            href = elem.get("href")
            if not href:
                continue
            full = urljoin(base_url, href)
            if urlparse(full).netloc == site_host:
                found[full] = None
            if len(found) >= max_links:
                return tuple(found)
    return tuple(found)


def extract_article_links(html, base_url, max_links=5):
    """Collect up to ``max_links`` same-host links that look like articles.

    Results are memoised (up to 10 distinct argument combinations) by a
    private helper. Each call returns a fresh list, so a caller that mutates
    the result cannot corrupt what later callers receive from the cache.

    Args:
        html: Page markup to scan.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it and only links on its host are kept.
        max_links: Maximum number of distinct links to return.

    Returns:
        A list of absolute URLs without duplicates, in the order they were
        first encountered.
    """
    return list(_extract_article_links_cached(html, base_url, max_links))

# -------------------------------------------------------
# DATE CHECK (original)
# -------------------------------------------------------
def is_recent(date, days=30):
    """Return True when ``date`` is at most ``days`` whole days in the past.

    Args:
        date: A ``datetime`` (naive or timezone-aware) or a falsy value when
            the publication date is unknown.
        days: Maximum accepted age in whole days.

    Returns:
        True for unknown dates and for dates whose age in whole days does not
        exceed ``days``; False otherwise.
    """
    if not date:
        return True
    # Build "now" in the same timezone flavour as ``date``: subtracting an
    # aware datetime from a naive one raises TypeError. For a naive ``date``
    # tzinfo is None and this is plain datetime.now().
    now = datetime.now(tz=date.tzinfo)
    return (now - date).days <= days

# -------------------------------------------------------
# ARTICLE PROCESSOR (for parallel execution)
# -------------------------------------------------------
def process_article(url, site_url, sentiment_pipeline, topic_pipeline, ner_pipeline):
    """Download, filter and analyse a single article.

    Args:
        url: Article URL to download and parse.
        site_url: Site the link was found on (copied into the result).
        sentiment_pipeline: Callable returning a list of dicts with a
            ``label`` key, or a falsy value to skip sentiment analysis.
        topic_pipeline: Callable accepting ``candidate_labels`` and returning
            a dict with a ``labels`` list, or a falsy value to skip.
        ner_pipeline: Callable returning a list of dicts with ``word`` and
            ``entity_group`` keys, or a falsy value to skip.

    Returns:
        A dict describing the article, or None when the article is not
        recent, has fewer than 50 words, or any step raised (the error is
        printed).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        if not is_recent(article.publish_date):
            return None
        text = article.text
        if not text or len(text.split()) < 50:  # Skip very short texts for speed
            return None

        # Original keyword categorization
        category = categorize_article(text)

        # Rough pre-cut to the first 512 whitespace-separated words. This
        # bounds the work, not the token count: 512 words usually become
        # more than 512 model tokens.
        truncated_text = ' '.join(text.split()[:512])

        # Sentiment
        sentiment = "N/A"
        if sentiment_pipeline:
            # Let the tokenizer cut the input to the model limit; without
            # this a long article can exceed the model's maximum sequence
            # length and the call fails.
            sentiment_result = sentiment_pipeline(
                truncated_text, truncation=True, max_length=512
            )
            sentiment = sentiment_result[0]['label']

        # Topic (zero-shot, lighter model)
        topic = "N/A"
        if topic_pipeline:
            candidate_labels = [
                "hate speech", "scam", "gender based violence", "cyber bullying",
                "high risk crime location", "neutral"
            ]
            topic_result = topic_pipeline(truncated_text, candidate_labels=candidate_labels)
            topic = topic_result['labels'][0]  # first label in the result

        # NER (locations/orgs/misc)
        ner_entities = "[]"
        if ner_pipeline:
            ner_result = ner_pipeline(truncated_text)
            relevant_entities = [ent['word'] for ent in ner_result if ent['entity_group'] in ['LOC', 'ORG', 'MISC']]
            ner_entities = str(relevant_entities)  # stored as text for the CSV

        return {
            "site_url": site_url,
            "article_url": url,
            "title": article.title,
            "publish_date": article.publish_date.strftime("%Y-%m-%d") if article.publish_date else "Unknown",
            "category": category,
            "sentiment": sentiment,
            "topic": topic,
            "ner_entities": ner_entities,
            "summary_snippet": text[:200] + "..." if len(text) > 200 else text
        }
    except Exception as e:
        # Best effort: a failing article is reported and skipped.
        print(f" ‚ùå Article processing error for {url}: {e}")
        return None

# -------------------------------------------------------
# MAIN SCRAPER
# -------------------------------------------------------
def main():
    """Scrape the configured sites, analyse each article and save a CSV.

    The three ML pipelines (sentiment, NER, zero-shot topic) are loaded once;
    a pipeline that fails to load is left as None and its analysis is skipped.
    Each site's front page is rendered in headless Chrome, article links are
    extracted, and the articles are processed by ``process_article`` on a
    small thread pool. Collected rows are written to
    ``articles_from_sites.csv``.
    """
    site_urls = [
        'https://www.standardmedia.co.ke/',
        'https://www.the-star.co.ke/',
        'https://www.tuko.co.ke/'
    ]
    max_articles = 5
    data = []

    # ---------------------------------------------------
    # Model Pipelines (loaded once for efficiency)
    # ---------------------------------------------------
    print("üîÑ Loading ML models... This may take a few minutes.")
    device = 0 if torch.cuda.is_available() else -1
    print(f"üì± Using device: {'GPU' if device == 0 else 'CPU'}")

    sentiment_pipeline = None
    try:
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
            device=device
        )
        print("‚úÖ Sentiment model loaded.")
    except Exception as e:
        print(f"‚ùå Error loading sentiment model: {e}")

    ner_pipeline = None
    try:
        ner_pipeline = pipeline(
            "ner",
            model="Davlan/bert-base-multilingual-cased-ner-hrl",
            aggregation_strategy="simple",
            device=device
        )
        print("‚úÖ NER model loaded.")
    except Exception as e:
        print(f"‚ùå Error loading NER model: {e}")

    topic_pipeline = None
    try:
        # Lighter zero-shot model for speed
        topic_pipeline = pipeline(
            "zero-shot-classification",
            model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
            device=device
        )
        print("‚úÖ Topic model loaded.")
    except Exception as e:
        print(f"‚ùå Error loading topic model: {e}")

    # ---------------------------------------------------
    # Selenium Setup (with implicit wait for speed)
    # ---------------------------------------------------
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")  # For stability
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    # ---------------------------------------------------
    # Scraping Loop (with parallel article processing)
    # ---------------------------------------------------
    try:
        driver.implicitly_wait(5)  # Faster dynamic waits

        for site in site_urls:
            print(f"\nüîµ Loading site: {site}")
            try:
                driver.get(site)
                time.sleep(2)  # Reduced wait
                html = driver.page_source
                article_links = extract_article_links(html, site, max_articles)
                print(f" ‚Üí Found {len(article_links)} article links")

                # Parallel process articles
                with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:  # Limit threads for stability
                    futures = [
                        executor.submit(process_article, url, site, sentiment_pipeline, topic_pipeline, ner_pipeline)
                        for url in article_links
                    ]
                    # Results are collected in completion order, not link order.
                    for future in concurrent.futures.as_completed(futures):
                        result = future.result()
                        if result:
                            data.append(result)
                            category = result['category']
                            sentiment = result['sentiment']
                            topic = result['topic']
                            # Shorten long entity lists for the log line only.
                            ner_entities = result['ner_entities'][:50] + "..." if len(result['ner_entities']) > 50 else result['ner_entities']
                            print(f" üìä Category: {category} | Sentiment: {sentiment} | Topic: {topic} | NER: {ner_entities}")

                time.sleep(0.5)  # Brief pause between sites
            except Exception as e:
                # Best effort: one bad site must not stop the run.
                print(f"‚ùå Site loading error: {e}")
    finally:
        # Always shut the browser down, including on KeyboardInterrupt or an
        # unexpected error, so no headless Chrome process is left behind.
        driver.quit()

    # ---------------------------------------------------
    # SAVE RESULTS
    # ---------------------------------------------------
    if data:
        df = pd.DataFrame(data)
        df.to_csv("articles_from_sites.csv", index=False, encoding="utf-8")
        print(f"\n‚úÖ Saved {len(data)} articles to articles_from_sites.csv")
        print("\nüìã CSV Columns: site_url, article_url, title, publish_date, category, sentiment, topic, ner_entities, summary_snippet")
        print(df[['title', 'category', 'sentiment', 'topic', 'ner_entities']].head())
    else:
        print("\n‚ö† No articles scraped.")

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()