In [3]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [48]:
import hashlib
import logging
import os
import random
import sqlite3
import time
from contextlib import closing
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

In [49]:
# Location of the SQLite database used by every helper in this notebook.
# The directory can be overridden per machine through the NEWS_DB_DIR
# environment variable; when it is unset the original hard-coded location is
# used, so existing setups keep working unchanged.
DB_DIR = os.environ.get(
    "NEWS_DB_DIR",
    r"C:/Users/HP/Documents/repos/news_ingestion_data_pipeline/data",
)
DB_FILE = "articles.db"
DB_PATH = os.path.join(DB_DIR, DB_FILE)

In [50]:
def get_latest_news_time(db_path=None):
    """Return the start-of-day of the newest stored article and the load mode.

    Reads ``MAX(News_published_time)`` from the ``articles`` table.

    Args:
        db_path: Optional path of the SQLite database to read. When omitted,
            the module-level ``DB_PATH`` is used.

    Returns:
        A ``(latest_timestamp, incremental)`` tuple:

        * ``latest_timestamp`` -- ``datetime`` truncated to midnight
          (hour, minute, second and microsecond set to 0). When the table
          holds no timestamp, the fallback ``2025-07-25T00:00:00`` is used.
        * ``incremental`` -- ``True`` when a stored timestamp was found,
          ``False`` when the fallback was used.

    Raises:
        sqlite3.OperationalError: if the ``articles`` table does not exist.
        ValueError: if the stored value is not an ISO-8601 timestamp.
    """
    path = DB_PATH if db_path is None else db_path

    # sqlite3's own context manager only commits/rolls back; closing() is what
    # actually releases the connection when the block ends.
    with closing(sqlite3.connect(path)) as conn:
        newest_raw = conn.execute(
            "SELECT MAX(News_published_time) FROM articles"
        ).fetchone()[0]

    # MAX() yields NULL (None) on an empty table; an empty string is treated
    # the same way, i.e. as "nothing loaded yet".
    incremental = bool(newest_raw)
    if not incremental:
        newest_raw = '2025-07-25T00:00:00'

    latest_timestamp = datetime.fromisoformat(newest_raw).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    return (latest_timestamp, incremental)
    
def upsert_articles(filtered_articles, db_path=None):
    """Insert articles into the ``articles`` table, updating existing rows.

    Rows are matched on ``Article_id``; on a conflict every other column is
    overwritten with the incoming value and ``Processed_at`` is refreshed to
    ``CURRENT_TIMESTAMP``. All rows are written in a single transaction, so a
    failure rolls the whole batch back.

    Args:
        filtered_articles: Iterable of mappings (anything with ``.get``) that
            provide ``Article_id``, ``News_link``, ``News_title``,
            ``Author_name``, ``News_published_time`` and ``Source_name``.
            A missing key is stored as NULL. ``None`` or an empty sequence is
            a no-op.
        db_path: Optional path of the SQLite database to write. When omitted,
            the module-level ``DB_PATH`` is used.

    Raises:
        sqlite3.Error: if the table is missing or a row violates a constraint.
    """
    if not filtered_articles:
        return

    path = DB_PATH if db_path is None else db_path

    # closing() releases the connection; the inner ``conn`` context manager
    # commits on success and rolls back if any row fails.
    with closing(sqlite3.connect(path)) as conn, conn:
        # Built once for the whole batch instead of once per article.
        sql = """
            INSERT INTO articles (Article_id,  News_link, News_title, Author_name, News_published_time, Source_name, Processed_at)
            VALUES (?, ?,  ?, ?, ?, ?, CURRENT_TIMESTAMP)
            ON CONFLICT(Article_id) DO UPDATE SET
                News_link = excluded.News_link,
                News_title = excluded.News_title,
                Author_name = excluded.Author_name,
                News_published_time = excluded.News_published_time,
                Source_name = excluded.Source_name,
                Processed_at = CURRENT_TIMESTAMP
            """
        rows = [
            (
                article.get('Article_id'),
                article.get('News_link'),
                article.get('News_title'),
                article.get('Author_name'),
                article.get('News_published_time'),
                article.get('Source_name'),
            )
            for article in filtered_articles
        ]
        conn.executemany(sql, rows)

def query_topn_articles(n = 5, db_path=None):
    """Print the ``n`` most recently published articles, newest first.

    Each row is printed as a tuple of ``(Article_id, News_link, News_title,
    Author_name, News_published_time, Source_name, Processed_at)``.

    Args:
        n: Maximum number of rows to print. It is bound as a query parameter
            rather than formatted into the SQL text.
        db_path: Optional path of the SQLite database to read. When omitted,
            the module-level ``DB_PATH`` is used.

    Returns:
        None. The rows are only printed.
    """
    path = DB_PATH if db_path is None else db_path

    # closing() releases the connection; sqlite3's own context manager would
    # only end the transaction and leave the handle open.
    with closing(sqlite3.connect(path)) as conn:
        cursor = conn.cursor()

        # Select the newest articles; LIMIT is a bound parameter so the value
        # of ``n`` can never alter the statement itself.
        cursor.execute("""SELECT Article_id, News_link, News_title,Author_name, News_published_time, Source_name, Processed_at FROM articles
                        ORDER BY News_published_time DESC LIMIT ?""", (n,))

        # Display every returned row.
        for row in cursor.fetchall():
            print(row)


In [56]:
from PhocusWire.PhocusWireSource import PhocuswireScraper
from Skift.SkiftSource import SkiftScraper

In [58]:
if __name__ == "__main__":
    # Work out where to resume from: the start of the newest stored day, or
    # the fallback date when the table has no timestamp yet.
    latest_timestamp, is_incremental = get_latest_news_time()
    print(f"{latest_timestamp}")

    if not is_incremental:
        print("Latest record timestamp not found in database.")
        print("Initiating full load...")
    else:
        print("Initiating increment load...")
        print("Latest record time stamp present in database : ",latest_timestamp)

    # Run every source in turn (Skift first, then PhocusWire) and pool the
    # articles each one returns for the cutoff timestamp.
    extracted_articles = []
    for scraper_cls in (SkiftScraper, PhocuswireScraper):
        extracted_articles.extend(scraper_cls().extract_articles(latest_timestamp))

    print("Total articles extracted : ", len(extracted_articles))

    # Persist the pooled batch.
    upsert_articles(extracted_articles)

  


2025-08-01 00:00:00
Initiating increment load...
Latest record time stamp present in database :  2025-08-01 00:00:00


NameError: name 'generate_article_id' is not defined

In [256]:
# Print the 18 most recently published articles, newest first.
query_topn_articles(18)

('b6180012cdfcaab01451bded2196d26c', 'https://skift.com/2025/08/01/from-concur-to-spotnana-steve-singh-on-how-ai-could-fix-corporate-travel/', 'From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel', "Sean O'Neill", '2025-08-01T17:19:21', 'Skift', '2025-08-02 21:18:26')
('d030436466546bcf23aa4befbf4d08b6', 'https://skift.com/2025/08/01/delta-says-it-will-not-use-ai-to-target-customers/', 'Delta Responds to AI-Pricing Backlash: No ‘Individualized Prices Based on Personal Data’', 'Meghna Maharishi', '2025-08-01T15:05:14', 'Skift', '2025-08-02 21:18:26')
('8db7ffa1ba14adc8dd8348e7f5d1923d', 'https://skift.com/2025/08/01/u-s-dollar-slide-hurts-accor-minor-and-melia/', 'U.S. Dollar Slide Hurts Accor, Minor, and Meliá', 'Luke Martin', '2025-08-01T13:53:29', 'Skift', '2025-08-02 21:18:26')
('3fa30e2c6eee18976f224053633c1a27', 'https://skift.com/2025/08/01/winners-losers-and-lots-of-premium-seats-europes-airline-scorecard/', 'Winners, Losers, and Lots of Premium Seats: Euro

In [254]:
# Report how many rows the articles table currently holds.
# closing() releases the connection when the block ends; sqlite3's own context
# manager only commits or rolls back and leaves the connection open.
with closing(sqlite3.connect(DB_PATH)) as conn:
    cursor = conn.cursor()

    # Count every row in the table.
    cursor.execute("""SELECT Count(*) FROM articles
                    """)

    # An aggregate without GROUP BY returns exactly one row, so fetchone() is
    # enough; it is printed as a one-element tuple.
    print(cursor.fetchone())

(97,)
