In [1]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime
import hashlib
import os


In [3]:
# Location of the SQLite database that stores the ingested articles.
# The directory can be overridden with the NEWS_DB_DIR environment variable so
# the notebook is not tied to one machine; when the variable is unset or empty
# the original hard-coded directory is used.
DB_DIR = os.environ.get("NEWS_DB_DIR") or r"C:/Users/HP/Documents/repos/news_ingestion_data_pipeline/data"
DB_FILE = "articles.db"
DB_PATH = os.path.join(DB_DIR, DB_FILE)

In [99]:
def get_latest_news_time(db_path=None, default_timestamp='2025-07-25T00:00:00'):
    """
    Return the newest stored publication timestamp and the load mode.

    Args:
        db_path (str | None): SQLite file to read. When None, the module-level
            DB_PATH is used.
        default_timestamp (str): Value returned when the articles table holds
            no publication timestamp.

    Returns:
        tuple: (latest_timestamp, incremental). When MAX(News_published_time)
        is non-empty, latest_timestamp is that value and incremental is True;
        otherwise latest_timestamp is default_timestamp and incremental is
        False.
    """
    # sqlite3's connection context manager only commits or rolls back; it does
    # not close the connection, so close it explicitly.
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        row = conn.execute("SELECT MAX(News_published_time) FROM articles").fetchone()
    finally:
        conn.close()

    latest_timestamp = row[0]
    if latest_timestamp:
        return (latest_timestamp, True)
    return (default_timestamp, False)
    
# Display the (latest_timestamp, incremental) tuple currently derived from the database.
get_latest_news_time()

('2025-08-01T17:19:21', True)

In [100]:
def parse_date(date_str, source= 'skift'):
    """
    Parse a publication date string into a datetime object.

    Args:
        date_str (str): Date text. For source "Phocusewire" the expected form
            is like 'July 28, 2025' ("%B %d, %Y"). For any other source it
            must be an ISO 8601 string accepted by datetime.fromisoformat,
            e.g. '2025-07-29T11:21:16'. Surrounding whitespace is ignored.
        source (str): Name of the site the string came from; only the exact
            value "Phocusewire" selects the long-date format.

    Returns:
        datetime | None: The parsed value, or None when date_str is not a
        string or does not match the expected format.
    """
    if not isinstance(date_str, str):
        return None

    # Strip for both formats so a stray newline or space does not cause a
    # silent None on the ISO path.
    cleaned = date_str.strip()
    try:
        if source == "Phocusewire":
            return datetime.strptime(cleaned, "%B %d, %Y")
        return datetime.fromisoformat(cleaned)
    except ValueError:
        return None
    
def generate_article_id(url):
    """
    Derive an article identifier from its URL.

    Args:
        url (str): Article URL.

    Returns:
        str: Hexadecimal MD5 digest of the UTF-8 encoded URL.
    """
    digest = hashlib.md5()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()



def datetime_to_iso_with_time(dt):
    """
    Format a datetime as an ISO 8601 string with second precision.

    Both the date and the time of day are taken from dt. Any timezone
    information attached to dt is not included in the output.

    Args:
        dt (datetime | None): Value to format.

    Returns:
        str | None: String in 'YYYY-MM-DDTHH:MM:SS' format, or None when dt
        is None (e.g. an article whose date could not be parsed).
    """
    if dt is None:
        return None
    return dt.strftime("%Y-%m-%dT%H:%M:%S")

In [119]:
date_str = "2025-07-29T11:21:16-04:00"
def drop_timezone(date_str):
    """
    Re-format an ISO 8601 string without its UTC offset.

    The date and time are kept exactly as written in the input; the value is
    not shifted to another timezone.

    Args:
        date_str (str): ISO 8601 string accepted by datetime.fromisoformat.

    Returns:
        str: The same date and time in 'YYYY-MM-DDTHH:MM:SS' format.
    """
    parsed = datetime.fromisoformat(date_str)
    return f"{parsed:%Y-%m-%dT%H:%M:%S}"

drop_timezone(date_str)

'2025-07-29T11:21:16'

In [120]:
# Round-trip check for the ISO path: strip the UTC offset, parse the result
# with parse_date's default (ISO) branch, then format it back to a string.
ts = drop_timezone("2025-07-29T11:21:16-04:00")
print(ts)
ts_parsed = parse_date(ts)
print(ts_parsed)
datetime_to_iso_with_time(ts_parsed)

2025-07-29T11:21:16
2025-07-29 11:21:16


'2025-07-29T11:21:16'

In [121]:
# Same check for the long-date format, which parse_date selects when the
# source is "Phocusewire"; the input carries no time of day.
ts = "August 1, 2025"
print(ts)
ts_parsed = parse_date(ts,"Phocusewire")
print(ts_parsed)
datetime_to_iso_with_time(ts_parsed)

August 1, 2025
2025-08-01 00:00:00


'2025-08-01T00:00:00'

In [103]:
def skift_web_scraping(last_ingested_date, max_page=15, request_timeout=30):
    """
    Scrape Skift's paginated news listing until an article older than the
    cutoff is reached.

    Pages are requested in increasing order starting at page 1, and the crawl
    stops at the first article published strictly before the cutoff. This
    assumes the listing is ordered newest first.

    Args:
        last_ingested_date (str): ISO 8601 timestamp of the newest article
            already stored; it is parsed with parse_date. An article with
            exactly this timestamp is collected again.
        max_page (int): Highest listing page number to request.
        request_timeout (float): Seconds to wait for each HTTP response.

    Returns:
        list[dict]: One dict per collected article with the keys Article_id,
        News_title, News_link, Author_name, News_published_time and
        Source_name. News_published_time is None when no date could be read.
    """
    base_url = "https://skift.com/news/"
    cutoff = parse_date(last_ingested_date)
    collected_articles = []
    page = 1

    while page <= max_page:
        url = f"{base_url}page/{page}/"
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Stopping due to bad status: {response.status_code} at page {page}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("article")

        if not articles:
            print(f"No articles found on page {page}, stopping.")
            break

        stop_paging = False

        for article in articles:
            link_tag = article.select_one("h3.c-tease__title a")
            news_url = link_tag.get('href') if link_tag else None
            if not news_url:
                # The article id is derived from the URL, so an entry without
                # a link cannot be stored.
                print("Article without link found, skipping.")
                continue

            article_id = generate_article_id(news_url)
            headline = link_tag.text.strip()
            print(headline)
            author_tag = article.select_one("div.c-tease__byline a.underline")
            author_name = author_tag.text.strip() if author_tag else None

            time_tag = article.select_one("div.c-tease__byline time")
            raw_time = time_tag.get("datetime") if time_tag else None
            try:
                news_time = parse_date(drop_timezone(raw_time)) if raw_time else None
            except ValueError:
                # drop_timezone raises on a malformed timestamp; treat it as undated.
                news_time = None

            if news_time:
                # Stop once an article strictly older than the cutoff is seen.
                # An equal timestamp is kept and deduplicated by the upsert.
                print("News time : ",news_time)
                print("last -ngested time: ", cutoff)
                print("-"*40)
                if cutoff and news_time < cutoff:
                    stop_paging = True
                    print(f"Encountered article dated {news_time} < last ingested {cutoff}, stopping.")
                    break
            else:
                # Undated articles are still collected, with a null publication time.
                print("Article without date found, skipping date check.")

            collected_articles.append({
                "Article_id": article_id,
                "News_title": headline,
                "News_link": news_url,
                "Author_name": author_name,
                "News_published_time": datetime_to_iso_with_time(news_time) if news_time else None,
                "Source_name": "Skift",
            })

        if stop_paging:
            break

        page += 1
        print(page)
    else:
        # Reached only when the loop ends because page exceeded max_page.
        print(f"Reached max_page={max_page}, stopping.")

    print("No. of new articles ingested : ", len(collected_articles))
    return collected_articles

In [104]:
# Read the newest stored timestamp, then scrape Skift using it as the cutoff.
latest_timestamp = get_latest_news_time()
print(latest_timestamp)
# latest_timestamp is a (timestamp, incremental) tuple; only the timestamp is passed on.
extracted_articles = skift_web_scraping(latest_timestamp[0])

('2025-08-01T17:19:21', True)
From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel
News time :  2025-08-01 17:19:21
last -ngested time:  2025-08-01 17:19:21
----------------------------------------
Delta Responds to AI-Pricing Backlash: No ‘Individualized Prices Based on Personal Data’
News time :  2025-08-01 15:05:14
last -ngested time:  2025-08-01 17:19:21
----------------------------------------
Encountered article dated 2025-08-01 15:05:14 < last ingested 2025-08-01 17:19:21, stopping.
No. of new articles ingested :  1


In [105]:
# Sanity check: print any scraped record that has a None field.
# A record is printed once for each of its None-valued keys.
for article in extracted_articles:
    key_list = list(article.keys())
    for key in key_list:
        if article[key] is None:
            print(article)


In [106]:
# Print every scraped record in full.
for article in extracted_articles:
    print(article)

{'Article_id': 'b6180012cdfcaab01451bded2196d26c', 'News_title': 'From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel', 'News_link': 'https://skift.com/2025/08/01/from-concur-to-spotnana-steve-singh-on-how-ai-could-fix-corporate-travel/', 'Author_name': "Sean O'Neill", 'News_published_time': '2025-08-01T17:19:21', 'Source_name': 'Skift'}


In [107]:
def upsert_articles(filtered_articles, db_path=None):
    """
    Insert articles into the articles table, updating rows that already exist.

    A row whose Article_id is already present has its other columns
    overwritten and Processed_at refreshed. All rows are written in a single
    transaction. Each parameter tuple is printed before the write.

    Args:
        filtered_articles (iterable[dict]): Records with the keys Article_id,
            News_link, News_title, Author_name, News_published_time and
            Source_name. Missing keys are stored as NULL.
        db_path (str | None): SQLite file to write. When None, the
            module-level DB_PATH is used.
    """
    sql = """
    INSERT INTO articles (Article_id,  News_link, News_title, Author_name, News_published_time, Source_name, Processed_at)
    VALUES (?, ?,  ?, ?, ?, ?, CURRENT_TIMESTAMP)
    ON CONFLICT(Article_id) DO UPDATE SET
        News_link = excluded.News_link,
        News_title = excluded.News_title,
        Author_name = excluded.Author_name,
        News_published_time = excluded.News_published_time,
        Source_name = excluded.Source_name,
        Processed_at = CURRENT_TIMESTAMP
    """
    # Order must match the placeholders in the INSERT statement above.
    columns = (
        'Article_id',
        'News_link',
        'News_title',
        'Author_name',
        'News_published_time',
        'Source_name',
    )
    rows = [tuple(article.get(column) for column in columns) for article in filtered_articles]
    for params in rows:
        print(params)

    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # not close the connection; the finally clause does that.
        with conn:
            conn.executemany(sql, rows)
    finally:
        conn.close()

In [108]:
# Persist the scraped records; rows with an existing Article_id are updated in place.
upsert_articles(extracted_articles)

('b6180012cdfcaab01451bded2196d26c', 'https://skift.com/2025/08/01/from-concur-to-spotnana-steve-singh-on-how-ai-could-fix-corporate-travel/', 'From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel', "Sean O'Neill", '2025-08-01T17:19:21', 'Skift')


In [109]:
def query_top5_articles(limit=10, db_path=None):
    """
    Print the most recently published articles, newest first.

    Despite the function's name, 10 rows are printed by default; that default
    is kept so existing calls behave as before.

    Args:
        limit (int): Maximum number of rows to print.
        db_path (str | None): SQLite file to read. When None, the
            module-level DB_PATH is used.
    """
    # The sqlite3 connection context manager does not close the connection,
    # so open and close it explicitly.
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.execute(
            """SELECT Article_id, News_link, News_title, Author_name, News_published_time, Source_name, Processed_at
               FROM articles
               ORDER BY News_published_time DESC
               LIMIT ?""",
            (limit,),
        )
        for row in cursor.fetchall():
            print(row)
    finally:
        conn.close()

In [110]:
# Print the most recently published rows stored in the database.
query_top5_articles()

('b6180012cdfcaab01451bded2196d26c', 'https://skift.com/2025/08/01/from-concur-to-spotnana-steve-singh-on-how-ai-could-fix-corporate-travel/', 'From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel', "Sean O'Neill", '2025-08-01T17:19:21', 'Skift', '2025-08-02 14:42:42')
('d030436466546bcf23aa4befbf4d08b6', 'https://skift.com/2025/08/01/delta-says-it-will-not-use-ai-to-target-customers/', 'Delta Responds to AI-Pricing Backlash: No ‘Individualized Prices Based on Personal Data’', 'Meghna Maharishi', '2025-08-01T15:05:14', 'Skift', '2025-08-02 14:40:49')
('8db7ffa1ba14adc8dd8348e7f5d1923d', 'https://skift.com/2025/08/01/u-s-dollar-slide-hurts-accor-minor-and-melia/', 'U.S. Dollar Slide Hurts Accor, Minor, and Meliá', 'Luke Martin', '2025-08-01T13:53:29', 'Skift', '2025-08-02 14:40:49')
('3fa30e2c6eee18976f224053633c1a27', 'https://skift.com/2025/08/01/winners-losers-and-lots-of-premium-seats-europes-airline-scorecard/', 'Winners, Losers, and Lots of Premium Seats: Euro

In [111]:
# Count the rows currently stored in the articles table.
# The connection is closed explicitly because sqlite3's connection context
# manager only manages the transaction, not the connection's lifetime.
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT Count(*) FROM articles")

    # A COUNT query yields a single one-column row, e.g. (80,).
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    conn.close()

(80,)


In [None]:
1) Query the target data store (SQLite) and get the maximum timestamp from the database. - Done
2) If the timestamp is null, return latest_timestamp = '2025-07-01T00:00:00' and incremental = False (meaning: do a full load from this timestamp);
 otherwise return the latest timestamp from the database and incremental = True. - Done
3) Since the website is paginated, query it starting from the latest page and check whether the last news item on each page is older than latest_timestamp;
if yes, stop; otherwise go to the next page and repeat the process.
4) Once all the information has been extracted, merge (upsert) it into the database so that the last page of news does not introduce duplicates.

This will be repeated for the other source too.