In [None]:
pip install beautifulsoup4

In [242]:
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime
import hashlib
import os
from requests.exceptions import RequestException
import logging
import time
import random

In [243]:
DB_DIR = r"C:/Users/HP/Documents/repos/news_ingestion_data_pipeline/data"
DB_FILE = "articles.db"
DB_PATH = os.path.join(DB_DIR, DB_FILE)

In [None]:
def get_latest_news_time():
    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.execute("SELECT MAX(News_published_time) FROM articles")
        result = cursor.fetchone()
        if result[0]:
            incremental = True
            latest_timestamp = result[0]
        else:
            incremental = False
            latest_timestamp = '2025-07-25T00:00:00'

        latest_timestamp = datetime.fromisoformat(latest_timestamp).replace(hour=0, minute=0, second=0, microsecond=0)
        return (latest_timestamp, incremental)
    
def upsert_articles(filtered_articles):
    with sqlite3.connect(DB_PATH) as conn:
        for article in filtered_articles:
            sql = """
            INSERT INTO articles (Article_id,  News_link, News_title, Author_name, News_published_time, Source_name, Processed_at)
            VALUES (?, ?,  ?, ?, ?, ?, CURRENT_TIMESTAMP)
            ON CONFLICT(Article_id) DO UPDATE SET
                News_link = excluded.News_link,
                News_title = excluded.News_title,
                Author_name = excluded.Author_name,
                News_published_time = excluded.News_published_time,
                Source_name = excluded.Source_name,
                Processed_at = CURRENT_TIMESTAMP
            """
            params = (
                article.get('Article_id'),
                article.get('News_link'),
                article.get('News_title'),
                article.get('Author_name'),
                article.get('News_published_time'),
                article.get('Source_name')
            )
            cursor = conn.cursor()
            cursor.execute(sql, params)

def query_topn_articles(n = 5):
    # Connect to the SQLite database
    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()

        # Execute a query to select all articles
        cursor.execute(f"""SELECT Article_id, News_link, News_title,Author_name, News_published_time, Source_name, Processed_at FROM articles
                        ORDER BY News_published_time DESC LIMIT {n}""")

        # Fetch all rows returned by the query
        rows = cursor.fetchall()
        # Process and display results
        for row in rows:
            print(row)


In [245]:
def parse_date(date_str, source= 'Skift'):
    """
    Converts date string like 'July 28, 2025' to datetime object.
    Return None if parse fails.
    """
    try:
        if source == "Phocuswire":
            return datetime.strptime(date_str.strip(), "%B %d, %Y")
        else:
            return datetime.fromisoformat(date_str)
    except Exception:
        return None
    
def drop_timezone(date_str):
    date_str = datetime.fromisoformat(date_str)
    return date_str.strftime("%Y-%m-%dT%H:%M:%S")
    
def generate_article_id(url):
    return hashlib.md5(url.encode("utf-8")).hexdigest()


def datetime_to_iso_with_time(dt):
    """
    Convert a datetime object to ISO8601 string with a fixed time part.

    Args:
        dt (datetime): A datetime object (date part used).

    Returns:
        str: Formatted ISO8601 string in 'YYYY-MM-DDTHH:MM:SS' format
    """
    date_part = dt.strftime("%Y-%m-%d")
    time_str = dt.strftime("%H:%M:%S")
    return f"{date_part}T{time_str}"

def fetch_url_with_retries(url, headers = {}, max_retries=3, backoff_factor=1.0, timeout=10):
    for attempt in range(max_retries):
        try:
            if headers: 
                response = requests.get(url, timeout=timeout, headers=headers)
            else:
                response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except RequestException as e:
            wait = backoff_factor * (2 ** attempt)
            print(f"Request failed: {e}. Retrying in {wait:.1f} seconds (Attempt {attempt + 1} of {max_retries})")
            time.sleep(wait)
    print(f"Failed to fetch {url} after {max_retries} attempts.")
    return None

In [246]:
class SkiftScraper:
    def __init__(self, min_delay=1, max_delay=3, max_pages=15, max_retries = 3, backoff_factor= 1.0,timeout = 10 ):
        """
        Initialize the SkiftScraper.
        :param last_ingested_date: datetime.datetime or None, stop scraping older news
        :param min_delay: minimum delay between requests (seconds)
        :param max_delay: maximum delay between requests (seconds)
        :param max_pages: max number of pages to scrape
        """
        self.base_url = "https://skift.com/news/"
        self.headers = {}
        self.sourcename = 'Skift'
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.max_pages = max_pages
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.timeout = timeout
        self.collected_articles = []
        self.seen_article_ids = set()

    def fetch_url_with_retries(self, url):
        for attempt in range(self.max_retries):
            try:
                if self.headers: 
                    response = requests.get(url, timeout=self.timeout, headers=self.headers)
                else:
                    response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
                return response
            except RequestException as e:
                wait = self.backoff_factor * (2 ** attempt)
                print(f"Request failed: {e}. Retrying in {wait:.1f} seconds (Attempt {attempt + 1} of {self.max_retries})")
                time.sleep(wait)
        print(f"Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    def get_page_url(self, page):
        return f"{self.base_url}page/{page}/"

    def extract_articles(self,last_ingested_date):
        page = 1

        while True:
            url = self.get_page_url(page)
            response = self.fetch_url_with_retries(url)

            if not response:
                print(f"Stopping scraping due to repeated request failures at page {page}")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            articles = soup.select("article")

            if not articles:
                print(f"No articles found on page {page}, stopping.")
                break

            stop_paging = False

            for article in articles:
                link_tag = article.select_one("h3.c-tease__title a")
                if not link_tag:
                    print("Article missing title link, skipping.")
                    continue

                news_url = link_tag.get('href')
                if not news_url:
                    print("Article missing href link, skipping.")
                    continue

                article_id = generate_article_id(news_url)
                if article_id in self.seen_article_ids:
                    print(f"Duplicate article {article_id} found, skipping.")
                    continue
                
                headline = link_tag.text.strip() if link_tag else None
                author_tag = article.select_one("div.c-tease__byline a.underline")
                if not author_tag:
                    print(f"Author not available for article id {article_id} and article headline {headline}.")
                author_name = author_tag.text.strip() if author_tag else None

                time_tag = article.select_one("div.c-tease__byline time")            
                news_time = drop_timezone(time_tag.get("datetime")) if time_tag else None
                
                try:
                    news_time = parse_date(news_time)
                except Exception as e:
                    print(f"Error parsing date '{news_time}': {e}")
                    news_time = None
                if news_time :
                    if last_ingested_date and news_time < last_ingested_date:
                        # If last_ingested_date is set and this article is older or equal, stop ingestion
                        stop_paging = True
                        print(f"Encountered article dated {news_time} < last ingested {last_ingested_date}, stopping.")
                        break
                else:
                    # If no date found, you can decide to skip or include
                    print("Article without date found, skipping date check.")
                

                article_data = {
                    "Article_id": article_id,
                    "News_title": headline,
                    "News_link": news_url,
                    "Author_name": author_name,
                    "News_published_time": datetime_to_iso_with_time(news_time),
                    "Source_name": "Skift"
                }
                self.collected_articles.append(article_data)
                self.seen_article_ids.add(article_id)
                
            if stop_paging:
                break

            page += 1
            delay = random.uniform(self.min_delay, self.max_delay)
            print(f"Sleeping for {delay:.1f} seconds before next page request.")
            time.sleep(delay)

        print(f"Total new articles extracted: {len(self.collected_articles)}")
        return self.collected_articles

In [249]:

class PhocuswireScraper:
    def __init__(self, min_delay=1, max_delay=3, max_pages=15, max_retries = 3, backoff_factor= 1.0,timeout = 10 ):
        """
        Initialize the PhocusewireScraper.
        :param min_delay: minimum delay between requests (seconds)
        :param max_delay: maximum delay between requests (seconds)
        :param max_pages: max number of pages to scrape
        """
        self.base_url = "https://www.phocuswire.com"
        self.headers =  {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                                    'Chrome/115.0.0.0 Safari/537.36'
                        }
        self.min_delay = min_delay
        self.source_name = "Phocuswire"
        self.max_delay = max_delay
        self.max_pages = max_pages
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.timeout = timeout
        self.collected_articles = []
        self.seen_article_ids = set()

    def fetch_url_with_retries(self, url):
        for attempt in range(self.max_retries):
            try:
                if self.headers: 
                    response = requests.get(url, timeout=self.timeout, headers=self.headers)
                else:
                    response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
                return response
            except RequestException as e:
                wait = self.backoff_factor * (2 ** attempt)
                print(f"Request failed: {e}. Retrying in {wait:.1f} seconds (Attempt {attempt + 1} of {self.max_retries})")
                time.sleep(wait)
        print(f"Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    def get_page_url(self, page):
        return f"{self.base_url}/Latest-News?pg={page}"

    def extract_articles(self,last_ingested_date):
        page = 1

        while True:
            url = self.get_page_url(page)
            response = self.fetch_url_with_retries(url)

            if not response:
                print(f"Stopping scraping due to repeated request failures at page {page}")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            articles = soup.select("div.article-list  div.item")

            if not articles:
                print(f"No articles found on page {page}, stopping.")
                break

            stop_paging = False

            for article in articles:
                # Extract news date (inside div.author)
                title_tag = article.select_one("a.title")
                if not title_tag:
                    print("Article missing title link, skipping.")
                    continue

                headline = title_tag.get_text(strip=True) if title_tag else None

                if not title_tag['href']:
                    print("Article missing href link, skipping.")
                    continue

                news_url = f"{self.base_url}/{title_tag['href']}" if title_tag and 'href' in title_tag.attrs else None

                article_id = generate_article_id(news_url) if news_url else None
                if article_id in self.seen_article_ids:
                    print(f"Duplicate article {article_id} found, skipping.")
                    continue

                author_span = article.select_one("div.author > span.name")
                if not author_span:
                    print(f"Author not available for article id {article_id} and article headline {headline}.")
                author_name = author_span.get_text(strip=True).replace("By ", "") if author_span else None
                
                # Extract news time from div.author text after the pipe symbol
                author_div = article.select_one("div.author")
                news_time = None
                if author_div:
                    # The text looks like 'By Abby Crotty | July 28, 2025'
                    # We can split by '|' and strip whitespace to get news time
                    parts = author_div.text.split('|')
                    if len(parts) == 2:
                        news_time = parts[1].strip()
                        
                    try:
                        news_time = parse_date(news_time, self.source_name)
                    except Exception as e:
                        print(f"Error parsing date '{news_time}': {e}")
                        news_time = None
                if news_time :
                    if last_ingested_date and news_time < last_ingested_date:
                        # If last_ingested_date is set and this article is older or equal, stop ingestion
                        print(f"News time : {news_time}")
                        print(f"last -ngested time: {last_ingested_date}")

                        stop_paging = True
                        print(f"Encountered article dated {news_time} < last ingested {last_ingested_date}, stopping.")
                        break
                else:
                    # If no date found, you can decide to skip or include
                    print("Article without date found, skipping date check.")

                
                article_data = {
                    "Article_id": article_id,
                    "News_title": headline,
                    "News_link": news_url,
                    "Author_name": author_name,
                    "News_published_time": datetime_to_iso_with_time(news_time),
                    "Source_name": "Phocuswire"
                }

                self.collected_articles.append(article_data)
                self.seen_article_ids.add(article_id)

            if stop_paging:
                break

            page += 1
            delay = random.uniform(self.min_delay, self.max_delay)
            print(f"Sleeping for {delay:.1f} seconds before next page request.")
            time.sleep(delay)

        print(f"Total new articles extracted: {len(self.collected_articles)}")
        return self.collected_articles

In [255]:
if __name__ == "__main__":
    latest_timestamp, is_incremental = get_latest_news_time()
    print(f"{latest_timestamp}")
    if is_incremental:
        print("Initiating increment load...")
        print("Latest record time stamp present in database : ",latest_timestamp)
    else:
        print("Latest record timestamp not found in database.")
        print("Initiating full load...")
    extracted_articles = []
    
    Skriftscraper = SkiftScraper()
    skift_articles = Skriftscraper.extract_articles(latest_timestamp)
    extracted_articles.extend(skift_articles)
    Phocuswirescraper = PhocuswireScraper()
    phocuswire_articles = Phocuswirescraper.extract_articles(latest_timestamp)
    extracted_articles.extend(phocuswire_articles)

    print("Total articles extracted : ", len(extracted_articles))

    upsert_articles(extracted_articles)

  


2025-08-01 00:00:00
Initiating increment load...
Latest record time stamp present in database :  2025-08-01 00:00:00
Sleeping for 1.3 seconds before next page request.
Encountered article dated 2025-07-31 17:48:19 < last ingested 2025-08-01 00:00:00, stopping.
Total new articles extracted: 12
News time : 2025-07-31 00:00:00
last -ngested time: 2025-08-01 00:00:00
Encountered article dated 2025-07-31 00:00:00 < last ingested 2025-08-01 00:00:00, stopping.
Total new articles extracted: 3
Total articles extracted :  15


In [256]:
query_topn_articles(18)

('b6180012cdfcaab01451bded2196d26c', 'https://skift.com/2025/08/01/from-concur-to-spotnana-steve-singh-on-how-ai-could-fix-corporate-travel/', 'From Concur to Spotnana: Steve Singh on How AI Could Fix Corporate Travel', "Sean O'Neill", '2025-08-01T17:19:21', 'Skift', '2025-08-02 21:18:26')
('d030436466546bcf23aa4befbf4d08b6', 'https://skift.com/2025/08/01/delta-says-it-will-not-use-ai-to-target-customers/', 'Delta Responds to AI-Pricing Backlash: No ‘Individualized Prices Based on Personal Data’', 'Meghna Maharishi', '2025-08-01T15:05:14', 'Skift', '2025-08-02 21:18:26')
('8db7ffa1ba14adc8dd8348e7f5d1923d', 'https://skift.com/2025/08/01/u-s-dollar-slide-hurts-accor-minor-and-melia/', 'U.S. Dollar Slide Hurts Accor, Minor, and Meliá', 'Luke Martin', '2025-08-01T13:53:29', 'Skift', '2025-08-02 21:18:26')
('3fa30e2c6eee18976f224053633c1a27', 'https://skift.com/2025/08/01/winners-losers-and-lots-of-premium-seats-europes-airline-scorecard/', 'Winners, Losers, and Lots of Premium Seats: Euro

In [254]:
with sqlite3.connect(DB_PATH) as conn:
    cursor = conn.cursor()

    # Execute a query to select all articles
    cursor.execute("""SELECT Count(*) FROM articles
                    """)

    # Fetch all rows returned by the query
    rows = cursor.fetchall()
    # Process and display results
    for row in rows:
        print(row)

(97,)
