In [2]:
# import necessary libraries

from bs4 import BeautifulSoup # To parse and extract data from HTML
import requests # To send HTTP requests to the website
import time
import datetime

import smtplib

In [5]:
# Fetch the DOGOnews front page and collect the article titles found in its <h2> headings.

# URL of the website we want to scrape
URL = "https://www.dogonews.com/"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

# Find Your User-Agent: https://httpbin.org/get
# Set headers to simulate a real browser request (some websites block non-browser traffic)
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}

# Send an HTTP GET request to the URL, passing in the headers
page = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

# Stop here with requests.HTTPError on a 4xx/5xx response instead of
# scraping an error page as if it were the article list.
page.raise_for_status()

# Parse the page content using BeautifulSoup (HTML parsing)
soup1 = BeautifulSoup(page.content, "html.parser")

# Use prettify() to get a pretty-formatted HTML string, then parse it again with BeautifulSoup
# This step isn't strictly necessary but it can make the HTML more readable
soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

# Find all <h2> tags on the page; the loop below looks for an <a> tag inside each one
headings = soup2.find_all('h2')

# initialize a list to store all the article titles of the day
titles = []

# Loop through each <h2> tag found on the page
for heading in headings:
    # Within each <h2> tag, try to find the <a> tag that contains the article title
    parsedHtml = heading.find('a')

    # If an <a> tag is found (i.e., the article title exists)
    if parsedHtml:
        # prettify() puts newlines and indentation around every tag, so collapse
        # every run of whitespace (not only at the ends) into a single space.
        title = " ".join(parsedHtml.get_text().split())
        # add the article title to the titles list
        titles.append(title)
        # Print the article title to the console
        print(title)
    else:
        # This <h2> has no <a> tag inside it, so there is no title to record
        print("No Text found.")


Oops! Kentucky 2nd Grader Accidentally Orders Over 70,000 Lollipops!
Thriving Ecosystem Discovered Beneath Antarctic Ice
Why The European Space Agency Paid Volunteers Thousands To Lie Flat
No Text found.
Mia DaPonte Is America's Youngest Female Master Scuba Diver
Cardinal Robert Prevost Makes History As First American Pope
The Ice Bucket Challenge Returns With A New Purpose
The Papal Conclave Explained
Philadelphia Zoo Celebrates Birth Of Critically Endangered Tortoises
Get Ready For Mother's Day!
Researchers May Have Found A New Color!
Sharks May Not Be Silent After All!
Baseball’s New Power Weapon: The Torpedo Bat
Free Comic Book Day Is On May 3rd!
No Text found.


In [4]:
# Record the collection date so the exported rows can be stamped with it

import datetime

# Current local calendar date (a datetime.date, without a time-of-day part)
today = datetime.datetime.now().date()

# Show the date in ISO format (YYYY-MM-DD), the same text str(today) produces
print(today.isoformat())

2025-05-17


In [6]:
import csv


# Header for the CSV file (names of the columns)
header = ['No.', 'Articles', 'Date']

# Number the articles 1..len(titles).
# The numbering must be sized from `titles` (the rows actually written), not
# from `headings`: headings without a link produce no title, and a range of
# 1..len(headings)-1 is one short whenever every heading does have a title,
# which made zip() silently drop the last article.
No = list(range(1, len(titles) + 1))

# Not used when writing the file below; retained in case other cells read it.
data = [No, titles, today]

# Open the CSV file in write mode (newline='' lets the csv module control line endings)
with open('DogoNewsArticlesList.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)

    # Write the header row to the CSV file
    writer.writerow(header)

    # Write one row per article: article number, title, and collection date
    writer.writerows([num, title, today] for num, title in zip(No, titles))