updated code to capture dates from all possible elements

In [1]:
import re
import aiohttp
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse, UnknownTimezoneWarning
from aiohttp import ClientSession
import nest_asyncio
import warnings
from IPython.display import display

# Apply nest_asyncio to allow nested event loops in Jupyter notebooks
nest_asyncio.apply()

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
warnings.filterwarnings("ignore", category=UnknownTimezoneWarning)

def extract_dates_from_html(html_content):
    date_patterns = [
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # Matches dates like 12/31/2020, 12-31-2020
        r'\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b',    # Matches dates like 2020/12/31, 2020-12-31
        r'\b\d{1,2} \w+ \d{4}\b',              # Matches dates like 31 December 2020
        r'\b\w+ \d{1,2}, \d{4}\b',             # Matches dates like December 31, 2020
    ]
    dates = []
    for pattern in date_patterns:
        matches = re.findall(pattern, html_content)
        for match in matches:
            try:
                parsed_date = parse(match, fuzzy=True, tzinfos={"EST": -18000})
                dates.append(parsed_date.isoformat())
            except ValueError:
                continue
    return dates

def extract_published_date(soup, url):
    # Special handling for GitHub URLs
    if 'github.com' in url:
        return extract_github_date(url)
    
    # General meta tags and common class names
    date_patterns = [
        {'name': 'meta', 'attr': 'property', 'value': 'article:published_time'},
        {'name': 'meta', 'attr': 'property', 'value': 'og:published_time'},
        {'name': 'meta', 'attr': 'name', 'value': 'date'},
        {'name': 'meta', 'attr': 'itemprop', 'value': 'datePublished'},
        {'name': 'meta', 'attr': 'property', 'value': 'datePublished'},
        {'name': 'meta', 'attr': 'name', 'value': 'publish_date'},
        {'name': 'meta', 'attr': 'name', 'value': 'PubDate'},
    ]
    
    for pattern in date_patterns:
        tag = soup.find(pattern['name'], {pattern['attr']: pattern['value']})
        if tag and tag.get('content'):
            try:
                return parse(tag['content'], fuzzy=True, tzinfos={"EST": -18000}).isoformat()
            except ValueError:
                continue
    
    # Check for common class names and text locations
    date_classes = ['published-date', 'pub-date', 'date', 'publish-date', 'date-posted', 'posted-date']
    for class_name in date_classes:
        tag = soup.find(class_=class_name)
        if tag:
            try:
                return parse(tag.get_text(), fuzzy=True, tzinfos={"EST": -18000}).isoformat()
            except ValueError:
                continue
    
    # Check for dates near title elements
    title_tag = soup.find(['h1', 'h2', 'h3'])
    if title_tag:
        sibling_text = ' '.join(sibling.get_text() for sibling in title_tag.next_siblings if sibling.name in ['p', 'div', 'span'])
        dates = extract_dates_from_html(sibling_text)
        if dates:
            return dates[0]

    # Check entire text of the article as last resort
    text_content = soup.get_text()
    dates = extract_dates_from_html(text_content)
    return dates[0] if dates else 'No published date found'

async def extract_github_date(url):
    api_url = url.replace("https://github.com/", "https://api.github.com/repos/")
    async with aiohttp.ClientSession() as session:
        async with session.get(api_url) as response:
            data = await response.json()
            if 'created_at' in data:
                return parse(data['created_at']).isoformat()
            return 'No date found'

async def fetch(session, url):
    try:
        async with session.get(url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            date = extract_published_date(soup, url)
            return url, date
    except Exception as e:
        return url, f'Error: {str(e)}'

async def process_urls_async(urls):
    url_dates = []
    async with ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        for task in asyncio.as_completed(tasks):
            url, date = await task
            url_dates.append({'URL': url, 'Published Date': date})
    return url_dates

# Example usage:
csv_file_path = r'C:\Users\lolic\OneDrive\Desktop\work stuff\MediaCity Immersive Innovation Hub(Dreamlab)\WebScraping\extracted_urls.csv'  # Path to your CSV file with URLs
df = pd.read_csv(csv_file_path)
urls = df['URL'].tolist()

# Process URLs asynchronously
url_dates = asyncio.run(process_urls_async(urls))

# Convert the results to a DataFrame for better visualization
df_dates = pd.DataFrame(url_dates)

# Save to a new CSV file
output_file_path = r'C:\Users\lolic\OneDrive\Desktop\work stuff\MediaCity Immersive Innovation Hub(Dreamlab)\WebScraping\url_dates.csv'  # Desired output path
df_dates.to_csv(output_file_path, index=False)

# Display the DataFrame
display(df_dates)
print(f"Published dates have been saved to {output_file_path}")


Unnamed: 0,URL,Published Date
0,https://[[ComfyWorkFlows]].com/,Error: https://[[ComfyWorkFlows]].com/
1,https://[[ComfyWorkFlows]].com/,Error: https://[[ComfyWorkFlows]].com/
2,https://miro.com/app/board/uXjVPzJyAtU=/,No published date found
3,https://openai.com/blog/chatgpt,No published date found
4,https://customgpt.ai,2023-06-09T14:28:29-04:00
...,...,...
2826,https://arxiv.org/pdf/2102.12092.pdf,Error: 'utf-8' codec can't decode byte 0x8f in...
2827,https://arxiv.org/pdf/2309.17444.pdf,Error: 'utf-8' codec can't decode byte 0x8f in...
2828,https://advisor.morganstanley.com/daron.edward...,Error:
2829,https://www.minecraft.net/en-us/article/minecr...,Error:


Published dates have been saved to C:\Users\lolic\OneDrive\Desktop\work stuff\MediaCity Immersive Innovation Hub(Dreamlab)\WebScraping\url_dates.csv
