# In [None]:
import os
import re
import csv
import requests
from urllib.parse import urlparse
import concurrent.futures
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from datetime import datetime
import dateutil.parser
import json
import warnings
from dateutil.relativedelta import relativedelta
from dateutil.tz import gettz
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Suppress bs4's MarkupResemblesLocatorWarning for the whole process.
# NOTE(review): presumably needed because some fetched bodies or <time> strings
# look like a URL/file path rather than markup -- confirm before removing.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Mapping for unknown timezones
# Handed to dateutil's parser (see parse_date) as `tzinfos` so that these
# abbreviations resolve to a concrete zone. Standard and daylight abbreviations
# of a region share one zone entry, and 'IST' is resolved as India time here.
tzinfos = {
    'EDT': gettz('America/New_York'),
    'EST': gettz('America/New_York'),
    'PST': gettz('America/Los_Angeles'),
    'PDT': gettz('America/Los_Angeles'),
    'IST': gettz('Asia/Kolkata'),
    'PT': gettz('America/Los_Angeles')
}

def parse_date(date_str):
    """Normalise a free-form date string to ``YYYY-MM-DD``.

    The string is parsed with dateutil in fuzzy mode (surrounding non-date
    text is tolerated) using the module-level ``tzinfos`` table to resolve
    timezone abbreviations.

    Args:
        date_str: Candidate date text. ``None`` and empty values are accepted
            and simply yield ``None``.

    Returns:
        The date formatted as ``'%Y-%m-%d'``, or ``None`` when the input is
        empty or cannot be parsed/formatted.
    """
    if not date_str:
        return None
    try:
        parsed = dateutil.parser.parse(date_str, fuzzy=True, tzinfos=tzinfos)
        return parsed.strftime('%Y-%m-%d')
    except Exception:
        # Deliberately best-effort: candidates come from scraped pages and are
        # frequently garbage. Unlike a bare ``except:`` this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return None

# Values that mark a <meta> tag as carrying a publication date. Kept as a tuple
# so the membership test uses plain equality whatever type the attribute has.
_META_DATE_KEYS = ('article:published_time', 'og:published_time', 'pubdate', 'datePublished', 'date')

# Attributes in which the key above may appear: Open Graph style tags use
# ``property``, while plain / microdata tags use ``name`` / ``itemprop``.
_META_KEY_ATTRS = ('property', 'name', 'itemprop')

# Date-looking substrings searched for in the page's visible text.
_TEXT_DATE_PATTERNS = [
    re.compile(r'\d{4}-\d{2}-\d{2}'),
    re.compile(r'\d{2}/\d{2}/\d{4}'),
    re.compile(r'\d{2}-\d{2}-\d{4}'),
    re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'),
    re.compile(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b'),
]

# Date-looking segments searched for in the URL itself (group 1 is the date).
_URL_DATE_PATTERNS = [
    re.compile(r'/(\d{4}/\d{2}/\d{2})/'),
    re.compile(r'(\d{4}-\d{2}-\d{2})'),
    re.compile(r'(\d{2}-\d{2}-\d{4})'),
]


def _ld_json_dates(soup):
    """Return one date value per JSON-LD object embedded in *soup*."""
    found = []
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.string)
        except (TypeError, ValueError):
            # TypeError: script.string is None (empty or multi-child tag);
            # ValueError: malformed JSON (JSONDecodeError subclasses it).
            continue
        # A JSON-LD block may hold a single object or a list of them; look at
        # every object instead of only the first list element.
        items = data if isinstance(data, list) else [data]
        for item in items:
            if not isinstance(item, dict):
                continue
            date = item.get('datePublished') or item.get('dateCreated') or item.get('dateModified')
            if date:
                found.append(date)
    return found


def extract_date(url, html_content, content_type):
    """Find the earliest date mentioned by a fetched document or its URL.

    Candidate dates are gathered from, in order: date-bearing ``<meta>`` tags,
    JSON-LD ``<script>`` blocks, ``<time>`` tags, date-like patterns in the
    document text, and date-like segments of the URL. Every candidate is
    normalised through ``parse_date`` and the smallest result is returned.

    Args:
        url: The URL the content was fetched from.
        html_content: The response body as text.
        content_type: Lower-cased Content-Type header value; if it contains
            ``'xml'`` the body is parsed with BeautifulSoup's ``'xml'``
            parser, otherwise with ``'html.parser'``.

    Returns:
        The oldest candidate as a ``'YYYY-MM-DD'`` string (strings are
        compared lexicographically), or ``None`` if nothing parsed.
    """
    parser_name = 'xml' if 'xml' in content_type else 'html.parser'
    soup = BeautifulSoup(html_content, parser_name)

    candidates = []

    # 1. Check meta tags
    for tag in soup.find_all('meta'):
        if any(tag.get(attr) in _META_DATE_KEYS for attr in _META_KEY_ATTRS):
            candidates.append(tag.get('content'))

    # 2. Check LD+JSON
    candidates.extend(_ld_json_dates(soup))

    # 3. Check time tags (machine-readable attribute preferred over the text)
    for tag in soup.find_all('time'):
        if tag.has_attr('datetime'):
            candidates.append(tag['datetime'])
        elif tag.string:
            candidates.append(tag.string)

    # 4. Look for date patterns in text
    text = soup.get_text()
    for pattern in _TEXT_DATE_PATTERNS:
        candidates.extend(pattern.findall(text))

    # 5. Check URL for date
    for pattern in _URL_DATE_PATTERNS:
        match = pattern.search(url)
        if match:
            candidates.append(match.group(1))

    # Parse each candidate exactly once, skipping missing / non-text values
    # (e.g. a <meta> tag without ``content`` or a non-string JSON-LD value).
    valid_dates = []
    for candidate in candidates:
        if not candidate or not isinstance(candidate, str):
            continue
        normalised = parse_date(candidate)
        if normalised:
            valid_dates.append(normalised)

    # Return the oldest date found
    return min(valid_dates) if valid_dates else None

# Sites that are classified from the host name alone, without fetching the URL.
_KNOWN_DOMAINS = {
    'twitter.com': 'Twitter',
    'x.com': 'Twitter',
    'github.com': 'GitHub',
    'youtube.com': 'YouTube',
    'youtu.be': 'YouTube',
    'linkedin.com': 'LinkedIn',
    'facebook.com': 'Facebook',
    'fb.com': 'Facebook',
    'instagram.com': 'Instagram',
    'reddit.com': 'Reddit',
    'medium.com': 'Medium',
    'stackoverflow.com': 'Stack Overflow',
    'wikipedia.org': 'Wikipedia',
    'amazon.com': 'Amazon',
    'dropbox.com': 'Dropbox',
}


def _classify_known_domain(host):
    """Return the site label for *host* if it is a known domain or a subdomain of one."""
    for known_domain, site_type in _KNOWN_DOMAINS.items():
        # Match on label boundaries only: a plain substring test would label
        # e.g. 'netflix.com' or 'linux.com' as Twitter because of 'x.com'.
        if host == known_domain or host.endswith('.' + known_domain):
            return site_type
    return None


def analyze_url(url):
    """Classify a URL and, when it has to be fetched, extract a publication date.

    URLs whose host is one of the known sites (or a subdomain of one) are
    labelled without any network access. Everything else is downloaded with a
    retrying session and classified by its Content-Type header; the body is
    handed to ``extract_date``.

    Args:
        url: The URL to analyse.

    Returns:
        A ``(url_type, published_date)`` tuple. ``url_type`` is a known-site
        label, one of ``'Web Page'``, ``'PDF'``, ``'Image'``, ``'API'``,
        ``'Document'``, ``'Unknown'``, or ``'Error: <message>'`` if anything
        raised. ``published_date`` is whatever ``extract_date`` returned, and
        ``None`` for known sites and errors.
    """
    try:
        # ``hostname`` is lower-cased and excludes port / credentials.
        host = (urlparse(url).hostname or '').lower()

        site_type = _classify_known_domain(host)
        if site_type is not None:
            return site_type, None

        # Setting up a retry strategy
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

        # The context manager closes the session's connection pool; one
        # session is created per URL, so leaving them open leaks sockets.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=retries))
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(url, headers=headers, timeout=10)
            content_type = response.headers.get('Content-Type', '').lower()
            body = response.text

        published_date = extract_date(url, body, content_type)

        if 'text/html' in content_type or 'application/xhtml+xml' in content_type:
            return 'Web Page', published_date
        elif 'application/pdf' in content_type:
            return 'PDF', published_date
        elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif']):
            return 'Image', published_date
        elif 'application/json' in content_type:
            return 'API', published_date
        elif any(doc_type in content_type for doc_type in ['msword', 'vnd.openxmlformats-officedocument', 'vnd.ms-excel']):
            return 'Document', published_date
        else:
            return 'Unknown', published_date
    except Exception as e:
        # Per-URL boundary: one bad URL must not abort the whole batch, so the
        # failure is reported in the type column instead of being raised.
        return f'Error: {str(e)}', None

def process_url(row):
    """Analyse one input CSV row and return it extended with type and date.

    Args:
        row: A two-item sequence ``(filename, url)``; any other length raises
            ``ValueError`` from the unpacking.

    Returns:
        ``[filename, url, url_type, published_date]`` where the last two
        values come from ``analyze_url(url)``.
    """
    source_name, link = row
    kind, published = analyze_url(link)
    return [source_name, link, kind, published]

def update_csv_with_types_and_dates(input_file, output_file):
    """Copy a CSV of URLs, appending a type and a publication date to each row.

    The first input row is treated as the header and gets the extra columns
    ``'URL Type'`` and ``'Published Date'``. The remaining rows are processed
    concurrently by ``process_url`` on a pool of 10 threads and written in
    their original order.

    An empty input file produces an empty output file instead of raising
    ``StopIteration``, and blank lines in the input are skipped instead of
    aborting the run.

    Args:
        input_file: Path of the CSV to read (UTF-8).
        output_file: Path of the CSV to write (UTF-8); it is overwritten.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # A default avoids StopIteration leaking out when the file is empty.
        header = next(reader, None)
        if header is not None:
            header.extend(['URL Type', 'Published Date'])
            writer.writerow(header)

            # csv.reader yields [] for blank lines; those cannot be unpacked
            # by process_url, so drop them up front.
            rows = [row for row in reader if row]

            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                # executor.map preserves input order in its results.
                results = list(executor.map(process_url, rows))

            writer.writerows(results)

    print(f"Updated CSV file saved as {output_file}")

# Input CSV of (filename, url) rows and the enriched CSV written next to it.
input_csv = r'C:\Users\lolic\OneDrive\Desktop\work stuff\MediaCity Immersive Innovation Hub(Dreamlab)\WebScraping\Sonnet3.5\extracted_urls.csv'
output_csv = r'C:\Users\lolic\OneDrive\Desktop\work stuff\MediaCity Immersive Innovation Hub(Dreamlab)\WebScraping\Sonnet3.5\extracted_urls_with_types_and_dates.csv'

# Run the enrichment, then re-read the result to print a summary.
update_csv_with_types_and_dates(input_csv, output_csv)

url_type_counts = {}  # URL type -> number of rows, in first-seen order
date_count = 0        # rows whose 'Published Date' column is non-empty
with open(output_csv, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # the header row is not data
    for row in reader:
        url_type = row[2]
        if url_type not in url_type_counts:
            url_type_counts[url_type] = 0
        url_type_counts[url_type] += 1
        # Column 3 holds the date; an empty string means none was extracted.
        date_count += 1 if row[3] else 0

print("\nURL Type Summary:")
for url_type in url_type_counts:
    count = url_type_counts[url_type]
    print(f"{url_type}: {count}")

print(f"\nTotal URLs with extracted dates: {date_count}")
