In [3]:
import os
import json
import re
import requests
from urllib.parse import urlparse
import concurrent.futures
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from datetime import datetime
import dateutil.parser
import warnings
from dateutil.tz import gettz
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Silence the warning BeautifulSoup emits under the XMLParsedAsHTMLWarning
# category, so it does not flood the output while many pages are processed.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Timezone abbreviations handed to dateutil (via ``tzinfos=``) so that they
# resolve to a concrete zone. Note that 'IST' is resolved as India here.
tzinfos = {
    abbreviation: gettz(zone_name)
    for abbreviation, zone_name in (
        ('EDT', 'America/New_York'),
        ('EST', 'America/New_York'),
        ('PST', 'America/Los_Angeles'),
        ('PDT', 'America/Los_Angeles'),
        ('IST', 'Asia/Kolkata'),
        ('PT', 'America/Los_Angeles'),
    )
}

def parse_date(date_str):
    """Normalise a free-form date string to ``YYYY-MM-DD``.

    Parsing is delegated to ``dateutil.parser.parse`` in fuzzy mode, with the
    module-level ``tzinfos`` mapping supplying timezone abbreviations.

    Args:
        date_str: Candidate date text (a meta tag value, a regex match, ...).
            Empty or ``None`` values are accepted and yield ``None``.

    Returns:
        The date formatted with ``'%Y-%m-%d'``, or ``None`` when the input is
        empty or cannot be parsed.
    """
    if not date_str:
        return None
    try:
        parsed = dateutil.parser.parse(date_str, fuzzy=True, tzinfos=tzinfos)
        return parsed.strftime('%Y-%m-%d')
    except Exception:
        # Deliberately best-effort: candidates come from scraped pages, so any
        # parsing failure just means "not a date". Unlike a bare ``except``,
        # this still lets KeyboardInterrupt / SystemExit propagate.
        return None

def calculate_weight(published_date):
    """Map a publication date to a recency weight.

    Args:
        published_date: Date string in ``'%Y-%m-%d'`` format, or a falsy
            value (``None`` / ``''``) when the date is unknown.

    Returns:
        ``0.5`` for an unknown date; otherwise ``1.0`` for an age of at most
        30 days, ``0.8`` up to 365 days, ``0.6`` up to 3 * 365 days and
        ``0.4`` for anything older.

    Raises:
        ValueError: If ``published_date`` is non-empty but not in
            ``'%Y-%m-%d'`` format.
    """
    if not published_date:
        # Unknown dates get a neutral, middle-of-the-road weight.
        return 0.5

    published = datetime.strptime(published_date, '%Y-%m-%d')
    age_in_days = (datetime.now() - published).days

    # (maximum age in days, weight), checked from most to least recent.
    tiers = (
        (30, 1.0),       # very recent (last month)
        (365, 0.8),      # within the last year
        (3 * 365, 0.6),  # within the last 3 years
    )
    for max_age, weight in tiers:
        if age_in_days <= max_age:
            return weight

    # Older than 3 years.
    return 0.4

# Values of a <meta> tag's property / name / itemprop attribute that mark a
# publication date.
_DATE_META_KEYS = (
    'article:published_time',
    'og:published_time',
    'pubdate',
    'datePublished',
    'date',
)

# Date-like substrings searched for in the page text. Inside a raw string a
# single backslash is already the regex escape; doubling it would match a
# literal backslash and the patterns would never hit a real date.
_TEXT_DATE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\d{4}-\d{2}-\d{2}',
    r'\d{2}/\d{2}/\d{4}',
    r'\d{2}-\d{2}-\d{4}',
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
    r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b',
))

# Date-like fragments searched for in the URL itself (group 1 is the date).
_URL_DATE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'/(\d{4}/\d{2}/\d{2})/',
    r'(\d{4}-\d{2}-\d{2})',
    r'(\d{2}-\d{2}-\d{4})',
))


def extract_date(url, html_content, content_type):
    """Guess the publication date of a fetched document.

    Candidates are gathered from, in order: ``<meta>`` tags, JSON-LD
    ``<script>`` blocks, ``<time>`` tags, date-looking substrings of the page
    text, and date-looking fragments of the URL. Every candidate is
    normalised with ``parse_date``.

    Args:
        url: URL the content was fetched from.
        html_content: Response body as text.
        content_type: Value of the ``Content-Type`` header; when it contains
            ``'xml'`` the body is parsed with the ``'xml'`` parser, otherwise
            with ``'lxml'``.

    Returns:
        The oldest successfully parsed date as ``'YYYY-MM-DD'``, or ``None``
        when no candidate could be parsed.
    """
    parser = 'xml' if 'xml' in content_type.lower() else 'lxml'
    soup = BeautifulSoup(html_content, parser)

    candidates = []

    # 1. Meta tags. Besides ``property`` (Open Graph style) also look at
    #    ``name`` and ``itemprop``, where 'date' / 'pubdate' /
    #    'datePublished' are normally declared.
    for tag in soup.find_all('meta'):
        keys = (tag.get('property'), tag.get('name'), tag.get('itemprop'))
        if any(key in _DATE_META_KEYS for key in keys):
            candidates.append(tag.get('content'))

    # 2. JSON-LD blocks. A block may hold a single object or a list of them.
    for script in soup.find_all('script', type='application/ld+json'):
        if not script.string:
            continue
        try:
            data = json.loads(script.string)
        except ValueError:
            # Malformed JSON-LD is common on scraped pages; skip the block.
            continue
        entries = data if isinstance(data, list) else [data]
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            found = (entry.get('datePublished')
                     or entry.get('dateCreated')
                     or entry.get('dateModified'))
            if found:
                candidates.append(found)

    # 3. <time> tags: prefer the machine-readable attribute over the text.
    for tag in soup.find_all('time'):
        if tag.has_attr('datetime'):
            candidates.append(tag['datetime'])
        elif tag.string:
            candidates.append(tag.string)

    # 4. Date patterns in the page text.
    text = soup.get_text()
    for pattern in _TEXT_DATE_PATTERNS:
        candidates.extend(pattern.findall(text))

    # 5. Date embedded in the URL.
    for pattern in _URL_DATE_PATTERNS:
        match = pattern.search(url)
        if match:
            candidates.append(match.group(1))

    # Normalise each candidate once; unparseable ones come back as None.
    valid_dates = [parsed for parsed in map(parse_date, candidates) if parsed]

    # 'YYYY-MM-DD' strings sort chronologically, so min() is the oldest date.
    return min(valid_dates) if valid_dates else None

# Registered domains that are labelled without fetching the page.
_KNOWN_DOMAINS = {
    'twitter.com': 'Twitter',
    'x.com': 'Twitter',
    'github.com': 'GitHub',
    'youtube.com': 'YouTube',
    'youtu.be': 'YouTube',
    'linkedin.com': 'LinkedIn',
    'facebook.com': 'Facebook',
    'fb.com': 'Facebook',
    'instagram.com': 'Instagram',
    'reddit.com': 'Reddit',
    'medium.com': 'Medium',
    'stackoverflow.com': 'Stack Overflow',
    'wikipedia.org': 'Wikipedia',
    'amazon.com': 'Amazon',
    'dropbox.com': 'Dropbox',
}


def _match_known_domain(hostname):
    """Return the label for *hostname* if it is a known domain or a subdomain of one."""
    for known_domain, site_type in _KNOWN_DOMAINS.items():
        # Compare on label boundaries: a plain substring test would treat
        # 'dropbox.com' or 'netflix.com' as 'x.com'.
        if hostname == known_domain or hostname.endswith('.' + known_domain):
            return site_type
    return None


def analyze_url(url):
    """Classify a URL and, for fetched pages, guess its publication date.

    URLs on a known domain (or one of its subdomains) are labelled directly
    and never fetched. Everything else is downloaded with a retrying session
    and classified by its ``Content-Type`` header; the body and URL are
    passed to ``extract_date``.

    Args:
        url: The URL to analyse.

    Returns:
        A ``(url_type, published_date)`` tuple. ``url_type`` is a site label
        (e.g. ``'GitHub'``), a content kind (``'Web Page'``, ``'PDF'``,
        ``'Image'``, ``'API'``, ``'Document'``, ``'Unknown'``) or
        ``'Error: <message>'`` when anything fails. ``published_date`` is a
        ``'YYYY-MM-DD'`` string or ``None``; it is always ``None`` for known
        domains and errors.
    """
    try:
        # ``hostname`` drops any port / userinfo and is already lower-cased;
        # it is None for URLs without a network location.
        hostname = (urlparse(url).hostname or '').lower().rstrip('.')

        site_type = _match_known_domain(hostname)
        if site_type is not None:
            return site_type, None

        # Retry transient server errors with a short backoff.
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

        # The context manager closes the session (and its pooled connections)
        # even when the request raises.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=retries))
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(url, headers=headers, timeout=10)
            content_type = response.headers.get('Content-Type', '').lower()
            body = response.text

        published_date = extract_date(url, body, content_type)

        if 'text/html' in content_type or 'application/xhtml+xml' in content_type:
            return 'Web Page', published_date
        elif 'application/pdf' in content_type:
            return 'PDF', published_date
        elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif']):
            return 'Image', published_date
        elif 'application/json' in content_type:
            return 'API', published_date
        elif any(doc_type in content_type for doc_type in ['msword', 'vnd.openxmlformats-officedocument', 'vnd.ms-excel']):
            return 'Document', published_date
        else:
            return 'Unknown', published_date
    except Exception as e:
        # Boundary of the per-URL work: report the failure as the URL type so
        # one bad link does not abort the whole batch.
        return f'Error: {str(e)}', None

def process_url(url):
    """Build the metadata record for one hyperlink.

    Args:
        url: The URL string to analyse. A record previously produced by this
            function (a dict with a ``'url'`` key) is also accepted, so a
            JSON file that was already processed can be processed again
            without nesting records inside each other.

    Returns:
        A dict with the keys ``'url'``, ``'url_type'``, ``'published_date'``
        and ``'weight'``.
    """
    if isinstance(url, dict):
        # Already-processed entry: analyse the URL it wraps.
        url = url.get('url')

    url_type, published_date = analyze_url(url)
    weight = calculate_weight(published_date)
    return {
        "url": url,
        "url_type": url_type,
        "published_date": published_date,
        "weight": weight
    }

def update_json_with_dates(json_file):
    """Rewrite a JSON file, replacing its hyperlinks with processed records.

    The file's ``'hyperlinks'`` list (empty when the key is missing) is
    mapped through ``process_url`` on a pool of 10 worker threads, and the
    file is then overwritten in place with the results.

    Args:
        json_file: Path of the JSON file to update.
    """
    with open(json_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    links = document.get('hyperlinks', [])

    # executor.map yields results in input order, so records line up with
    # the original links.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        processed = [*pool.map(process_url, links)]

    document['hyperlinks'] = processed

    with open(json_file, 'w', encoding='utf-8') as target:
        json.dump(document, target, indent=2)

    print(f"Updated JSON file: {json_file}")

def process_all_json_files(directory):
    """Process every ``.json`` file in a directory and print a summary.

    Each file is updated in place via ``update_json_with_dates`` and then
    re-read to count its hyperlinks and how many of them carry a published
    date. Progress is printed after each file and totals at the end.

    Args:
        directory: Directory whose ``*.json`` entries are processed
            (not recursive).
    """
    json_files = [f for f in os.listdir(directory) if f.endswith('.json')]
    total_files = len(json_files)
    total_urls = 0
    total_dates_extracted = 0

    for processed_files, json_file in enumerate(json_files, start=1):
        full_path = os.path.join(directory, json_file)
        update_json_with_dates(full_path)

        # Count URLs and extracted dates from the freshly written file.
        with open(full_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        urls = data.get('hyperlinks', [])
        total_urls += len(urls)
        total_dates_extracted += sum(1 for url in urls if url.get('published_date'))

        print(f"Processed {processed_files}/{total_files} files")

    # "\n" (not "\\n") so the summary is preceded by a blank line instead of
    # a literal backslash-n.
    print(f"\nTotal JSON files processed: {total_files}")
    print(f"Total URLs processed: {total_urls}")
    print(f"Total URLs with extracted dates: {total_dates_extracted}")

# Directory containing JSON files.
# NOTE(review): inside a raw string every `\\` is two literal backslashes, so
# this path contains doubled separators (they show up in the printed paths).
# The run evidently works with them; left unchanged here, confirm before
# normalising.
json_directory = r'C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown'

# Process all JSON files in the directory, but only when this code is run
# directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    process_all_json_files(json_directory)

Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Agentic Metaverse for Global Creatives.md.meta.json
Processed 1/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Agentic Mycelia.md.meta.json
Processed 2/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Agents.md.meta.json
Processed 3/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\AI Adoption.md.meta.json
Processed 4/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\AI Companie

Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Digital Society Surveillance.md.meta.json
Processed 58/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Distributed Identity.md.meta.json
Processed 59/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\ecash.md.meta.json
Processed 60/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Education.md.meta.json
Processed 61/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Energy and Powe

Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\NVIDIA Omniverse.md.meta.json
Processed 112/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Octave Multi Model Laboratory.md.meta.json
Processed 113/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Open Generative AI tools.md.meta.json
Processed 114/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\OpenAI.md.meta.json
Processed 115/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdow

Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Stable Diffusion.md.meta.json
Processed 151/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\State of the art in AI.md.meta.json
Processed 152/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\State Space and Other Approaches.md.meta.json
Processed 153/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown\\MetaJson markdown\Suggested Reading Order.md.meta.json
Processed 154/167 files
Updated JSON file: C:\\Users\\lolic\\OneDrive\\Desktop\\work stuff\\MediaCity Immersive Innovation Hub(Dreamlab)\\WebScraping\\markdown