In [None]:
# !pip install internetarchive
# !pip install --upgrade pandas
# !pip install --upgrade requests
# !pip install --upgrade beautifulsoup4
# !pip install -U pip
# !pip install -U pip matplotlib

## WAYBACK MACHINE SCRAPER

Code to retrieve data from the **Wayback Machine** using the *Wayback CDX Server API*: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server




#### Libraries

In [None]:
import requests
import matplotlib.pyplot as plt
import re
import time
import os
import json
import random
from datetime import datetime
from bs4 import BeautifulSoup
import difflib


#### Parameters

In [2]:

# Endpoint of the Wayback CDX Server API; fetch_cdx_data() queries it for the capture list.
CDX_API = "http://web.archive.org/cdx/search/cdx"
# Prefix used to build the URL of an individual archived page (see build_wayback_url()).
BASE_URL = "http://web.archive.org/web"
# The archived page to track is TARGET_DOMAIN/TARGET_PATH.
TARGET_DOMAIN = "facebook.com"
TARGET_PATH = "communitystandards"
# Sent to the CDX API as the "from" and "to" query parameters.
START_YEAR = 2011
END_YEAR = 2025
# Sent to the CDX API as the "limit" query parameter on every request.
FETCH_LIMIT = 50000

# Rate limiting: each request is preceded by a random pause between MIN_DELAY and MAX_DELAY.
MIN_DELAY = 2  # seconds
MAX_DELAY = 5  # seconds


### Load and save (update) progress file

In [3]:

# Root folder for saved captures, diffs and link lists (one sub-folder per capture date).
OUTPUT_DIR = "community_standards_archives"
# JSON file, relative to the working directory, read by load_progress() and written by save_progress().
PROGRESS_FILE = "progress.json"

def load_progress():
    """
    Load the crawl progress from PROGRESS_FILE.

    Returns:
        dict with at least the keys ``"last_timestamp"`` (timestamp string or
        None) and ``"visited_links"`` (list of Wayback URLs already processed).
        Defaults are returned when the file does not exist, cannot be read,
        or does not contain a JSON object; keys missing from the file are
        filled in with their default value.
    """
    progress = {
        "last_timestamp": None,
        "visited_links": []  # to track subpages visited
    }
    if not os.path.exists(PROGRESS_FILE):
        return progress

    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            stored = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError (e.g. a file truncated by an
        # interrupted write); a broken progress file must not abort the crawl.
        print(f"Could not read {PROGRESS_FILE} ({e}). Starting with empty progress.")
        return progress

    if not isinstance(stored, dict):
        print(f"Unexpected content in {PROGRESS_FILE}. Starting with empty progress.")
        return progress

    # Keep whatever the file contains, but guarantee that the expected keys exist.
    progress.update(stored)
    return progress

def save_progress(progress_data):
    """
    Save the current progress to PROGRESS_FILE.

    The data is first written to a temporary file next to PROGRESS_FILE and
    then moved into place with os.replace(), so an interruption or a
    serialization error never leaves a half-written progress file behind.

    Args:
        progress_data: JSON-serializable dict to store.

    Raises:
        TypeError: if progress_data contains values json cannot serialize.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = f"{PROGRESS_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(progress_data, f, indent=2)
        os.replace(tmp_path, PROGRESS_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up after a failed or interrupted write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)



### Fetch data

In [4]:

def fetch_cdx_data(max_retries=5):
    """
    Fetch the CDX records of TARGET_DOMAIN/TARGET_PATH from the Wayback Machine,
    from START_YEAR to END_YEAR, using the resumption key approach.

    Results are requested in pages of at most FETCH_LIMIT rows. When a response
    ends with a resumption key, that key is sent back as ``resumeKey`` to
    request the next page.

    Args:
        max_retries: how many consecutive timeouts are tolerated for a single
            page before giving up.

    Returns:
        list of CDX rows (each a list of strings) with header rows removed.
        If a request fails with a non-timeout error, or the retries are
        exhausted, the rows collected so far are returned.
    """
    all_records = []
    resume_key = None
    consecutive_timeouts = 0

    while True:
        params = {
            "url": f"{TARGET_DOMAIN}/{TARGET_PATH}",
            "from": START_YEAR,
            "to": END_YEAR,
            "output": "json",
            "collapse": "digest",  # Remove exact duplicates
            "showResumeKey": "true",  # request the resumption key (string literal)
            "limit": FETCH_LIMIT
        }
        if resume_key:
            params["resumeKey"] = resume_key

        try:
            response = requests.get(CDX_API, params=params, timeout=30)
            response.raise_for_status()
            # Parsed inside the try block so that an invalid body ends the loop
            # cleanly instead of discarding the records already collected.
            data = response.json()
        except requests.exceptions.Timeout:
            consecutive_timeouts += 1
            if consecutive_timeouts > max_retries:
                print(f"CDX API request timed out {consecutive_timeouts} times in a row. Giving up.")
                break
            print("CDX API request timed out. Retrying in 5 seconds...")
            time.sleep(5)
            continue
        except Exception as e:
            print(f"Error fetching CDX data: {e}")
            break

        consecutive_timeouts = 0
        if not data:
            break

        # Look for a resumption key: an empty row followed by a one-element row
        # at the very end of the data.
        potential_key = None
        rows = data
        if len(data) >= 2:
            if data[-2] == [] and isinstance(data[-1], list) and len(data[-1]) == 1:
                potential_key = data[-1][0]
                rows = data[:-2]

        # JSON output starts with a row of field names ("urlkey", "timestamp", ...).
        # It is dropped on every page, not only the first one, in case the
        # server repeats it on resumed pages.
        if rows and rows[0] and rows[0][0] == "urlkey":
            rows = rows[1:]

        chunk_records = [row for row in rows if row]
        if not chunk_records:
            break

        all_records.extend(chunk_records)

        if potential_key:
            resume_key = potential_key
        else:
            break  # No resume key means no more results

        # Delay to avoid hammering the server
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        time.sleep(delay)

    return all_records



In [5]:
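# Fallback in case the paginated fetch_cdx_data() above doesn't work: a simpler
# single-request version without resumption keys. It is kept inside a string
# literal so it does not replace the function above; remove the surrounding
# triple quotes to use it.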

'''def fetch_cdx_data():
    """Fetch the CDX data from the Wayback Machine for the target URL."""
    params = {
        "url": f"{TARGET_DOMAIN}/{TARGET_PATH}",
        "from": START_YEAR,
        "to": END_YEAR,
        "output": "json",
        "collapse": "digest",  # Remove exact duplicates
    }
    print("Querying CDX API...")
    resp = requests.get(CDX_API, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    # data[0] is the header row from CDX. I skip that.
    # The rest are the captures: [timestamp, original, mime, status, digest, length, ...]
    return data[1:]'''

'def fetch_cdx_data():\n    """Fetch the CDX data from the Wayback Machine for the target URL."""\n    params = {\n        "url": f"{TARGET_DOMAIN}/{TARGET_PATH}",\n        "from": START_YEAR,\n        "to": END_YEAR,\n        "output": "json",\n        "collapse": "digest",  # Remove exact duplicates\n    }\n    print("Querying CDX API...")\n    resp = requests.get(CDX_API, params=params, timeout=30)\n    resp.raise_for_status()\n    data = resp.json()\n    # data[0] is the header row from CDX. I skip that.\n    # The rest are the captures: [timestamp, original, mime, status, digest, length, ...]\n    return data[1:]'

### Handling duplicates 
If there are more than 2 captures in a single day, keep the first and the last capture and pick random samples from the captures in between.
Strategy (counts refer to the in-between captures):
- up to 5 captures: pick 1 random
- up to 10 captures: pick 2 random
- up to 30 captures: pick 3 random
- up to 90 captures: pick 5 random
- up to 9999 captures: pick 10 random

In [6]:

# Sampling tiers as (max number of captures, number of random samples to keep).
# Tiers are checked in order, so thresholds must be ascending.
RANDOM_SAMPLES_BY_COUNT = [(5, 1), (10, 2), (30, 3), (90, 5), (9999, 10)]

def get_num_random_samples(num_captures):
    """
    Given a number of captures, decide how many random samples to pick.

    Args:
        num_captures: how many captures are available to sample from.

    Returns:
        The sample count of the first tier in RANDOM_SAMPLES_BY_COUNT whose
        threshold is >= num_captures. Counts above the largest threshold use
        the last tier, so the result never decreases as num_captures grows.
    """
    for threshold, random_count in RANDOM_SAMPLES_BY_COUNT:
        if num_captures <= threshold:
            return random_count
    # Beyond the last threshold: reuse the largest tier instead of a smaller
    # hard-coded value.
    return RANDOM_SAMPLES_BY_COUNT[-1][1]

def group_by_day(cdx_records):
    """
    Group captures by calendar date.

    Each record is typically:
    ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length", ...]
    The timestamp is read from record[1]; its first 8 characters are the date
    in YYYYMMDD form.

    Records without a parseable timestamp (for example a stray header row or
    an empty row) are skipped and reported, so that one bad row does not abort
    the whole run.

    Args:
        cdx_records: iterable of CDX rows.

    Returns:
        dict mapping datetime.date -> list of the records captured on that day,
        in their original order.
    """
    daily_groups = {}
    skipped = 0
    for record in cdx_records:
        try:
            # Use record[1] as the timestamp, not record[0]
            capture_day = datetime.strptime(record[1][:8], "%Y%m%d").date()
        except (IndexError, TypeError, ValueError):
            skipped += 1
            continue
        daily_groups.setdefault(capture_day, []).append(record)

    if skipped:
        print(f"Skipped {skipped} malformed CDX record(s) without a valid timestamp.")
    return daily_groups

def deduplicate_day(records):
    """
    Reduce the captures of a single day to a manageable sample.

    - With up to 2 captures, all of them are kept.
    - With more than 2, the first and last capture of the day are kept, plus a
      random sample of the captures in between (sample size chosen by
      get_num_random_samples()).

    The input list is not modified.

    Args:
        records: list of CDX rows for one day; record[1] holds the timestamp.

    Returns:
        A new list with the selected records, sorted by timestamp.
    """
    # sorted() instead of list.sort(): the caller's list must stay untouched.
    ordered = sorted(records, key=lambda r: r[1])
    if len(ordered) <= 2:
        return ordered

    middle_records = ordered[1:-1]
    num_random = get_num_random_samples(len(middle_records))
    random_middle = random.sample(middle_records, min(num_random, len(middle_records)))

    deduped = [ordered[0]] + random_middle + [ordered[-1]]
    # random.sample() returns the picks in random order, so sort again.
    deduped.sort(key=lambda r: r[1])
    return deduped



### Extract internal links to follow

In [None]:

def build_wayback_url(timestamp):
    """Return the Wayback Machine URL of the target page for the given capture timestamp."""
    # Appending 'id_' to the timestamp asks for the raw page, ignoring frame-based rewrite
    raw_capture_segment = f"{timestamp}id_"
    url_segments = [BASE_URL, raw_capture_segment, TARGET_DOMAIN, TARGET_PATH]
    return "/".join(url_segments)

def fetch_html(url, max_retries=3):
    """
    Fetch HTML content from a Wayback URL, with a polite delay and a bounded
    number of retries on timeout.

    Args:
        url: the Wayback Machine URL to download.
        max_retries: how many times a timed-out request is retried before
            giving up (so at most max_retries + 1 requests are sent).

    Returns:
        The response body as text, or None if the request failed or kept
        timing out.
    """
    for attempt in range(max_retries + 1):
        # Random pause before every request, including retries.
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        time.sleep(delay)
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.Timeout:
            # Timeout covers both connect and read timeouts.
            if attempt == max_retries:
                print(f"Error fetching HTML for {url}: timed out after {max_retries + 1} attempts")
                return None
            print(f"Timeout when fetching HTML for {url}. Retrying in 5 seconds...")
            time.sleep(5)
        except Exception as e:
            print(f"Error fetching HTML for {url}: {e}")
            return None

    return None


# NOTE: this function only returns the links; nothing in it writes
# `internal_links_*` files. Site-relative links such as
# "/communitystandards/..." are resolved against the target page before
# filtering, since they do not contain the domain themselves.
def extract_internal_links(soup, base_timestamp):
    """
    Extract the links of a page that point to the target domain/path.

    Handled href forms:
      - absolute Wayback URLs (http(s)://web.archive.org/web/<timestamp>/...):
        kept unchanged;
      - Wayback-rewritten root-relative URLs (/web/<timestamp>/...): prefixed
        with BASE_URL;
      - absolute, protocol-relative and relative URLs of the original site:
        resolved against http://TARGET_DOMAIN/TARGET_PATH and rewritten to a
        raw ('id_') Wayback URL for base_timestamp.

    Only links containing "TARGET_DOMAIN/TARGET_PATH" are returned. Fragments
    are dropped, and pure "#..." anchors, empty hrefs and non-HTTP schemes
    (javascript:, mailto:, ...) are ignored.

    Args:
        soup: parsed page; must provide find_all("a", href=True).
        base_timestamp: timestamp used when building new Wayback URLs.

    Returns:
        list of unique absolute Wayback Machine URLs (order not guaranteed).
    """
    from urllib.parse import urldefrag, urljoin  # local: only needed here

    target = f"{TARGET_DOMAIN}/{TARGET_PATH}"
    # Base used to resolve relative hrefs found in the page.
    page_url = f"http://{target}"
    wayback_prefixes = ("http://web.archive.org/web/", "https://web.archive.org/web/")

    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if not href or href.startswith("#"):
            continue

        if href.startswith(wayback_prefixes):
            if target in href:
                links.add(href)
            continue

        if href.startswith("/web/"):
            # Root-relative Wayback link; BASE_URL already ends with "/web".
            if target in href:
                links.add(f"{BASE_URL}{href[len('/web'):]}")
            continue

        try:
            absolute, _fragment = urldefrag(urljoin(page_url, href))
        except ValueError:
            # urljoin() rejects some malformed hrefs (e.g. a broken IPv6 host).
            continue

        if not absolute.startswith(("http://", "https://")) or target not in absolute:
            continue

        without_scheme = absolute.split("://", 1)[1]
        links.add(f"{BASE_URL}/{base_timestamp}id_/{without_scheme}")

    return list(links)  # unique


### Compare text content

In [None]:
def compare_texts(old_text, new_text):
    """
    Return the unified diff between two texts as a list of lines.

    Both texts are split into lines and handed to difflib.unified_diff, with
    the two sides labelled 'old' and 'new'. The returned lines carry no
    trailing newline; the list is empty when the texts have the same lines.
    """
    return list(
        difflib.unified_diff(
            old_text.splitlines(),
            new_text.splitlines(),
            fromfile='old',
            tofile='new',
            lineterm='',
        )
    )

def crawl_capture(timestamp, progress_data, visited=None):
    """
    Crawl a single capture of the target page (and the sub-links within the
    same domain/path).

    Steps: download the capture, save it as
    OUTPUT_DIR/<date>/capture_<date>_<time>.html, write a unified diff against
    the capture recorded in progress_data["last_timestamp"] (if that file
    exists and differs), update and save the progress, store the external
    links of the page, and finally follow the internal links.

    Args:
        timestamp: 14-digit capture timestamp (YYYYMMDDhhmmss).
        progress_data: progress dict; "last_timestamp" is read and updated.
        visited: set of Wayback URLs already processed, used to avoid loops.
            It is updated in place; a new set is created when omitted.
    """
    if visited is None:
        visited = set()

    url = build_wayback_url(timestamp)
    if url in visited:
        return

    html = fetch_html(url)
    if not html:
        # Deliberately NOT marked as visited: a capture that could not be
        # downloaded (e.g. connection refused) must stay eligible for a retry
        # on the next run instead of being skipped forever.
        return
    visited.add(url)

    date_str = timestamp[:8]
    time_str = timestamp[8:]
    out_dir = os.path.join(OUTPUT_DIR, date_str)
    os.makedirs(out_dir, exist_ok=True)

    filename = f"capture_{date_str}_{time_str}.html"
    file_path = os.path.join(out_dir, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    # Diff against the previously saved capture, if there is one on disk.
    last_timestamp = progress_data.get("last_timestamp")
    if last_timestamp and last_timestamp != timestamp:
        prev_date_str = last_timestamp[:8]
        prev_time_str = last_timestamp[8:]
        prev_file_path = os.path.join(OUTPUT_DIR, prev_date_str, f"capture_{prev_date_str}_{prev_time_str}.html")
        if os.path.exists(prev_file_path):
            with open(prev_file_path, "r", encoding="utf-8") as f:
                old_html = f.read()
            diff_result = compare_texts(old_html, html)
            if diff_result:
                diff_file = os.path.join(out_dir, f"diff_{date_str}_{time_str}_vs_{last_timestamp}.txt")
                with open(diff_file, "w", encoding="utf-8") as df:
                    df.write("\n".join(diff_result))

    progress_data["last_timestamp"] = timestamp
    save_progress(progress_data)

    soup = BeautifulSoup(html, "html.parser")
    internal_links = extract_internal_links(soup, timestamp)

    # Every link whose href does not mention the target page counts as external.
    external_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "facebook.com/communitystandards" not in href:
            external_links.append((a.text.strip(), href))

    ext_links_file = os.path.join(out_dir, f"external_links_{date_str}_{time_str}.txt")
    with open(ext_links_file, "w", encoding="utf-8") as ef:
        for text_val, link_val in external_links:
            ef.write(f"{text_val} -> {link_val}\n")

    for link in internal_links:
        if link not in visited:
            crawl_capture_link(link, progress_data, visited)

def crawl_capture_link(wayback_url, progress_data, visited):
    """
    Variation of crawl_capture that accepts a fully formed Wayback URL
    (used for the sub-pages linked from a capture).

    The <timestamp> is extracted from the URL to keep the folder naming
    consistent, and the part of the URL after the timestamp is turned into a
    file-name suffix so that a sub-page never overwrites the main capture (or
    another sub-page) saved with the same timestamp.

    Args:
        wayback_url: URL of the form .../web/<timestamp>[id_]/<original url>.
        progress_data: progress dict; only passed on to nested calls.
        visited: set of Wayback URLs already processed; updated in place.
    """
    if wayback_url in visited:
        return
    # Marked before fetching: a sub-page that fails to download is not retried
    # every time another page of the same crawl links to it.
    visited.add(wayback_url)

    parts = wayback_url.split("/web/", 1)
    if len(parts) < 2:
        return
    # e.g. "20200202020202id_/www.facebook.com/communitystandards/..."
    ts_part, _, page_path = parts[1].partition("/")
    ts_match = re.match(r"\d+", ts_part)
    if not ts_match:
        return  # no usable timestamp in the URL
    raw_ts = ts_match.group(0)[:14]

    # fetch_html() applies the rate-limit delay, retries on timeout and
    # returns None on failure.
    html = fetch_html(wayback_url)
    if not html:
        return

    date_str = raw_ts[:8]
    time_str = raw_ts[8:]
    out_dir = os.path.join(OUTPUT_DIR, date_str)
    os.makedirs(out_dir, exist_ok=True)

    # File-system friendly identifier of the sub-page, capped in length.
    page_slug = re.sub(r"[^A-Za-z0-9]+", "_", page_path).strip("_")[:100] or "page"

    filename = f"capture_{date_str}_{time_str}_{page_slug}.html"
    file_path = os.path.join(out_dir, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    # progress_data["last_timestamp"] is intentionally left alone here:
    # crawl_capture() uses it to locate the previous *main* capture for its diff.

    soup = BeautifulSoup(html, "html.parser")
    internal_links = extract_internal_links(soup, raw_ts)

    # Every link whose href does not mention the target page counts as external.
    external_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "facebook.com/communitystandards" not in href:
            external_links.append((a.text.strip(), href))

    ext_links_file = os.path.join(out_dir, f"external_links_{date_str}_{time_str}_{page_slug}.txt")
    with open(ext_links_file, "w", encoding="utf-8") as ef:
        for text_val, link_val in external_links:
            ef.write(f"{text_val} -> {link_val}\n")

    for link in internal_links:
        if link not in visited:
            crawl_capture_link(link, progress_data, visited)



In [9]:
def main():
    """
    Run the whole pipeline: fetch the CDX capture list, group it by day,
    sample each day with deduplicate_day() and crawl the selected captures in
    chronological order.

    The set of visited URLs is written to the progress file when the crawl
    ends, including when it is interrupted or fails part-way through.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    progress_data = load_progress()

    # 1) Get all captures from CDX
    cdx_data = fetch_cdx_data()
    print(f"Total records fetched: {len(cdx_data)}")
    if not cdx_data:
        print("No data returned from CDX API.")
        return

    # 2) Group by day
    daily_groups = group_by_day(cdx_data)

    # 3) Sort days in ascending order
    sorted_days = sorted(daily_groups.keys())

    visited = set(progress_data.get("visited_links", []))

    # 4) For each day, deduplicate and crawl
    try:
        for day in sorted_days:
            for record in deduplicate_day(daily_groups[day]):
                # Use record[1] as the timestamp
                crawl_capture(record[1], progress_data, visited)
    finally:
        # Runs on normal completion, on errors and on a manual interrupt, so
        # the URLs processed so far are never lost.
        progress_data["visited_links"] = sorted(visited)
        save_progress(progress_data)

    print("Done!")

if __name__ == "__main__":
    main()


Total records fetched: 10171
Error fetching HTML for http://web.archive.org/web/20110310030955id_/facebook.com/communitystandards: HTTPConnectionPool(host='web.archive.org', port=80): Max retries exceeded with url: /web/20110310030955id_/facebook.com/communitystandards (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x129d7c050>: Failed to establish a new connection: [Errno 61] Connection refused'))
Error fetching HTML for http://web.archive.org/web/20110310070942id_/facebook.com/communitystandards: HTTPConnectionPool(host='web.archive.org', port=80): Max retries exceeded with url: /web/20110310070942id_/facebook.com/communitystandards (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x129d7c270>: Failed to establish a new connection: [Errno 61] Connection refused'))
Error fetching HTML for http://web.archive.org/web/20110324111851id_/facebook.com/communitystandards: HTTPConnectionPool(host='web.archive.org', port=80): Max retries 