## Import packages + local image directory

In [28]:
import os
import requests
import base64
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from PIL import Image
from io import BytesIO
from urllib.parse import urljoin
from PIL import Image
from hashlib import md5

# Directory for saving images.
# Set the CONNECTIONS_IMAGE_DIR environment variable to save somewhere else
# without editing the notebook; when it is not set, the original location
# below is used unchanged.
save_directory = os.environ.get(
    "CONNECTIONS_IMAGE_DIR",
    "C:\\Users\\KevinsAcer\\OneDrive - North Carolina State University\\CrosswordProject\\Images\\Connections",
)
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs(save_directory, exist_ok=True)

## Define dates and other variables

In [26]:
# First and last puzzle dates to download; the download loop treats date_end
# as inclusive (it runs while current_date <= date_end).
date_start = datetime(2024, 11, 28)
date_end = datetime(2024, 12, 26)

# URL removed to protect the host website from additional traffic.
# Supply the page-URL template through the CONNECTIONS_BASE_URL environment
# variable. It must contain one "{}" placeholder, which the download loop fills
# with a "month-day-year" slug (e.g. "november-8-2024") via base_url.format(...).
# The default is an empty string so that this cell remains valid Python while
# the real URL is withheld.
base_url = os.environ.get("CONNECTIONS_BASE_URL", "")

## Loop for pulling images and saving to local directory

A couple major challenges in this portion of the project:

- The image tags were highly specific (hundreds to thousands of characters long) and occasionally changed.
    * This necessitated several rewrites of the code in order to adapt to the website's changes.
- Some of the images were encoded in base64 (presumably for faster loading).
    * This necessitated decoding the base64 data into raw image bytes before the images could be opened, and then converting them (from modes such as RGBA) to RGB before saving.
- The complexity of the challenges resulted in the use of several libraries.
    * This necessitated careful coding to ensure each library was passing information to the others in a usable way.


Additional notes:
- Inspection of the website's HTML revealed that the Connections images were larger than 500 pixels wide, so images narrower than that are screened out in the "Check image dimensions and save" step.
- Files are saved as "YYYY-MM-DD_n.png", where n is the position of the image among those collected from that day's page; images that are skipped still count, so the numbering can have gaps.


In [27]:
# Seconds to wait for any single HTTP response. Without a timeout,
# requests.get() can block indefinitely on a stalled connection and hang the
# whole date range.
REQUEST_TIMEOUT = 30

# Images narrower than this many pixels are not saved.
MIN_IMAGE_WIDTH = 500


def _page_slug(day):
    """Return the "month-day-year" URL slug for ``day``, e.g. "november-8-2024".

    The month name is lower-cased and the day is written without a leading zero.
    """
    return f"{day.strftime('%B').lower()}-{day.day}-{day.year}"


def _collect_image_urls(soup, page_url):
    """Return the absolute ``src`` URL of each <img> tag in ``soup``, in page order.

    Relative sources are resolved against ``page_url``. Collection stops as soon
    as two PNG data URLs have been seen; <img> tags after that point are ignored.
    """
    image_urls = []
    png_count = 0
    for img in soup.find_all('img'):
        img_src = img.get('src')
        if not img_src:
            continue
        image_urls.append(urljoin(page_url, img_src))
        if "data:image/png" in img_src:
            png_count += 1
        if png_count >= 2:
            break
    return image_urls


def _fetch_image_bytes(img_url):
    """Return the raw bytes behind ``img_url``, or None if they could not be read.

    URLs starting with "data:image/png" are base64-decoded locally; every other
    URL is downloaded with requests. Decode and download failures are reported
    with print() and signalled to the caller by returning None.
    """
    if img_url.startswith('data:image/png'):  # Handle data URLs
        # The part after the first comma is treated as the base64 payload.
        _, encoded = img_url.split(',', 1)
        try:
            print(f"Processing data URL, encoded data length: {len(encoded)}")
            return base64.b64decode(encoded)
        except (ValueError, AttributeError) as e:
            print(f"Error decoding base64 data: {e}")
            return None

    # Handle standard URLs
    try:
        img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        img_response.raise_for_status()
        raw_data = img_response.content
        print(f"Processing standard URL, raw data size: {len(raw_data)} bytes")
        return raw_data
    except (requests.exceptions.RequestException, AttributeError) as e:
        print(f"Error retrieving image from standard URL: {e}")
        return None


def _open_image(raw_data):
    """Open ``raw_data`` with PIL and return the Image, or None if that fails.

    On failure the bytes are written to debug_invalid_image.png (relative to the
    current working directory, replacing any earlier dump) for inspection.
    """
    try:
        image = Image.open(BytesIO(raw_data))
        print(f"Image loaded successfully: Mode={image.mode}, Size={image.size}")
        return image
    except Exception as e:
        print(f"Error initializing image object: {e}")
        with open("debug_invalid_image.png", "wb") as debug_file:
            debug_file.write(raw_data)
        print("Saved invalid image data for inspection: debug_invalid_image.png")
        return None


# Loop through each day in the date range (date_end included)
current_date = date_start
while current_date <= date_end:
    # Format the date as "month-day-year", removing leading zeros from the day
    date_str = _page_slug(current_date)
    url = base_url.format(date_str)

    try:
        # Make a request to the URL
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(response.text, 'lxml')

        # Gather the image URLs found on the page
        image_urls = _collect_image_urls(soup, url)

        # Process and save each image
        for i, img_url in enumerate(image_urls):
            try:
                raw_data = _fetch_image_bytes(img_url)
                if raw_data is None:
                    # The helper has already printed why this image failed.
                    continue

                # Ensure raw_data is valid
                if not raw_data:
                    print("Raw data is invalid or empty.")
                    continue

                image = _open_image(raw_data)
                if image is None:
                    continue

                # Convert image to RGB if necessary
                if image.mode != 'RGB':
                    image = image.convert('RGB')

                # Check image dimensions and save
                if image.width < MIN_IMAGE_WIDTH:
                    print(f"Image skipped due to insufficient width: {image.width}px")
                    continue

                # n in "YYYY-MM-DD_n" is the 1-based position in image_urls, so
                # skipped images leave gaps in the numbering.
                save_path = os.path.join(save_directory, f"{current_date.strftime('%Y-%m-%d')}_{i+1}.png")
                print(f"Attempting to save: {save_path}")
                try:
                    image.save(save_path)
                    if os.path.exists(save_path):
                        print(f"Image saved successfully: {save_path}")
                    else:
                        print(f"Failed to save image: {save_path}")
                except Exception as e:
                    print(f"Error saving image: {e}")

            except Exception as e:
                # Keep going with the next image; one bad image must not abort the day.
                print(f"Unexpected error processing image {img_url}: {e}")
                continue

    except requests.exceptions.RequestException as e:
        # Includes timeouts, connection errors and non-2xx responses for the page itself.
        print(f"Failed to retrieve data for {current_date.strftime('%Y-%m-%d')}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for {current_date.strftime('%Y-%m-%d')}: {e}")

    # Move to the next day
    current_date += timedelta(days=1)


Processing standard URL, raw data size: 6112 bytes
Image loaded successfully: Mode=RGBA, Size=(198, 45)
Image skipped due to insufficient width: 198px
Processing data URL, encoded data length: 15524
Image loaded successfully: Mode=RGBA, Size=(628, 346)
Attempting to save: C:\Users\KevinsAcer\OneDrive - North Carolina State University\CrosswordProject\Images\TEST\2024-11-28_2.png
Image saved successfully: C:\Users\KevinsAcer\OneDrive - North Carolina State University\CrosswordProject\Images\TEST\2024-11-28_2.png
Processing data URL, encoded data length: 21892
Image loaded successfully: Mode=RGBA, Size=(628, 348)
Attempting to save: C:\Users\KevinsAcer\OneDrive - North Carolina State University\CrosswordProject\Images\TEST\2024-11-28_3.png
Image saved successfully: C:\Users\KevinsAcer\OneDrive - North Carolina State University\CrosswordProject\Images\TEST\2024-11-28_3.png
Processing standard URL, raw data size: 6112 bytes
Image loaded successfully: Mode=RGBA, Size=(198, 45)
Image skipped