In [4]:
# Import appropriate modules
import os
import time
import requests
from PIL import Image, ImageOps
import io
# Selenium should work better than BeautifulSoup as not a static page and Javascript rendered
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

In [None]:
# Towns and Grid References
#
# Maps each town label (also used as the output sub-folder name) to the
# Ordnance Survey 1 km grid squares that are scraped for that town.
# NOTE(review): the maidstone list originally repeated "TQ7755", which made the
# same square download twice into one folder. The duplicate has been removed;
# the intended fifth square is unknown - TODO confirm and add it.

town_grid_refs = {
    "medway_gillingham":  ["TQ7966", "TQ7869", "TQ7868", "TQ7769", "TQ7768"],
    "medway_chatham":     ["TQ7762", "TQ7667", "TQ7666", "TQ7567", "TQ7566"],
    "medway_rochester":   ["TQ7469", "TQ7468", "TQ7467", "TQ7369", "TQ7368"],
    "medway_rainham":     ["TQ8267", "TQ8266", "TQ8167", "TQ8166", "TQ8164"],
    "maidstone":          ["TQ7755", "TQ7656", "TQ7655", "TQ7556"],
    "ashford":            ["TR0142", "TR0141", "TR0043", "TR0042", "TQ9942"],
    "dartford":           ["TQ5675", "TQ5474", "TQ5473", "TQ5374", "TQ5275"],
    "canterbury":         ["TR1558", "TR1557", "TR1458", "TR1457", "TR1456"],
    "gravesend":          ["TQ6574", "TQ6573", "TQ6474", "TQ6473", "TQ6373"],
    "royal_tunbridge_wells": ["TQ5941", "TQ5841", "TQ5840", "TQ5839", "TQ5838"],
    "dover":              ["TR3341", "TR3241", "TR3240", "TR3141", "TR3140"],
    "margate":            ["TR3671", "TR3670", "TR3571", "TR3570", "TR3470"],
    "sittingbourne":      ["TQ9163", "TQ9064", "TQ9063", "TQ8964", "TQ8864"],
    "sevenoaks":          ["TQ5356", "TQ5355", "TQ5354", "TQ5255", "TQ5254"],
    "tonbridge":          ["TQ5947", "TQ5946", "TQ5945", "TQ5846", "TQ5845"]
}

In [None]:
# Function to download and pad images
def _pad_to_target(image, target_size):
    """Centre `image` on a black canvas of `target_size`; return None if it does not fit."""
    old_size = image.size  # (width, height)
    if old_size == target_size:
        return image

    delta_w = target_size[0] - old_size[0]
    delta_h = target_size[1] - old_size[1]
    if delta_w < 0 or delta_h < 0:
        return None

    # ImageOps.expand takes the border as (left, top, right, bottom); any odd
    # pixel of slack goes to the right / bottom edge.
    padding = (
        delta_w // 2,
        delta_h // 2,
        delta_w - (delta_w // 2),
        delta_h - (delta_h // 2),
    )
    return ImageOps.expand(image, padding, fill=(0, 0, 0))


def download_geograph_images(town, grid_ref, output_folder, max_images=100,
                             target_size=(213, 160)):
    """Scrape Geograph browser thumbnails for one grid square and save them padded.

    Up to four browser result pages are visited in order (two pages filtered to
    the "Housing, Dwellings" context, then two pages excluding it) until
    `max_images` images have been saved. Every `<img>` whose `src` contains
    "geophotos" is downloaded, converted to RGB, padded with black to
    `target_size` and written to
    `<output_folder>/<town>_<grid_ref>_<n>.jpg`.

    Args:
        town: Label used as the filename prefix.
        grid_ref: Grid square inserted into the Geograph query URL.
        output_folder: Directory for the saved images (created if missing).
        max_images: Stop once this many images have been saved.
        target_size: (width, height) every saved image is padded to. Images
            larger than this in either dimension are skipped.

    Returns:
        The number of images saved.
    """
    options = webdriver.ChromeOptions()
    # `options.headless = True` was deprecated and later removed in Selenium 4,
    # where assigning it has no effect; the command-line switch is the
    # supported way to request headless Chrome.
    options.add_argument("--headless=new")

    # Create the folder before launching Chrome so a filesystem error cannot
    # leave a browser process behind.
    os.makedirs(output_folder, exist_ok=True)

    # URLs to attempt, prioritise images with housing label
    url_attempts = [
        f"https://www.geograph.org.uk/browser/#!/q=%5E{grid_ref}/format+%22landscape%22/types+%22Geograph%22/contexts+%22Housing%2C+Dwellings%22/display=plus/pagesize=100",
        f"https://www.geograph.org.uk/browser/#!/q=%5E{grid_ref}/format+%22landscape%22/types+%22Geograph%22/contexts+%22Housing%2C+Dwellings%22/display=plus/pagesize=100/page=2",
        f"https://www.geograph.org.uk/browser/#!/q=%5E{grid_ref}/contexts+-%22Housing%2C+Dwellings%22/format+%22landscape%22/types+%22Geograph%22/display=plus/pagesize=100",
        f"https://www.geograph.org.uk/browser/#!/q=%5E{grid_ref}/contexts+-%22Housing%2C+Dwellings%22/format+%22landscape%22/types+%22Geograph%22/display=plus/pagesize=100/page=2"
    ]

    count = 0
    seen_urls = set()  # thumbnail URLs already handled, to avoid saving duplicates

    driver = webdriver.Chrome(options=options)
    try:
        # Try all URLs until max images is achieved
        for url in url_attempts:
            if count >= max_images:
                break

            print(f"\nVisiting: {url}")

            # Wait to ensure content is fully loaded
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "img"))
                )
                time.sleep(2)
                img_elements = driver.find_elements(By.CSS_SELECTOR, "img")
            except Exception as e:
                print(f"Failed to process URL {url}: {e}")
                continue

            for img_element in img_elements:
                if count >= max_images:
                    break

                # Download appropriate images
                try:
                    img_url = img_element.get_attribute("src")
                    if not img_url or "geophotos" not in img_url:
                        continue
                    if img_url in seen_urls:
                        continue
                    seen_urls.add(img_url)

                    # A timeout stops one stalled request from hanging the whole
                    # run; raise_for_status reports HTTP errors explicitly.
                    response = requests.get(img_url, timeout=30)
                    response.raise_for_status()
                    image = Image.open(io.BytesIO(response.content)).convert("RGB")

                    padded = _pad_to_target(image, tuple(target_size))
                    if padded is None:
                        # All images should be target_size or less so this should not occur
                        print(f"Skipping image (too large): {image.size}")
                        continue

                    # Save padded image
                    filename = os.path.join(output_folder, f"{town}_{grid_ref}_{count + 1}.jpg")
                    padded.save(filename)
                    count += 1
                    print(f"Downloaded and padded image {count}: {filename}")

                # Potential error messages (StaleElementReferenceException could suggest retrying)
                except StaleElementReferenceException:
                    print("Skipped stale image")
                    continue
                except Exception as e:
                    print(f"Error processing image: {e}")
                    continue
    finally:
        # Closes browser, even if the run is interrupted or fails unexpectedly
        driver.quit()

    return count

In [None]:
# Download Loop
# Scrapes every grid square of every town into E:/geograph_dataset/<town>/<grid_ref>.

for town, grid_refs in town_grid_refs.items():
    # dict.fromkeys drops a grid reference listed twice for the same town
    # (keeping first-seen order), so the same square is not scraped again into
    # the same folder.
    for grid_ref in dict.fromkeys(grid_refs):
        output_folder = f"E:/geograph_dataset/{town}/{grid_ref}"
        print(f"\nStarting to download images for {town} - {grid_ref}")
        download_geograph_images(town, grid_ref, output_folder, max_images=150)

In [11]:
# Folder where images are stored
folder = "E:/geograph_dataset"

# Supported image file extensions
# (the trailing comma makes this a tuple; `('.jpg')` is just the string '.jpg')
image_extensions = ('.jpg',)


def count_images_by_grid(root, extensions):
    """Count image files in every `<root>/<town>/<grid_ref>` folder.

    Args:
        root: Directory containing one sub-directory per town.
        extensions: Lower-case suffix, or tuple of suffixes, that marks a file
            as an image (matched case-insensitively).

    Returns:
        Dict mapping "<town>_<grid_ref>" to the number of matching files.
        Entries of `root` or of a town folder that are not directories are ignored.
    """
    counts = {}
    for town_name in os.listdir(root):
        town_path = os.path.join(root, town_name)
        if not os.path.isdir(town_path):
            continue

        for square in os.listdir(town_path):
            grid_path = os.path.join(town_path, square)
            if not os.path.isdir(grid_path):
                continue

            counts[f"{town_name}_{square}"] = sum(
                1 for entry in os.listdir(grid_path)
                if entry.lower().endswith(extensions)
            )
    return counts


# Dictionary to hold counts for each town and grid_ref folder
image_counts = count_images_by_grid(folder, image_extensions)

# Print the results sorted by "<town>_<grid_ref>" key
for key in sorted(image_counts):
    print(f"{key}: {image_counts[key]} image(s)")

ashford_TQ9942: 97 image(s)
ashford_TR0042: 136 image(s)
ashford_TR0043: 33 image(s)
ashford_TR0141: 66 image(s)
ashford_TR0142: 126 image(s)
canterbury_TR1456: 113 image(s)
canterbury_TR1457: 150 image(s)
canterbury_TR1458: 150 image(s)
canterbury_TR1557: 150 image(s)
canterbury_TR1558: 150 image(s)
dartford_TQ5275: 92 image(s)
dartford_TQ5374: 49 image(s)
dartford_TQ5473: 150 image(s)
dartford_TQ5474: 150 image(s)
dartford_TQ5675: 84 image(s)
dover_TR3140: 150 image(s)
dover_TR3141: 150 image(s)
dover_TR3240: 147 image(s)
dover_TR3241: 150 image(s)
dover_TR3341: 95 image(s)
gravesend_TQ6373: 105 image(s)
gravesend_TQ6473: 150 image(s)
gravesend_TQ6474: 45 image(s)
gravesend_TQ6573: 47 image(s)
gravesend_TQ6574: 137 image(s)
maidstone_TQ7556: 115 image(s)
maidstone_TQ7655: 150 image(s)
maidstone_TQ7656: 150 image(s)
maidstone_TQ7755: 131 image(s)
margate_TR3470: 150 image(s)
margate_TR3570: 150 image(s)
margate_TR3571: 98 image(s)
margate_TR3670: 67 image(s)
margate_TR3671: 86 image(s

Several grid references ended up with fewer than 100 images, which suggests that their downloads did not complete.

In [12]:
# Group every grid reference whose folder holds fewer than 100 images by town,
# so that those squares can be downloaded again
retry_grid_refs = {}

for key, count in image_counts.items():
    if count >= 100:
        continue

    # Keys look like "<town>_<grid_ref>"; town names may contain underscores,
    # so split on the last one only
    town, grid_ref = key.rsplit('_', 1)
    if town not in retry_grid_refs:
        retry_grid_refs[town] = []
    retry_grid_refs[town].append(grid_ref)

# Show the dictionary, one town per line
for town, refs in retry_grid_refs.items():
    print(f"{town}: {refs}")

ashford: ['TQ9942', 'TR0043', 'TR0141']
dartford: ['TQ5275', 'TQ5374', 'TQ5675']
dover: ['TR3341']
gravesend: ['TQ6474', 'TQ6573']
margate: ['TR3571', 'TR3670', 'TR3671']
medway_chatham: ['TQ7566', 'TQ7666', 'TQ7762']
medway_rainham: ['TQ8164', 'TQ8167']
sevenoaks: ['TQ5355']
sittingbourne: ['TQ8864', 'TQ8964', 'TQ9163']


In [None]:
# Empty each under-filled folder, then scrape that grid square again
for town, grid_refs in retry_grid_refs.items():
    for grid_ref in grid_refs:
        output_folder = f"E:/geograph_dataset/{town}/{grid_ref}"

        # Remove the files left by the earlier attempt; sub-directories are kept
        if os.path.exists(output_folder):
            print(f"Cleaning existing folder: {output_folder}")
            candidate_paths = (
                os.path.join(output_folder, entry)
                for entry in os.listdir(output_folder)
            )
            for stale_file in filter(os.path.isfile, candidate_paths):
                os.remove(stale_file)

        # Download the images for this grid square again
        print(f"\nRetrying download for {town} - {grid_ref}")
        download_geograph_images(town, grid_ref, output_folder, max_images=150)

In [18]:
# Recount the images in every town / grid_ref folder after the retry
new_image_counts = {}

for town in os.listdir(folder):
    town_path = os.path.join(folder, town)
    if os.path.isdir(town_path):
        for grid_ref in os.listdir(town_path):
            grid_path = os.path.join(town_path, grid_ref)
            if os.path.isdir(grid_path):
                # Count only the files whose lower-cased name has an image extension
                new_image_counts[f"{town}_{grid_ref}"] = sum(
                    1 for entry in os.listdir(grid_path)
                    if entry.lower().endswith(image_extensions)
                )

# Print the counts in sorted key order
for key in sorted(new_image_counts):
    print(f"{key}: {new_image_counts[key]} image(s)")

# Group the folders that are still under 100 images by town
low_grid_refs = {}

for key, count in new_image_counts.items():
    if count >= 100:
        continue
    town, grid_ref = key.rsplit('_', 1)
    if town not in low_grid_refs:
        low_grid_refs[town] = []
    low_grid_refs[town].append(grid_ref)

# Show the dictionary, one town per line
for town, refs in low_grid_refs.items():
    print(f"{town}: {refs}")

ashford_TQ9942: 97 image(s)
ashford_TR0042: 136 image(s)
ashford_TR0043: 128 image(s)
ashford_TR0141: 150 image(s)
ashford_TR0142: 126 image(s)
canterbury_TR1456: 113 image(s)
canterbury_TR1457: 150 image(s)
canterbury_TR1458: 150 image(s)
canterbury_TR1557: 150 image(s)
canterbury_TR1558: 150 image(s)
dartford_TQ5275: 92 image(s)
dartford_TQ5374: 120 image(s)
dartford_TQ5473: 150 image(s)
dartford_TQ5474: 150 image(s)
dartford_TQ5675: 84 image(s)
dover_TR3140: 150 image(s)
dover_TR3141: 150 image(s)
dover_TR3240: 147 image(s)
dover_TR3241: 150 image(s)
dover_TR3341: 150 image(s)
gravesend_TQ6373: 105 image(s)
gravesend_TQ6473: 150 image(s)
gravesend_TQ6474: 150 image(s)
gravesend_TQ6573: 100 image(s)
gravesend_TQ6574: 137 image(s)
maidstone_TQ7556: 115 image(s)
maidstone_TQ7655: 150 image(s)
maidstone_TQ7656: 150 image(s)
maidstone_TQ7755: 131 image(s)
margate_TR3470: 150 image(s)
margate_TR3570: 150 image(s)
margate_TR3571: 150 image(s)
margate_TR3670: 67 image(s)
margate_TR3671: 86 

The grid references that are still below 100 images returned the same counts as before the retry, so further retries are unlikely to change the number of images for any other grid.