In [1]:
import json
import os
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PIL import Image


In [2]:
def create_folder(folder_name):
    """Create ``folder_name`` (and any missing parent folders) if it is absent.

    A confirmation line is printed only when the folder is actually created;
    when the path already exists nothing is created and nothing is printed.

    Args:
        folder_name: Path of the directory to create.
    """
    try:
        # Attempt the creation directly instead of testing os.path.exists()
        # first: a check-then-create sequence can race with another process
        # creating the same folder and then crash with FileExistsError.
        os.makedirs(folder_name)
    except FileExistsError:
        return
    print(f"Created folder: {folder_name}")


In [3]:
def download_image(url, folder, count, min_size=500, target_size=(800, 800)):
    """Download one image, normalise it, and save it into ``folder``.

    The image at ``url`` is fetched with a browser-like User-Agent and a
    10 second timeout. Images whose width or height is below ``min_size``
    are skipped. Accepted images are converted to RGB, resized to
    ``target_size`` (the aspect ratio is not preserved) and written to
    ``<folder>/image_<count>.jpg``.

    Args:
        url: Address of the image to download.
        folder: Existing directory the image is saved into.
        count: Number used in the output file name.
        min_size: Minimum accepted width and height, in pixels.
        target_size: ``(width, height)`` the saved image is resized to.

    Returns:
        True when the image was saved; False when it was skipped for being
        too small or when any error occurred (the error is printed, never
        raised).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        # Surface HTTP errors (404, 403, ...) with their status instead of
        # handing an error page to PIL and reporting a decoding failure.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        width, height = image.size
        if width < min_size or height < min_size:
            print(f"Skipped small image: {image.size}")
            return False

        # RGB conversion drops alpha/palette modes that JPEG cannot store.
        image = image.convert("RGB").resize(target_size)
        path = os.path.join(folder, f"image_{count}.jpg")
        image.save(path)
        print(f"Saved: {path}")
        return True
    except Exception as e:
        # Best-effort by design: one bad URL must not abort a whole scrape.
        print(f"Failed to download {url}: {e}")
        return False


In [4]:
def scrape_images_bing(query, num_images, folder_name):
    """Search Bing Images for ``query`` and save up to ``num_images`` results.

    One search request is made and the ``a.iusc`` anchors of the returned
    page are scanned for full-size image URLs. Each URL is passed to
    ``download_image`` until ``num_images`` images have been saved or the
    candidates run out. Because only that single results page is parsed,
    fewer than ``num_images`` images may end up being saved.

    Args:
        query: Free-text search phrase.
        num_images: Maximum number of images to save; values <= 0 save none.
        folder_name: Directory handed to ``download_image`` for the output.
    """
    print(f"Searching for: {query}")

    headers = {'User-Agent': 'Mozilla/5.0'}
    # Let requests build the query string so that characters such as '&',
    # '#', '+' or non-ASCII text in the query are escaped correctly.
    response = requests.get(
        "https://www.bing.com/images/search",
        params={"q": query, "form": "HDRSC2"},
        headers=headers,
        timeout=10,  # same limit as download_image; never hang the notebook
    )
    if not response.ok:
        # Report the failure instead of silently parsing an error page.
        print(f"Search request failed with HTTP status {response.status_code}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Keep every candidate (not just the first num_images) so that images
    # which are skipped or fail to download can be replaced by later ones.
    image_urls = []
    for tag in soup.find_all("a", class_="iusc"):
        try:
            # SECURITY: the "m" attribute is text taken from a remote page.
            # It is expected to hold a JSON object with a "murl" key, so it
            # is decoded with json.loads and never with eval(), which would
            # execute page content as Python and also rejects the JSON
            # literals true/false/null.
            m_url = json.loads(tag.get("m"))["murl"]
        except (TypeError, ValueError, KeyError):
            # Missing attribute, malformed JSON, or no "murl" entry.
            continue
        if isinstance(m_url, str) and m_url.startswith("http"):
            image_urls.append(m_url)

    print(f"Found {len(image_urls)} images.")

    count = 0
    for image_url in image_urls:
        # Checked before downloading so num_images <= 0 saves nothing.
        if count >= num_images:
            break
        if download_image(image_url, folder_name, count):
            count += 1
    print("Done!")


In [5]:
# Make sure one output directory exists for each image class.
for class_folder in ("dataset/street_pothole", "dataset/normal_street"):
    create_folder(class_folder)


In [None]:
# Scrape each class: (search phrase, destination folder), 300 images requested.
for search_phrase, destination in (
    ("street pothole", "dataset/street_pothole"),
    ("clean city street road", "dataset/normal_street"),
):
    scrape_images_bing(search_phrase, 300, destination)
