In [86]:
%pip install requests ddgs pandas

Note: you may need to restart the kernel to use updated packages.


In [87]:
from ddgs import DDGS 
import requests
import os
import pandas as pd

In [88]:
save_folder = "downloaded_images/"
os.makedirs(save_folder, exist_ok=True)

def download_images(query, max_results=5):
    success = 0
    with DDGS() as ddgs:
        results = ddgs.images(
            query=query, 
            max_results=max_results
        )

        for i, img in enumerate(results):
            url = img['image']
            
            # Skip webp images by checking URL
            if url.lower().endswith('.webp'):
                print(f'Skipped webp: {url}')
                continue
                
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                
                # Skip webp images by checking Content-Type
                content_type = response.headers.get('Content-Type', '')
                if 'webp' in content_type.lower():
                    print(f'Skipped webp (content-type): {url}')
                    continue
                
                filename = f'{save_folder} {query.replace(" ", "_")}_{i+1}.jpg'

                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f'Downloaded: {filename}')
                success += 1
            except requests.exceptions.RequestException as e:
                print(f'Failed to download {url}: {e}')

    print(f'Total images downloaded for "{query}": {success} {max_results}')

# download_images("Hanging Bridge in Hagonoy", max_results=5)


Load the CSV file into a DataFrame

- Take only the 'name' and 'municipality' columns
- Turn it into a list of tuples
- Make a function that will return this: f{name} in {municipality}


In [89]:
df = pd.read_csv('poi.csv')
df = df[['name', 'municipality']]
df_dict = df.to_dict(orient='records')
df_dict

def generate_search_queries(poi_list):
    queries = []
    for poi in poi_list:
        name = poi['name']
        municipality = poi['municipality']
        query = f"{name} in {municipality} Bulacan"
        queries.append(query)
    return queries

search_queries = generate_search_queries(df_dict)

In [90]:
size = len(search_queries)
print(f"Total search queries: {size}")
print(f'Possible total images to download: {size * 10}')
print(f'Estimated total download size: {size * 10 * 500 / 1024:.2f} MB')  # Assuming average image size of 500KB

Total search queries: 94
Possible total images to download: 940
Estimated total download size: 458.98 MB


Let the scraping commence!

In [91]:
from IPython.display import clear_output

for idx, query in enumerate(search_queries, 1):
    print()
    print(f"Progress: {idx} / {size} - Processing: {query}\r")
    print()
    download_images(query, 10)
    clear_output(wait=True)
    


Progress: 94 / 94 - Processing: Mt. Balagbag in San Jose Del Monte Bulacan

Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_1.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_2.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_3.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_4.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_5.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_6.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_7.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_8.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_9.jpg
Downloaded: downloaded_images/ Mt._Balagbag_in_San_Jose_Del_Monte_Bulacan_10.jpg
Total images downloaded for "Mt. Balagbag in San Jose Del Monte Bulacan": 10 10
