In [None]:
import os
# PATH is a list of *directories* searched for executables, so register the
# directory that holds chromedriver rather than the path of the binary itself.
# os.pathsep keeps the separator portable, and the membership check stops the
# entry from being appended again every time this cell is re-run.
_CHROMEDRIVER_DIR = "/usr/bin"
_current_path = os.environ.get("PATH", "")
if _CHROMEDRIVER_DIR not in _current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(
        part for part in (_current_path, _CHROMEDRIVER_DIR) if part
    )

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the download
# loop at the end of this notebook writes its output under
# /content/drive/MyDrive/data_set.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install opencv-python-headless requests beautifulsoup4 selenium webdriver-manager

In [None]:
import os
import time
from urllib.parse import quote_plus

import cv2
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager


def init_driver():
    """Return a Chrome WebDriver configured to run headless.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    handed to Selenium via a ``Service`` object.
    """
    chrome_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)


def download_and_save_image(url, path):
    """Download the image at ``url``, decode it, and write it to ``path``.

    The payload is decoded with OpenCV and re-encoded on write, so the output
    format follows the extension of ``path``.

    Args:
        url: HTTP(S) address of the image.
        path: Destination file path, including extension.

    Returns:
        True only if the image was fetched, decoded, and actually written to
        disk; False otherwise. Failures are reported with ``print`` and never
        raised, so one bad URL cannot abort a batch.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # frombuffer views the response bytes directly instead of copying
        # them through an intermediate bytearray.
        img_array = np.frombuffer(response.content, dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Failed to convert image: {url}")
            return False
        # imwrite signals failure (bad path, unsupported extension, ...) via
        # its return value rather than an exception, so it must be checked;
        # otherwise a failed write would be reported as a success.
        if not cv2.imwrite(path, img):
            print(f"Failed to write image: {path}")
            return False
    except Exception as e:  # deliberate best-effort boundary for batch runs
        print(f"Could not download {url}: {e}")
        return False

    print(f"Downloaded and saved: {path}")
    return True


def _load_checkpoint(checkpoint_file):
    """Return the set of URLs recorded in ``checkpoint_file`` (empty if absent)."""
    if not os.path.exists(checkpoint_file):
        return set()
    with open(checkpoint_file, 'r') as file:
        return {line.strip() for line in file if line.strip()}


def _fetch_results_page(query, scroll_steps, scroll_pause):
    """Open the image search for ``query``, scroll it, and return the page HTML."""
    # quote_plus keeps spaces, '&', '#', etc. in the query from corrupting
    # the URL's query string.
    search_url = (
        "https://www.google.com/search"
        f"?q={quote_plus(query)}&source=lnms&tbm=isch&hl=en&tbs=il:cl"
    )
    driver = init_driver()
    try:
        driver.get(search_url)
        time.sleep(2)  # let the first batch of results render

        body = driver.find_element(By.TAG_NAME, "body")
        for _ in range(scroll_steps):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(scroll_pause)
        return driver.page_source
    finally:
        # Always release the browser, even if navigation or scrolling fails.
        driver.quit()


def _collect_image_urls(images, num_images, skip_urls):
    """Return up to ``num_images`` unique http(s) URLs from ``<img>`` tags."""
    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` preserves page order
    for img in images:
        if len(urls) >= num_images:
            break
        # Prefer ``src`` but fall back to ``data-src`` when ``src`` is missing
        # or is not an http(s) address (e.g. an inline ``data:`` URI).
        for candidate in (img.get('src'), img.get('data-src')):
            if candidate and candidate.startswith('http'):
                if candidate not in seen and candidate not in skip_urls:
                    seen.add(candidate)
                    urls.append(candidate)
                break
    return urls


def download_images(query, num_images, output_dir, checkpoint_file,
                    *, scroll_steps=50, scroll_pause=1):
    """Scrape image URLs for ``query`` and download them into ``output_dir``.

    A headless browser loads the Google Images result page, scrolls it to
    trigger lazy loading, and the resulting HTML is parsed for ``<img>`` tags.
    URLs already listed in ``checkpoint_file`` are skipped, and every
    successful download is appended to that file immediately so an
    interrupted run can be resumed.

    Args:
        query: Search phrase; also used as the prefix of saved file names.
        num_images: Maximum number of new URLs to collect for this query.
        output_dir: Directory for the images and the ``<query>_urls.txt``
            listing; created if it does not exist.
        checkpoint_file: Text file with one already-downloaded URL per line.
        scroll_steps: Number of PAGE_DOWN presses sent to the result page.
        scroll_pause: Seconds to wait after each PAGE_DOWN press.

    Returns:
        None. Images, the URL listing, and checkpoint entries are written to
        disk; progress is reported with ``print``.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Directory created: {output_dir}")

    downloaded_urls = _load_checkpoint(checkpoint_file)

    page_source = _fetch_results_page(query, scroll_steps, scroll_pause)
    soup = BeautifulSoup(page_source, 'html.parser')

    images = soup.find_all('img')
    print(f"Found {len(images)} images for query '{query}'")

    urls = _collect_image_urls(images, num_images, downloaded_urls)
    print(f"Found {len(urls)} URLs for query '{query}'")

    # Keep a record of the URLs selected in this run.
    with open(os.path.join(output_dir, f"{query}_urls.txt"), 'w') as file:
        file.writelines(f"{url}\n" for url in urls)

    next_index = 1
    for url in urls:
        # On a resumed run the checkpoint filters out earlier URLs, so
        # numbering from 1 again would overwrite images saved previously.
        # Advance to the first file name that is still free instead.
        while os.path.exists(
                os.path.join(output_dir, f"{query}_{next_index}.jpg")):
            next_index += 1
        file_path = os.path.join(output_dir, f"{query}_{next_index}.jpg")

        if download_and_save_image(url, file_path):
            # Append right away so progress survives an interruption.
            with open(checkpoint_file, 'a') as file:
                file.write(f"{url}\n")


# Search phrases grouped by dataset category. Each key names a category and
# maps to the list of image-search queries issued for it.
search_queries = {
    "animals": [
        "animals in war",
        "war dogs",
        "dead animals",
        "injured animals",
        "war horses",
        "military animals",
        "service animals in war",
        "war pigeons",
        "war elephants",
        "animal casualties in war",
        "war animal heroes",
        "animals aiding soldiers",
        "military working dogs",
        "animals used in war",
        "animal mascots in war",
        "animals in conflict zones",
        "war animal rescues",
        "animals in war history",
    ],
    "air force": [
        "air force planes",
        "military aircraft",
        "fighter jets",
        "bombers",
        "air combat",
        "air force base",
        "military aviation",
        "air force training",
        "aerial maneuvers",
        "air force pilots",
        "air force operations",
        "air strikes",
        "air force jets",
        "military helicopters",
        "air force missions",
        "air force exercises",
        "air force squadrons",
        "air defense",
    ],
    "navy": [
        "navy ships",
        "navy war scenes",
        "submarines",
        "aircraft carriers",
        "naval battles",
        "naval operations",
        "naval exercises",
        "navy fleets",
        "naval warfare",
        "navy sailors",
        "navy base",
        "naval maneuvers",
        "navy training",
        "naval strategy",
        "naval engagements",
        "naval blockade",
        "naval patrols",
        "naval rescues",
    ],
    "war technology": [
        "war technology",
        "military technology",
        "war equipment",
        "guns",
        "atomic bomb",
        "military strategy",
        "war maps",
        "military satellite",
        "military drones",
        "military vehicles",
        "war robots",
        "advanced weaponry",
        "military communications",
        "war surveillance",
        "military logistics",
        "cyber warfare",
        "military innovation",
        "war gadgets",
    ],
    "protests and anti-war movements": [
        "anti-war protests",
        "peace movements",
        "anti-war demonstrations",
        "war protest signs",
        "peace rallies",
        "war opposition",
        "anti-war activists",
        "anti-war campaigns",
        "war dissent",
        "protest marches",
        "anti-war slogans",
        "anti-war gatherings",
        "peace advocacy",
        "anti-war resistance",
        "protest against war",
        "peace activism",
        "war critics",
        "anti-war speeches",
    ],
    "propaganda and symbolism": [
        "war propaganda",
        "war symbolism",
        "military posters",
        "wartime propaganda",
        "patriotic posters",
        "propaganda leaflets",
        "propaganda art",
        "wartime symbols",
        "military insignia",
        "war emblems",
        "propaganda campaigns",
        "nationalist propaganda",
        "war slogans",
        "patriotic symbols",
        "military propaganda",
        "propaganda posters",
        "wartime imagery",
        "war propaganda techniques",
    ],
    "peace treaty": [
        "peace treaty signing",
        "war peace treaty",
        "happy people at the end of the war",
        "ceasefire agreement",
        "peace negotiations",
        "peace accords",
        "peace agreement",
        "treaty of peace",
        "war peace talks",
        "peace ceremonies",
        "war armistice",
        "end of war celebrations",
        "peace declarations",
        "post-war peace",
        "peace diplomacy",
        "peace accords signing",
        "war truce",
        "peaceful resolution",
    ],
    "post-war": [
        "post-war reconstruction",
        "life after war",
        "regime changes",
        "starvation",
        "economic crisis",
        "rebuilding efforts",
        "post-war recovery",
        "post-war economy",
        "post-war politics",
        "post-war society",
        "war reconstruction projects",
        "post-war rebuilding",
        "war recovery efforts",
        "post-war challenges",
        "post-war infrastructure",
        "war legacy",
        "post-war healing",
        "post-war support",
    ],
    "memorials and remembrance": [
        "war memorials",
        "remembrance of war",
        "monuments",
        "war cemeteries",
        "war commemorations",
        "memorial services",
        "war tribute",
        "war memorial sites",
        "war anniversary",
        "war memorial ceremonies",
        "remembrance day",
        "war memorial statues",
        "war memorial events",
        "fallen soldiers memorial",
        "war heroes tribute",
    ],
}



# Root folder on the mounted Drive and the per-query image cap used below.
DATASET_ROOT = "/content/drive/MyDrive/data_set"
IMAGES_PER_QUERY = 1000

# One sub-folder (with its own checkpoint file) per category; every query in
# a category downloads into that category's folder.
for category in search_queries:
    queries = search_queries[category]
    output_dir = os.path.join(DATASET_ROOT, category)
    checkpoint_file = os.path.join(output_dir, "checkpoint.txt")
    for query in queries:
        print(f"Downloading images for query: {query}")
        download_images(
            query,
            IMAGES_PER_QUERY,
            output_dir,
            checkpoint_file,
        )

print("Download completed.")