# Crawler script for retrieving tank images for YOLO model training
_i.e. the training data for the object detection model in the WEASEL project_

The images are crawled from the following sources:
* Getty Images
* Unsplash
* Alamy
* iStock
* Google image search

**Note: the images retrieved here have only been used for model training; there is no guarantee that any other use of them is legal.**


---

### First part, dependencies installation and package loading
Make sure you've downloaded a suitable Selenium web driver


* Driver links can be found on the [Selenium page on PyPI](https://pypi.org/project/selenium/)
* Remember to install the browser that matches your webdriver
    * The code below uses [geckodriver](https://github.com/mozilla/geckodriver/releases) for [firefox](https://www.mozilla.org/en-GB/firefox/new/)  
    _Download the webdriver and the browser using the links provided above_

In [None]:
# !pip install selenium

import random
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
# If your browser is chrome:
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
import urllib.request
from urllib.request import urlretrieve
from selenium.webdriver.common.by import By

# Download the GeckoDriver / ChromeDriver if you didn't do it manually.
# Selenium 4 receives the driver binary through a Service object; the old
# `executable_path` keyword is no longer accepted by current releases.
from selenium.webdriver.firefox.service import Service as FirefoxService

driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
# Every crawler cell below opens (and quits) its own browser, so close this one
# instead of leaving an orphaned Firefox window behind.
driver.quit()
# If your browser is chrome:
# from selenium.webdriver.chrome.service import Service as ChromeService
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

#### Getty images crawling

In [None]:
# Customised options can be set up here
options = Options()
options.add_argument('--disable-notifications')

# Using Firefox driver (geckodriver)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome('./chromedriver', chrome_options = options)

# crawling getty images website (you can change the url below to retrieve the contents from other websites)
url = 'https://www.gettyimages.hk/%E5%9C%96%E7%89%87/chinese-military-tank'

# class names copied from the page markup
# NOTE(review): they look auto-generated, so they presumably need refreshing when the site changes
total_pages_class = 'JO4Dw2C5EjCB3iovKUcw'
pla_images_class = 'BLA_wBUJrga_SkfJ8won'

try:
    driver.get(url)

    # get the page numbers; when the pager element is missing, crawl a single page
    # instead of failing with an IndexError
    total_pages_num = driver.find_elements(By.CLASS_NAME, total_pages_class)
    page_count = int(total_pages_num[0].text) if total_pages_num else 1

    #=== looping through every image element in each page: ===
    # pages are requested through the `?page=N` query parameter
    # (clicking the 'next page' button did not work for the original author)
    for page in range(1, page_count + 1):
        driver.get(f'{url}?page={page}')

        # getting the elements via class name
        imgs = driver.find_elements(By.CLASS_NAME, pla_images_class)

        # collecting the image urls; get_attribute() gives None for an <img> without
        # a src, and urlretrieve() cannot handle that, so such entries are dropped
        src = [link for link in (img.get_attribute('src') for img in imgs) if link]

        # download the images through their url
        for i, link in enumerate(src):
            urllib.request.urlretrieve(link, f'{random.randint(0, 300)}{random.randint(301, 600)}pla_vehicle_{i}.jpg')
            # pause between downloads
            time.sleep(3)
finally:
    # Close the browser when done, even if the crawl failed half-way
    driver.quit()

### Unsplash crawling
The code is almost the same as the Getty Images one above; it simply demonstrates image retrieval from a different website

In [None]:
# Customised options can be set up here
options = Options()
options.add_argument('--disable-notifications')

# Use Firefox driver (geckodriver)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome('./chromedriver', chrome_options = options)

url = 'https://unsplash.com/s/photos/pla-tank?license=free'

try:
    driver.get(url)

    # get the 'load more' button (selected by its visible text) and click it;
    # find_elements() returns an empty list when the button is absent, so a result
    # page without that button no longer aborts the crawl
    load_more_buttons = driver.find_elements(By.XPATH, '//button[@type="button" and contains(text(), "Load more")]')
    time.sleep(1)
    if load_more_buttons:
        load_more_buttons[0].click()

    # scroll down the page to load more images
    scroll_times = 7
    for _ in range(scroll_times):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)

    # select images by their class
    imgs = driver.find_elements(By.XPATH, '//img[@class="I7OuT DVW3V L1BOa"]')

    # get_attribute() gives None for an <img> without a src, and urlretrieve()
    # cannot handle that, so such entries are dropped
    src = [link for link in (img.get_attribute('src') for img in imgs) if link]

    for i, link in enumerate(src):
        urllib.request.urlretrieve(link, f'{random.randint(0, 300)}{random.randint(301, 600)}pla_vehicle_unsplash{i}.jpg')
        # pause between downloads
        time.sleep(3)
finally:
    # Close the browser when done, even if the crawl failed half-way
    driver.quit()

### Crawling Alamy, a spider-unfriendly website
The code below has been modularised into functions, so you can easily adapt parts of it to retrieve content from other websites

In [None]:
# import an exception handler
from selenium.common.exceptions import NoSuchElementException

# Customised options can be set up here
options = Options()
options.add_argument('--disable-notifications')

# Use Firefox driver (geckodriver)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome('./chromedriver', chrome_options = options)

url = 'https://www.alamy.com/stock-photo/tank.html?pseudoid=AFCA922984CF4BE1B800DEA01EE3769C&sortBy=relevant'
driver.get(url)

# give the page a moment to render before reading the result counter
time.sleep(1)

# the counter text is a number wrapped in parentheses; the brackets are removed before
# conversion (a thousands separator, should the site render one, is removed as well)
total_pages_num = driver.find_element(By.XPATH, '//span[@class="inline-block pr-2 text-xs"]')
result_count = int(total_pages_num.text.replace('(', '').replace(')', '').replace(',', ''))

# assumes 100 results per page, as the original division did -- TODO confirm
results_per_page = 100
# round up: flooring skipped the last, partially filled page and produced
# zero pages for searches with fewer than 100 results
total_pages_num = (result_count + results_per_page - 1) // results_per_page

# scroll down the page to load more images
# If doing it all at once, the contents in between top and bottom won't be loaded -> divide the move into 8 parts
def scroll_down(steps=8, pause=1.8):
    """Scroll the page held by the global ``driver`` to the bottom in stages.

    Jumping to the bottom in one move leaves the content between top and
    bottom unloaded, so the page is walked down in ``steps`` equal fractions
    of its height, waiting after every intermediate stop so the lazily loaded
    images can appear before scrolling further.

    Args:
        steps: number of stages the scroll is divided into (default 8).
        pause: seconds to wait after each intermediate stage (default 1.8).

    Raises:
        ValueError: if ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError('steps must be at least 1')

    for step in range(1, steps + 1):
        # scrollHeight is re-read by the browser on every call, so a page that grew
        # while loading is still followed down to its real bottom
        driver.execute_script(f'window.scrollTo(0, document.body.scrollHeight * {step} / {steps});')
        # no wait after the final jump to the bottom
        if step < steps:
            time.sleep(pause)


# A cookie setting page may pop up to block the crawler, so we have to handle it in case
# decline all cookies
def skip_notification():
    """Dismiss the cookie dialog by opening its preferences and declining all.

    Works on the global ``driver``. The element lookups are not guarded here,
    so when the dialog is not on screen the lookup error reaches the caller.
    """
    preferences_xpath = '//button[contains(text(), "Preferences")]'
    decline_all_xpath = '//button[contains(text(), "Decline All")]'

    # open the preference panel first
    preferences_button = driver.find_element(By.XPATH, preferences_xpath)
    preferences_button.click()
    # wait a second before looking for the next button
    time.sleep(1)
    # then refuse every cookie
    decline_button = driver.find_element(By.XPATH, decline_all_xpath)
    decline_button.click()

def next_page():
    """Click the link whose text contains "Next page" on the global ``driver``."""
    # the link is located through its visible text
    driver.find_element(By.XPATH, '//a[contains(text(), "Next page")]').click()

def fetch_images(src):
    """Download every image url in ``src`` into the current working directory.

    Args:
        src: iterable of image urls (the ``src`` attributes collected from the
            page). ``None`` and empty entries -- which an <img> without a
            src produces -- are skipped instead of crashing the download.

    Returns:
        None. Files are written as
        ``<random><random>alamy_pla_vehicle_<index>.jpg``.
    """
    # drop the missing urls up front so the file indices stay consecutive
    links = [link for link in src if link]

    # download the images through their url
    for i, link in enumerate(links):
        # two random numbers prefix the name, presumably to make clashes between pages unlikely
        urllib.request.urlretrieve(str(link), f'{random.randint(0, 300)}{random.randint(301, 600)}alamy_pla_vehicle_{i}.jpg')
        # pause between downloads
        time.sleep(3)


# walk through the result pages: load everything, dismiss the cookie dialog if any,
# download the images, then move on to the next page
try:
    for i in range(total_pages_num):
        scroll_down()
        try:
            # if a notification pops up -> call the function to cope with it
            skip_notification()
        except NoSuchElementException:
            # no cookie dialog on this page -> nothing to dismiss
            pass
        time.sleep(3)

        # image retrieval; an <img> without a src yields None, which cannot be downloaded
        imgs = driver.find_elements(By.XPATH, '//img[@class="bg-grey-light absolute object-cover"]')
        src = [link for link in (img.get_attribute('src') for img in imgs) if link]
        fetch_images(src)

        if i + 1 < total_pages_num:
            try:
                next_page()
            except NoSuchElementException:
                # the page count is derived from the result counter; when the
                # 'Next page' link is missing there is nothing left to crawl
                break
finally:
    # close the browser even if the crawl failed half-way
    driver.quit()


### iStock images retrieval

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Customised options can be set up here
options = Options()
options.add_argument('--disable-notifications')

# Use Firefox driver (geckodriver)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome('./chromedriver', chrome_options = options)

url = 'https://www.istockphoto.com/photos/german-tank'
# how many times each result page is scrolled to the bottom to load more images
scroll_times = 3

try:
    driver.get(url)

    # get the page numbers; when the pager element is missing, crawl a single page
    # instead of failing with an IndexError
    total_pages_num = driver.find_elements(By.XPATH, '//span[@class="EEuNOdJESEP1DykbrGuZ"]')
    page_count = int(total_pages_num[0].text) if total_pages_num else 1

    for page in range(1, page_count + 1):
        driver.get(f'{url}?page={page}')

        # scroll down the page to load more images
        for _ in range(scroll_times):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(1)

        # select images by their class
        imgs = driver.find_elements(By.XPATH, '//img[@class="bOaTkZcdqgXxzJCZECTz"]')

        # get_attribute() gives None for an <img> without a src, and urlretrieve()
        # cannot handle that, so such entries are dropped
        src = [link for link in (img.get_attribute('src') for img in imgs) if link]

        for i, link in enumerate(src):
            # files are tagged 'istock' (the previous 'unsplash' tag was a copy-paste
            # leftover that mislabelled the source of these images)
            urllib.request.urlretrieve(link, f'{random.randint(0, 1000)}{random.randint(1001, 3000)}pla_vehicle_istock{i}.jpg')
            # pause between downloads
            time.sleep(3)
finally:
    # Close the browser when done, even if the crawl failed half-way
    driver.quit()

### Google image search

In [None]:
def _download_google_images(driver, url, scroll_times, name_prefix):
    """Open one Google image-search page and download the images it shows.

    Args:
        driver: the Selenium webdriver to drive.
        url: address of the image-search result page.
        scroll_times: how many times the page is scrolled to the bottom
            (one second apart) to make it load more results.
        name_prefix: text placed between the random prefix and the image
            index in each saved file name.
    """
    driver.get(url)

    # scroll down the page to load more images
    for _ in range(scroll_times):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)

    # select images by their class
    imgs = driver.find_elements(By.XPATH, '//img[@class="YQ4gaf" and not(@class="zr758c")]')

    # get_attribute() gives None for an <img> without a src, and urlretrieve()
    # cannot handle that, so such entries are dropped
    src = [link for link in (img.get_attribute('src') for img in imgs) if link]

    for i, link in enumerate(src):
        urllib.request.urlretrieve(link, f'{random.randint(0, 300)}{random.randint(301, 600)}{name_prefix}{i}.jpg')
        # pause between downloads
        time.sleep(3)


# Customised options can be set up here
options = Options()
options.add_argument('--disable-notifications')

# Use Firefox driver (geckodriver)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome('./chromedriver', chrome_options = options)

# (search url, number of scrolls, file name prefix)
# several searches are run since one set of key words only yields a limited number of related images
searches = [
    (
        'https://www.google.com/search?client=firefox-b-d&sca_esv=cd7e98c4db4a6188&q=russian+tank&udm=2&fbs=AEQNm0AaBOazvTRM_Uafu9eNJJzC3QMRKTS5UIeA1ZwBo3sfIyX5TcvV-swlvSxxjUWCqHappJZq9wnO1J5HgIdDMDYVF4579BCY1rEJ48RGTNr_B0aeIAe_AfRPK0tFzOwvZho25HBsBhTZ4Sxz3rSOTEHos0geP6XlyjkilBZZaqJLvIF2UnMjWS1h1IQqyGhTg4rnY2u4&sa=X&ved=2ahUKEwiAuaWRkZCKAxWLjK8BHVAPEUoQtKgLegQIGBAB&biw=1920&bih=927',
        7,
        'google_images',
    ),
    (
        'https://www.google.com/search?q=pla+tank&client=firefox-b-d&sca_esv=92dcb7bc0db55c59&udm=2&biw=1920&bih=927&ei=KkBWZ7mPM_mN2roPmMDM-AU&ved=0ahUKEwi5zPakvZmKAxX5hlYBHRggE18Q4dUDCBA&uact=5&oq=pla+tank&gs_lp=EgNpbWciCHBsYSB0YW5rMgUQABiABDIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIGEAAYCBgeMgYQABgIGB4yBhAAGAUYHjIGEAAYBRgeSMk9UOAEWOc2cAZ4AJABAJgBKKAB3QKqAQIxMbgBA8gBAPgBAZgCD6AC_AKoAgDCAgsQABiABBixAxiDAcICCBAAGIAEGLEDwgIGEAAYBxgewgIOEAAYgAQYsQMYgwEYigXCAg0QABiABBixAxiDARgKmAMBiAYBkgcCMTWgB6ga&sclient=img',
        50,
        'google_pla_images',
    ),
]

try:
    for url, scroll_times, name_prefix in searches:
        _download_google_images(driver, url, scroll_times, name_prefix)
finally:
    # Close the browser when done, even if the crawl failed half-way
    driver.quit()

---
### Afterwards, we can perform image augmentation
The code below flips the images horizontally

---

In [None]:
# make sure openCV has been installed
# !pip install opencv-python

import cv2
import os

In [None]:

# setting the directory containing images
input_directory = './' # pwd
# assigning a directory for processed images
output_directory = './flipped_images'

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# file extensions treated as images (compared case-insensitively)
image_extensions = ('.jpg', '.png')

# Loop through all files in the input directory
for filename in os.listdir(input_directory):
    # Check for image files
    if not filename.lower().endswith(image_extensions):
        continue

    # Construct full file path (directory + file name)
    img_path = os.path.join(input_directory, filename)

    # Read the image; cv2.imread() returns None instead of raising when the file
    # cannot be decoded, and cv2.flip() would then fail on it
    img = cv2.imread(img_path)
    if img is None:
        print(f'Skipped unreadable image: {img_path}')
        continue

    # Flip the image horizontally
    flipped_img = cv2.flip(img, 1) # 1 indicates horizontal flip

    # You can conduct any other augmentation you wish here

    # Construct output file path (the flipped copy is always written as .jpg)
    output_path = os.path.join(output_directory, f'{os.path.splitext(filename)[0]}_flip.jpg')

    # Save the flipped image
    cv2.imwrite(output_path, flipped_img)

print('Completed')

---

### Other image sources
[This is the stock we labelled (manually and automatically)](https://app.roboflow.com/weasel/tanks-awegp/), _the images are retrieved using the code above_

Apart from the stock we built, we also used some public datasets with labels from Roboflow
* [labelled stock](https://universe.roboflow.com/test-dk4a0/tanks-li2dj-2q89d)
* [another labelled stock](https://universe.roboflow.com/ml-intern-2023/tank-idqsj)
