In [None]:
import time
import numpy as np
import urllib.request
import matplotlib.pyplot as plt
import pyautogui
from time import sleep
from PIL import Image
from selenium import webdriver
from IPython.display import clear_output
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [None]:
# Resume numbering from the counter stored on the first line of the index file.
image_index = None  # stays None if the file cannot be opened or parsed
with open("../data/index.txt", "r") as index_file:
    first_line = index_file.readline()
    image_index = int(first_line)

In [None]:
def initialize():
    """
    Start a Chrome session and prepare it for captcha harvesting.

    Publishes the browser as the global ``driver`` and a ``WebDriverWait``
    (timeout 10) bound to it as the global ``waiter``. The page is loaded,
    the window is switched to fullscreen and ``reclick_checkbox()`` is called.
    """
    global driver, waiter
    page_url = "https://patrickhlauke.github.io/recaptcha/"
    # Bind the global right away so the session stays reachable from outside
    # even if one of the following steps raises.
    driver = webdriver.Chrome('../driver/chromedriver.exe')
    driver.get(page_url)
    driver.fullscreen_window()
    waiter = WebDriverWait(driver, 10)
    reclick_checkbox()

def reclick_checkbox():
    """
    Restart the time counter and (re)open the captcha challenge.

    Resets the global ``start_time``, returns to the top-level document,
    clicks the screen at (960, 540) and then the first ``iframe`` on the page,
    waits until the challenge iframe is visible and switches the driver into it.
    """
    global start_time
    challenge_selector = '[title="recaptcha challenge expires in two minutes"]'
    click_x, click_y = 1920 / 2, 1080 / 2  # centre of a 1920x1080 screen

    start_time = time.time()
    driver.switch_to.default_content()
    pyautogui.click(click_x, click_y)

    checkbox_frame = driver.find_element_by_tag_name("iframe")
    checkbox_frame.click()

    challenge_visible = EC.visibility_of_any_elements_located(
        (By.CSS_SELECTOR, challenge_selector)
    )
    waiter.until(challenge_visible)
    challenge_frame = driver.find_element_by_css_selector(challenge_selector)
    driver.switch_to.frame(challenge_frame)

def get_elapsed_time():
    """
    Return the number of seconds since the global ``start_time`` was set.
    """
    now = time.time()
    return now - start_time

def check_table():
    """
    Collect every ``td`` element found under the ``tr`` rows seen by the driver.

    Returns a flat list ordered by row, then by cell within each row.
    """
    return [
        cell
        for row in driver.find_elements_by_tag_name("tr")
        for cell in row.find_elements_by_tag_name("td")
    ]

def reload():
    """
    Refresh the captcha image by clicking the first ``button`` element.
    """
    refresh_button = driver.find_element_by_tag_name("button")
    refresh_button.click()

def img_element_with_id(id):
    """
    Return the first ``img`` element whose ``class`` attribute equals ``id``.

    Despite the parameter name, the value is matched against the element's
    ``class`` attribute (e.g. ``"rc-image-tile-33"``), not its HTML id.
    Returns ``None`` when no image matches.
    """
    for image in driver.find_elements_by_tag_name("img"):
        # Compare against the requested value; the class name used to be
        # hard-coded here, which made the argument a no-op.
        if image.get_attribute("class") == id:
            return image
    return None

def find_3x3_captcha():
    """
    Keep reloading until a captcha of the right size and type (3x3) is shown.

    A 3x3 captcha is recognised by an image with class ``rc-image-tile-33``.
    While none is present the captcha is reloaded, followed by a one-second
    pause. Returns the result of ``check_table()`` once one is found.
    """
    tile_class = "rc-image-tile-33"
    while img_element_with_id(tile_class) is None:
        reload()
        sleep(1)
    return check_table()

In [None]:
# IMAGE FUNCTIONS
def download_image():
    """
    Download the current 3x3 captcha image and open it.

    The ``src`` of the element with class ``rc-image-tile-33`` is retrieved to
    ``../data/solver/captcha.jpeg``; the saved file is then opened with
    ``Image.open`` and returned.
    """
    target_path = "../data/solver/captcha.jpeg"
    tile = driver.find_element_by_class_name("rc-image-tile-33")
    image_url = tile.get_attribute("src")
    urllib.request.urlretrieve(image_url, target_path)
    return Image.open(target_path)

def show_image(img):
    """
    Display an image with matplotlib, with the axis ticks hidden.
    """
    pixels = np.array(img)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(pixels)
    plt.show()

def process_image(img):
    """
    Convert the image to RGB and resize it to a shape of (100, 100, 3).

    Every mode other than ``"RGB"`` (RGBA, grayscale, palette, ...) is
    converted, so the result always has three channels as documented.
    Returns the new image; the input is not modified in place.
    """
    # Converting only "RGBA" would let other modes through with the wrong
    # number of channels.
    if img.mode != "RGB":
        img = img.convert("RGB")
    return img.resize((100, 100))

def crop_image(image: "Image.Image", grid_size: int = 3) -> list:
    """
    Crop a captcha image into a list of ``grid_size * grid_size`` tiles.

    With the default ``grid_size`` of 3 a 3x3 image yields 9 tiles. Tiles are
    produced row by row (left to right, top to bottom) and each one is passed
    through ``process_image`` before being returned.

    ``image`` must expose ``size`` and ``crop`` like a PIL image. Tile sizes
    use floor division, so leftover pixels on the right and bottom edges are
    dropped when a dimension is not divisible by ``grid_size``.
    """
    width, height = image.size
    tile_width = width // grid_size
    tile_height = height // grid_size
    tiles = []
    for row in range(grid_size):
        for col in range(grid_size):
            # Box is (left, upper, right, lower) in pixel coordinates.
            box = (
                col * tile_width,
                row * tile_height,
                (col + 1) * tile_width,
                (row + 1) * tile_height,
            )
            tiles.append(process_image(image.crop(box)))
    return tiles

In [None]:
def download_images(n: int):
    """
    Download ``n`` captchas and save their tiles as numbered JPEG files.

    Each captcha is 3x3, so ``n * 9`` images are written in total. Files are
    named after the global ``image_index``, which is advanced after every
    save and written back to ``../data/index.txt`` after every captcha. The
    browser window is closed once all captchas have been processed.
    """
    global image_index
    initialize()
    for _ in range(n):
        find_3x3_captcha()
        tiles = crop_image(download_image())
        for tile in tiles:
            tile.save("../data/images/%d.jpeg" % (image_index))
            image_index += 1
        # Click the checkbox again once more than 90 seconds have passed since
        # the last click (presumably to stay ahead of the challenge expiry).
        needs_reclick = get_elapsed_time() > 90
        if needs_reclick:
            reclick_checkbox()
        # Persist the counter after each captcha so numbering can resume later.
        with open("../data/index.txt", "w") as index_file:
            index_file.write(str(image_index))
        reload()
        sleep(0.5)
    driver.close()

In [None]:
# PLEASE ENSURE STABLE INTERNET CONNECTION
# Runs batches of 100 captchas until the kernel is interrupted. When a batch
# fails, the browser window is closed and the next iteration starts a new one.
while True:
    try:
        download_images(100)
    except Exception as error:
        # Report the failure instead of swallowing it silently, so repeated
        # errors (network drops, page layout changes) are visible in the output.
        print("download_images failed (%s: %s); closing the browser and retrying"
              % (type(error).__name__, error))
        driver.close()