This notebook's purpose is to scrape an image of each animal and each dinosaur.

In [10]:
# Importing Selenium, the main library that will be used for web scraping
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Opens up a Chrome browser window controlled by Selenium.
# `driver` is read as a global by download_image_faster, so this cell has to
# run before any image is downloaded.
driver = webdriver.Chrome()

In [54]:
# Imports pandas since the needed animal/dino names are in csv's
import pandas as pd

In [73]:
# Function to search for something and download the first picture.
# Assumes that a previous search has already happened and that the images
# tab is already open, so that needs to be done manually the first time
# Will occasionally stop and give an error because a captcha pops up, in which
# case the captcha needs to be done manually and the loop through all the animal
# names will need to be manually started at wherever it left off.
def download_image_faster(animal, timeout=10):
    """Search for ``animal`` and save a screenshot of the first image result.

    Drives the browser held in the global ``driver``. The browser must already
    be showing a results page with the images tab open, because the search is
    typed into that page's search bar.

    Args:
        animal: Text to search for. It is also used as the file name stem: the
            screenshot is written to ``animal + ".png"`` in the current
            working directory.
        timeout: Maximum number of seconds to wait for the first image result
            to show up after the search has been submitted.

    Raises:
        NoSuchElementException: If the search bar cannot be found, or if no
            image result appears within ``timeout`` seconds (for example
            because a captcha is shown instead of the results).
    """
    # Scrolls back up so that the search bar is in view
    ActionChains(driver).scroll_by_amount(0, -100).perform()

    # Finds the search bar, clears the previous search and clicks into it
    search_bar = driver.find_element(By.CSS_SELECTOR, "#APjFqb")
    search_bar.clear()
    search_bar.click()

    # Types organism name and presses enter to search
    ActionChains(driver).send_keys(animal).send_keys(Keys.RETURN).perform()

    # Waits for the first image of the new results instead of looking it up
    # immediately, so a slow page load does not abort the whole run.
    first_image_selector = "#rso > div > div > div.wH6SXe.u32vCb > div > div > div:nth-child(1) > div.czzyk.XOEbc > h3 > a"
    try:
        image = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, first_image_selector))
        )
    except TimeoutException as exc:
        # Keeps the exception type that a plain find_element would raise, but
        # says which search failed and hints at the usual cause.
        raise NoSuchElementException(
            f"No image result found for {animal!r} after {timeout} seconds; "
            "a captcha may need to be solved manually."
        ) from exc

    # Scrolls down to ensure whole image is in view
    ActionChains(driver).scroll_by_amount(0, 100).perform()

    # Takes the screenshot before opening the file, so a failed screenshot
    # does not leave an empty png behind.
    png_bytes = image.screenshot_as_png
    with open(animal + ".png", "wb") as file:
        file.write(png_bytes)

In [83]:
# Reads in the dinosaurs dataset and downloads an image for every dinosaur
# Data is from https://www.kaggle.com/datasets/kjanjua/jurassic-park-the-exhaustive-dinosaur-dataset
# but with added information about weight
dinos = pd.read_csv("jurassicparkwithweights.csv")
# Builds the list of names to search for from the csv itself, so this cell
# also works on a fresh kernel.
# NOTE(review): assumes the dinosaur names are in a "name" column, as in the
# Kaggle dataset linked above - confirm against the csv with the added weights.
names = list(dinos["name"])
for name in names:
    download_image_faster(name)

In [84]:
# Loads the animals dataset and downloads one picture per entry
# of its 'Animal' column
animals = pd.read_csv("Animal-Info.csv")
animal_names = animals['Animal'].tolist()
for ani in animal_names:
    download_image_faster(ani)

Unnamed: 0,Animal,Height (cm),Weight (kg),Color,Lifespan (years),Diet,Habitat,Predators,Average Speed (km/h),Countries Found,Conservation Status,Family,Gestation Period (days),Top Speed (km/h),Social Structure,Offspring per Birth
0,Aardvark,105-130,40-65,Grey,20-30,Insectivore,"Savannas, Grasslands","Lions, Hyenas",40,Africa,Least Concern,Orycteropodidae,210-240,40,Solitary,1
1,Aardwolf,40-50,8-14,Yellow-brown,10-12,Insectivore,"Grasslands, Savannas","Lions, Leopards",24-30,Eastern and Southern Africa,Least Concern,Hyaenidae,90,40,Solitary,2-5
2,African Elephant,270-310,2700-6000,Grey,60-70,Herbivore,"Savannah, Forest","Lions, Hyenas",25,Africa,Vulnerable,Elephantidae,640-660,40,Herd-based,1
3,African Lion,80-110,120-250,Tan,10-14,Carnivore,"Grasslands, Savannas","Hyenas, Crocodiles",58,Africa,Vulnerable,Felidae,98-105,80,Group-based,2-4 (usually)
4,African Wild Dog,75-80,18-36,Multicolored,10-12,Carnivore,Savannahs,"Lions, Hyenas",56,Sub-Saharan Africa,Endangered,Canidae,70,56,Group-based,10-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,Yak,140-160,500-1200,"Brown, Black",20-25,Herbivore,Mountains,"Snow Leopards, Wolves",24,"Himalayas, Central Asia",Least Concern,Bovidae,215-280,24,Group-based,10-50
201,Yellow-Eyed Penguin,60-65,1-3,"Yellow, White",Up to 20,Carnivore,Coastal Areas,"Seals, Orcas",25,New Zealand,Endangered,Spheniscidae,80-90,25,Solitary,1
202,Yeti Crab,Up to 15,Up to 0.5,"White, Hairy",Up to 20,Omnivore,Hydrothermal Vents,Not Applicable,Not Applicable,Pacific Ocean,Not Evaluated,Kiwaidae,Not Applicable,Not Applicable,Solitary,Not Applicable
203,Zebra,220-340,400-900,"Black, White",20-25,Herbivore,Grasslands,"Lions, Hyenas",25,Africa,Least Concern,Equidae,180-365,25,Group-based,5-20


In [93]:
# Special case for the wolverine: searching for just "wolverine" brings up the
# superhero instead of the animal, so "animal" is added to the search text.
# The file is therefore saved as "Wolverine animal.png"; it was renamed by hand
# to "Wolverine.png" afterwards.
download_image_faster("Wolverine animal")