# Scraping with Python Selenium
**By Jayden Nyamiaka**

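In this notebook, we will get started with web scraping using Selenium's Python bindings. Selenium tends to be more robust and flexible than Beautiful Soup because it drives a real browser, so it can interact with pages and scrape content that is rendered by JavaScript. For this exercise, we will scrape Wikipedia and TinEye.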

In [1]:
# Imports
import pandas as pd
import requests

from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Launch a Chrome session that every later cell drives through `driver`.
# Default ChromeOptions and a default Service are used; no driver path is
# given here (ChromeDriverManager is imported above but not used).
options = webdriver.ChromeOptions()
service = Service()
driver = webdriver.Chrome(options=options, service=service)

In [3]:
# NAVIGATE TO WIKIPEDIA AND SEARCH FOR CALTECH

# Navigate to Wikipedia
wikipedia_url = "https://en.wikipedia.org/wiki/Main_Page"
driver.get(wikipedia_url)

# Toggle the search bar to be visible if it isn't already.
# This is deliberately best effort: on layouts where the toggle is missing or
# not clickable, the search bar is expected to be usable already. Only
# Selenium errors are ignored so that KeyboardInterrupt and programming
# mistakes are not silently swallowed.
try:
    search_toggle = driver.find_element(By.CLASS_NAME, "search-toggle")
    search_toggle.click()
except WebDriverException:
    pass

# Get the search bar element (accounting for the refresh)
# Wait until the element is stale and then visible again
search_bar = driver.find_element(By.NAME, "search")
try:
    WebDriverWait(driver, 10).until(EC.staleness_of(search_bar))
    search_bar = WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
        (By.NAME, "search")))
except TimeoutException:
    # No refresh happened within the timeout (or the new element never became
    # visible), so fall back to whatever search bar is currently in the DOM.
    search_bar = driver.find_element(By.NAME, "search")

# Search Caltech in the search bar
search_bar.send_keys("Caltech")

# Click the search button
search_button = driver.find_element(By.CLASS_NAME, "cdx-search-input__end-button")
search_button.click()

In [4]:
# NAVIGATE TO SCIAC PAGE AND SCRAPE CHRONOLOGICAL TIMELINE

# Find and click on the SCIAC sporting affiliations link.
# Wait for the link explicitly so this cell does not race the page load that
# the previous cell's search click kicked off.
sciac_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
    (By.PARTIAL_LINK_TEXT, "Southern California Intercollegiate Athletic Conference")))
sciac_link.click()

# The clicked link goes stale once the browser has left the Caltech page;
# waiting for that ensures the lookup below runs against the SCIAC page.
WebDriverWait(driver, 10).until(EC.staleness_of(sciac_link))

# Scrape the Chronological Timeline and make a df with the year and description of every element
# NOTE(review): this absolute XPath depends on the exact page layout and will
# break if Wikipedia changes its skin or the article structure -- confirm it
# still points at the timeline list before relying on the output.
timeline_list = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.XPATH, "/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[1]/ul[1]")))
timeline_elems = timeline_list.find_elements(By.TAG_NAME, 'li')

years = []
descriptions = []

for elem in timeline_elems:
    text = elem.text
    # Assumes every entry starts with a 4-character year followed by a
    # 3-character separator (e.g. " - "); the rest is the description.
    years.append(text[:4])
    descriptions.append(text[7:])

df_timeline = pd.DataFrame({"years": years, "description": descriptions})
display(df_timeline)

Unnamed: 0,years,description
0,1915,The Southern California Intercollegiate Athlet...
1,1920,The Southern Branch of the University of Calif...
2,1926,La Verne College (now the University of La Ver...
3,1927,"UCLA left the SCIAC, effective after the 1926-..."
4,1931,Santa Barbara State College (now the Universit...
5,1934,"Caltech and Pomona left the SCIAC, effective a..."
6,1938,"La Verne and UC Santa Barbara left the SCIAC, ..."
7,1938,"Caltech and Pomona re-joined the SCIAC, effect..."
8,1939,"San Diego State left the SCIAC, effective afte..."
9,1943,"Whittier left the SCIAC, effective after the 1..."


In [5]:
# GO BACK TO CALTECH PAGE, SAVE AND REVERSE SEARCH IMAGE OF TOLMAN AND EINSTEIN,
# AND SCRAPE THE RESULTS

# Go back to Caltech Wikipedia page
driver.back()

# Find and download the image of Tolman and Einstein at Caltech
# NOTE(review): `figure[5]` is positional and will point at a different image
# if figures are added to or removed from the article -- confirm before reuse.
img = WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@id="mw-content-text"]/div[1]/figure[5]/a/img')))
img_url = img.get_attribute("src")

# Bound the request with a timeout and fail loudly on an HTTP error so that an
# error page is never written to disk under a .jpg name.
img_resp = requests.get(img_url, timeout=30)
img_resp.raise_for_status()
img_filename = "Richard C. Tolman and Albert Einstein at Caltech, 1932.jpg"
with open(img_filename, "wb") as f:
    f.write(img_resp.content)

# Navigate to Tineye (for reverse image searching)
tineye_url = "https://tineye.com/"
driver.get(tineye_url)

# Search for the image url in the search box and click the submit button
url_box = WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
    (By.ID, "url_box")))
url_box.send_keys(img_url)
submit_button = driver.find_element(By.ID, "url_submit")
submit_button.click()

# Scrape the results and put them into a dataframe
websites = []
urls = []

# Loop through all pages of results
are_more_results = True
while are_more_results:
    # Wait until results have loaded in
    results = WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
        (By.CLASS_NAME, "results")
    ))
    # Extract the website and links from each result block
    for result_elem in results.find_elements(By.CLASS_NAME, "match"):
        website = result_elem.find_element(By.TAG_NAME, "h4").text
        data_elem = result_elem.find_element(By.TAG_NAME, "p")

        # Anchors without an href yield None, which would break the join below
        hrefs = (a.get_attribute("href") for a in data_elem.find_elements(By.TAG_NAME, "a"))
        links = [href for href in hrefs if href]

        websites.append(website)
        urls.append(", ".join(links)) # Concatenate all links as comma-separated string

    # Click the next page button if not on the last page
    row_div = driver.find_element(By.CLASS_NAME, "sorting-row")
    current_page = row_div.get_attribute("currentpage")
    are_more_results = current_page != row_div.get_attribute("totalpages")
    if are_more_results:
        driver.find_element(By.CLASS_NAME, "next").click()
        # Block until the pager reports a different page; otherwise the next
        # iteration could re-read the results that are still on screen. The
        # pager may be re-rendered while polling, so stale references are
        # ignored and simply retried.
        WebDriverWait(
            driver, 10, ignored_exceptions=(StaleElementReferenceException,)
        ).until(
            lambda d: d.find_element(By.CLASS_NAME, "sorting-row")
            .get_attribute("currentpage") != current_page
        )

df_websites = pd.DataFrame({"website": websites, "urls": urls})
print("Number of websites: " + str(len(websites)))
display(df_websites)


Number of websites: 110


Unnamed: 0,website,urls
0,www.alamy.com,https://www.alamy.com/stock-image-tolman-26-ei...
1,www.yoka.com,http://www.yoka.com/dna/li/c172.html
2,fa.wikipedia.org,https://fa.wikipedia.org/wiki/%D8%B1%DB%8C%DA%...
3,www.yoka.com,http://www.yoka.com/dna/m/t54262
4,wemedia.ifeng.com,http://wemedia.ifeng.com/8815421/wemedia.shtml
...,...,...
105,wildup.la,http://wildup.la/events/past-events/
106,blogs.mediapart.fr,https://blogs.mediapart.fr/edition/la-mendie-b...
107,defenceforumindia.com,https://defenceforumindia.com/threads/indian-s...
108,linkiesta.it,http://linkiesta.it/cultura
