_Note: A recent Google Colab update requires additional dependencies to be installed before Selenium will work. Simply run the cell below to install them. If you want to reuse the installation code in your future projects, click the "Show code" button._

In [None]:
#@title RUN THIS CELL TO INSTALL THE NECESSARY DEPENDENCIES

%%shell
# Ubuntu no longer distributes chromium-browser outside of snap, so this cell
# installs chromium and chromium-driver from the Debian buster repositories.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster (each entry is verified against its own keyring file, created below)
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys: fetch the three signing keys from the keyserver
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by "signed-by" above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
# -y answers the confirmation prompt automatically so the cell can run
# unattended (e.g. with "Run all") instead of waiting for keyboard input.
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

In [None]:
!pip install selenium

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

In [3]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings from the libraries used below.
warnings.simplefilter("ignore")

## **Functions to start and close the driver**

In [4]:
# Global driver to use throughout the script
DRIVER = None

# Wrapper to shut down the driver if it has been created
def close_driver():
    """Shut down the shared WebDriver session, if any, and reset ``DRIVER``.

    Does nothing when no driver is active. ``DRIVER`` is set back to ``None``
    even if shutting the browser down raises, so a later
    ``initialize_driver(force_restart=True)`` is never blocked by a stale
    reference; the exception itself still propagates to the caller.
    """
    global DRIVER
    if DRIVER is None:
        return
    try:
        # quit() ends the whole WebDriver session and closes every window;
        # close() would only close the current window and leave the session open.
        DRIVER.quit()
    finally:
        DRIVER = None

# Function to (re)start driver
def initialize_driver(force_restart=False):
    """Make sure the shared headless Chrome driver in ``DRIVER`` is running.

    Args:
        force_restart: When True, any existing driver is shut down via
            ``close_driver()`` and a new one is created. When False, an
            already-running driver is kept as-is and a new one is created
            only if ``DRIVER`` is ``None``.

    Returns:
        None. The driver is stored in the module-level ``DRIVER``.
    """
    global DRIVER

    if force_restart:
        close_driver()
    elif DRIVER is not None:
        # A driver already exists and no restart was requested: reuse it rather
        # than overwriting the reference and orphaning the running browser.
        return

    # Setting up the driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless') # run in the background without opening a browser window
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

In [228]:
# Start a fresh headless Chrome session (any existing one is closed first)
initialize_driver(force_restart=True)

In [234]:
DRIVER

<selenium.webdriver.chrome.webdriver.WebDriver (session="6a19f896f98e8a72dae2381a282780af")>

In [225]:
#close_driver()

## **Set up links**

In [232]:
# Catalog listing page to scrape (the `type=bracelet` query)
url = 'https://www.goodmanandsons.com/catalog/?type=bracelet'

* 1/ Function to get information from one product

⚠️ **NOTE:** Sometimes the web element returned by the driver can be faulty because of the way the website is set up. In that case, calling `.text` on the element returns an empty string even though text is visible inside the element when you check it manually with Inspect. When this happens, use `.get_attribute('innerHTML')` instead of `.text`.

In [233]:
# Load the catalog page, then display the URL the browser is currently on
DRIVER.get(url)
DRIVER.current_url

'https://www.goodmanandsons.com/catalog/?type=bracelet'

In [197]:
# Collect every element on the page that carries the 'item-meta' class
all_product = DRIVER.find_elements(By.CLASS_NAME,'item-meta')

In [198]:
len(all_product)

24

In [199]:
product = all_product[1]

In [215]:
def get_one_product(one_product):
  """Extract id, name, brand, price and link from a single product element.

  The id and brand are parsed out of the `onclick` attribute of the child
  element with class 'iactfilt'. That attribute is split on newlines; the id
  is read from the line at index 4 and the brand from the line at index 7.
  Name and link come from the same child element, and the price from the
  child element with class 'price'.

  Args:
    one_product: a web element containing the 'iactfilt' and 'price' children.

  Returns:
    dict with the keys 'id', 'name', 'brand', 'price', 'link' (in that order).
  """
  anchor = one_product.find_element(By.CLASS_NAME, 'iactfilt')
  onclick_lines = anchor.get_attribute('onclick').split('\n')

  # brand: take the text after the last ':' and trim quotes, spaces and commas
  brand_fragment = re.search(r'brand.*', onclick_lines[7]).group()
  brand = brand_fragment.split(':')[-1].strip("' ,'")

  # link and display name both come straight from the anchor element
  href = anchor.get_attribute('href')
  label = anchor.text

  # id: first "<digits>-<digits>" run on the id line
  product_id = re.search(r'\d*-\d*', onclick_lines[4]).group()

  # price
  price_text = one_product.find_element(By.CLASS_NAME, 'price').text

  return {'id': product_id,
          'name': label,
          'brand': brand,
          'price': price_text,
          'link': href}

In [216]:
# Try the single-product parser on the second element collected from the page
get_one_product(all_product[1])

{'id': '440-00187',
 'name': 'Bracelet',
 'brand': 'Gabriel & Co',
 'price': '$875',
 'link': 'https://www.goodmanandsons.com/catalog/gabriel-co/440-00187/'}

In [217]:
def get_all_product(webnlink, wait_seconds=3):
  """Load a catalog page and extract the details of every product on it.

  Args:
    webnlink: URL of the page to open with the shared ``DRIVER``.
    wait_seconds: how long to pause after requesting the page before looking
      for product elements. Defaults to 3, the previously hard-coded value.

  Returns:
    A list with one dict per successfully parsed product, as returned by
    ``get_one_product``. Elements that fail to parse are skipped; the reason
    is printed so failures are visible instead of silently dropped.
  """
  DRIVER.get(webnlink)

  all_data = []
  # Pause before querying the page for product elements
  time.sleep(wait_seconds)

  all_product = DRIVER.find_elements(By.CLASS_NAME,'item-meta')

  for index, item in enumerate(all_product):
    try:
      all_data.append(get_one_product(item))
    except Exception as error:
      # Best-effort scrape: keep going, but report which element failed and why
      print(f'No!!! Skipped product {index}: {error!r}')
  return all_data

In [218]:
# Scrape the catalog page; the driver is shut down afterwards even if the
# scrape raises, so no browser session is left running.
try:
    needed_info = get_all_product(url)
finally:
    close_driver()

In [219]:
# Show the scraped products as a table. The columns are listed explicitly (the
# keys produced by get_one_product) so an empty scrape yields an empty table
# instead of an IndexError from needed_info[0].
pd.DataFrame(data=needed_info, columns=['id', 'name', 'brand', 'price', 'link'])

Unnamed: 0,id,name,brand,price,link
0,170-00431,14KTT 7IN 0.27TDW TWISTED ROPE OVAL LINK CONNE...,Gabriel & Co,"$2,950",https://www.goodmanandsons.com/catalog/gabriel...
1,440-00187,Bracelet,Gabriel & Co,$875,https://www.goodmanandsons.com/catalog/gabriel...
2,330-00053,Bracelet,Gabriel & Co,"$1,225",https://www.goodmanandsons.com/catalog/gabriel...
3,330-00052,Bracelet,Gabriel & Co,"$1,225",https://www.goodmanandsons.com/catalog/gabriel...
4,610-01466,925 Sterling Silver Filigree and Pearl Station...,Gabriel & Co,$280,https://www.goodmanandsons.com/catalog/gabriel...
5,240-00045,Bracelet,Goodman & Sons Signature,"$4,499.95",https://www.goodmanandsons.com/catalog/goodman...
6,170-00433,Bracelet,Goodman & Sons Signature,"$5,999.95",https://www.goodmanandsons.com/catalog/goodman...
7,240-00046,Bracelet,Goodman & Sons Signature,"$4,499.95",https://www.goodmanandsons.com/catalog/goodman...
8,170-00434,Bracelet,Goodman & Sons Signature,"$6,999.95",https://www.goodmanandsons.com/catalog/goodman...
9,170-00432,Bracelet,Goodman & Sons Signature,"$6,499.95",https://www.goodmanandsons.com/catalog/goodman...
