# Scraping Digitec Time Series Data

In [1]:
import pandas as pd
import numpy as np
from plotnine import *
from mizani.formatters import comma_format
import re
import time

from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumwire.utils import decode
from selenium.webdriver.common.keys import Keys
import json
from tqdm.notebook import tqdm

from selenium.webdriver.chrome.service import Service

%load_ext blackcellmagic

## Get Smartphone URLs

In [None]:
# Start a Chrome session using the `webdriver` imported from selenium-wire above,
# and maximise the window before loading any page.
driver = webdriver.Chrome()
driver.maximize_window()

In [None]:
# Smartphone listing; the `filter` query parameter decodes to
# t_off=InStock|Sale|Retail.
smartphone_listing_url = "https://www.digitec.ch/de/s1/producttype/smartphone-24?filter=t_off%3DInStock%7CSale%7CRetail"
driver.get(smartphone_listing_url)

127.0.0.1:62736: Traceback (most recent call last):
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\server\server.py", line 113, in handle
    root_layer()
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\server\modes\http_proxy.py", line 9, in __call__
    layer()
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\server\protocol\tls.py", line 285, in __call__
    layer()
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\server\protocol\http1.py", line 100, in __call__
    layer()
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\server\protocol\http.py", line 206, in __call__
    if not self._process_flow(flow):
  File "C:\Users\mathi\AppData\Roaming\Python\Python310\site-packages\seleniumwire\thirdparty\mitmproxy\se

In [None]:
# Read the listing heading and turn its first five characters into a number:
# everything except digits, "." and "-" is stripped before the conversion.
listing_heading = driver.find_element(
    By.XPATH, '//*[@id="productListingContainer"]/div[1]/h2'
).text
heading_digits = re.sub(r'[^0-9.-]', '', listing_heading[0:5])
available_devices = pd.to_numeric(heading_digits)
available_devices

1957

Start by clicking "Mehr anzeigen" ("show more") until the end of the page is reached.

Then scroll down to the bottom of the page and then start scrolling up slowly while iteratively updating product ID list and dropping duplicates:

In [None]:
# Expand the listing: keep pressing "Mehr anzeigen" until the button can no
# longer be found or clicked.
while True:
    try:
        driver.find_element(By.XPATH, "//*/button[text()='Mehr anzeigen']").click()
    except Exception:
        # `Exception` (not a bare `except`) so that interrupting the kernel
        # still stops the cell instead of being swallowed here.
        break
    # Actually wait between clicks. The original only computed the random
    # delay and discarded it, so the button was clicked with no pause at all.
    time.sleep(np.random.uniform(low=0.5, high=1.5))

scroll_increment_size = 200
article_numbers = []
product_link_xpath = '//*[@id="productListingContainer"]/div[4]/article/a'

# Jump to the bottom once to learn the full page height, then walk back up in
# fixed-size steps.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
page_height = driver.execute_script("return document.body.scrollHeight")
times_to_scroll = int(np.floor(page_height / scroll_increment_size))

for scroll in np.arange(1, times_to_scroll):
    # Scroll up and wait to load
    driver.execute_script(f"window.scrollTo(0, {page_height - scroll*scroll_increment_size})")
    time.sleep(1)

    # Collect the product links currently found by the XPath. The ID is the
    # part of the href after its last "-"; links without an href or without a
    # "-" are skipped instead of raising.
    visible_hrefs = [
        link.get_attribute("href")
        for link in driver.find_elements(By.XPATH, product_link_xpath)
    ]
    article_numbers.extend(
        href.rsplit("-", 1)[1] for href in visible_hrefs if href and "-" in href
    )

    # Drop duplicates while keeping first-seen order
    article_numbers = list(dict.fromkeys(article_numbers))
    print(f"{len(article_numbers)} of {available_devices} logged")

25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
25 of 1957 logged
26 of 1957 logged
26 of 1957 logged
29 of 1957 logged
29 of 1957 logged
29 of 1957 logged
32 of 1957 logged
32 of 1957 logged
35 of 1957 logged
35 of 1957 logged
38 of 1957 logged
38 of 1957 logged
41 of 1957 logged
41 of 1957 logged
41 of 1957 logged
44 of 1957 logged
44 of 1957 logged
47 of 1957 logged
47 of 1957 logged
50 of 1957 logged
50 of 1957 logged
50 of 1957 logged
53 of 1957 logged
53 of 1957 logged
56 of 1957 logged
56 of 1957 logged
59 of 1957 logged
59 of 1957 logged
62 of 1957 logged
62 of 1957 logged
62 of 1957 logged
65 of 1957 logged
65 of 1957 logged
68 of 1957 logged
68 of 1957 logged
71 of 1957 logged
71 of 1957 logged
71 of 1957 logged
74 of 1957 logged
74 of 1957 logged
77 of 1957 logged
77 of 1957 logged
80 of 1957 logged
80 of 1957 logged
83 of 1957

In [None]:
# Persist the collected IDs as a single-column CSV (no index column)
product_ids_frame = pd.DataFrame({"product_id": article_numbers})
product_ids_frame.to_csv("product_ids.csv", index=False)

## Looping Through Item URLs

- Read in list of product IDs and loop through them
    - Read in general device data
    - Read in price development

Product IDs that contain "shid" need to be removed first:

In [8]:
# Load the saved IDs, keep only those without "shid" in them, and sort them
saved_product_ids = pd.read_csv("product_ids.csv")["product_id"]
article_numbers = sorted(pid for pid in saved_product_ids if "shid" not in pid)
print(f"First five product IDs: {article_numbers[:5]}")
len(article_numbers)

First five product IDs: ['10265229', '10274134', '10421653', '10676219', '11359513?supplier=406802']


1887

### Scrape the actual information in a loop:

In [None]:
# Resume support: load whatever an earlier run already wrote to disk.
try:
    devices_specs = pd.read_csv("device_specs.csv")
except OSError:
    devices_specs = None

try:
    price_history = pd.read_csv("price_history.csv")
except OSError:
    price_history = None

# Product IDs that already have rows in device_specs.csv are skipped.
# NOTE: only the specs file is consulted, so a product whose specs were saved
# but whose price history failed is not retried on a later run.
if devices_specs is not None:
    scraped_already = devices_specs.product_id.astype(str).tolist()
else:
    scraped_already = []
# Set for cheap membership tests (the specs file has many rows per product).
scraped_lookup = set(scraped_already)

# All IDs without "shid", sorted, minus the ones scraped already. IDs are
# compared as strings, matching how `scraped_already` is built.
all_product_ids = pd.read_csv("product_ids.csv")["product_id"].astype(str).tolist()
article_numbers = sorted(
    candidate
    for candidate in all_product_ids
    if "shid" not in candidate and candidate not in scraped_lookup
)

CENTER_IN_VIEW_JS = "arguments[0].scrollIntoView({'block':'center','inline':'center'})"

# Candidate selectors for the "show more" button, tried in this order.
# Raw strings keep the backslash before ":" without relying on an invalid
# escape sequence in a normal string literal.
SHOW_MORE_SELECTORS = (
    r"#\:R47amekl6d1i6\: > button",
    r"#\:r2\: > button",
    r"#\:r1\: > button",
)


def _click_show_more(driver):
    """Centre and click the first "show more" button that works.

    Tries each selector in SHOW_MORE_SELECTORS in order and stops at the first
    one that can be found, scrolled into view and clicked.

    Returns:
        True if a button was clicked, False if every selector failed.
    """
    for selector in SHOW_MORE_SELECTORS:
        try:
            button = driver.find_element(By.CSS_SELECTOR, selector)
            driver.execute_script(CENTER_IN_VIEW_JS, button)
            button.click()
            return True
        except Exception:
            continue
    return False


for product_id in tqdm(article_numbers):

    # ---------- GET ITEM INFORMATION ----------
    # One browser per product; the `finally` below guarantees it is shut down
    # on every path (skip, error, or success).
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        # Clear cache
        driver.get('chrome://settings/clearBrowserData')
        driver.find_element(By.XPATH, '//settings-ui').send_keys(Keys.ENTER)

        # Navigate to product; skip this product if the page cannot be loaded
        try:
            driver.get(f"https://www.digitec.ch/{product_id}")
        except Exception:
            continue
        time.sleep(np.random.uniform(low=6, high=8))

        # Get item header (raises if the selector no longer matches)
        object_title = driver.find_element(
            By.CSS_SELECTOR, '#pageContent > div > div.sc-c3y39x-0.gCRoGO > div.sc-dz9g2r-0.cMwrl > div > div.sc-18ppxou-0.irUZoQ > div > div.sc-18ppxou-2.hfPLwm > h1'
        ).text

        # Scroll down to additional information in small steps and wait
        scroll_level = 100
        while scroll_level < 1200:
            driver.execute_script(f"window.scrollTo(0, {scroll_level})")
            time.sleep(np.random.uniform(low=0, high=0.1))
            scroll_level += 100
        time.sleep(np.random.uniform(low=1, high=2))

        # Open up additional information (best effort) and wait
        _click_show_more(driver)
        time.sleep(np.random.uniform(low=1, high=2))

        # Collect every two-cell table row as a key/value pair
        specs_dict = {"Titel": object_title}
        for table in driver.find_elements(By.TAG_NAME, "tbody"):
            for row in table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) == 2:
                    specs_dict[cells[0].text] = cells[1].text

        # Long-format rows (Header, Content, product_id) appended to previous results
        product_specs = pd.DataFrame(
            specs_dict.items(), columns=["Header", "Content"]
        ).assign(product_id=product_id)
        if devices_specs is None:
            devices_specs = product_specs
        else:
            devices_specs = pd.concat([devices_specs, product_specs], axis=0)

        # Write interim results to csv so an interrupted run can be resumed
        devices_specs.to_csv("device_specs.csv", index=False)

        # ---------- GET PRICE HISTORY ----------

        # Scroll down in 100px steps until the price block can be clicked,
        # giving up after 30 seconds
        scroll_level = 100
        scroll_timeout = time.time() + 30   # 30 seconds
        while time.time() < scroll_timeout:
            try:
                driver.find_element(By.ID, "priceHistoryBlock").click()
                driver.execute_script(CENTER_IN_VIEW_JS, driver.find_element(By.ID, "priceHistoryBlock"))
                break
            except Exception:
                driver.execute_script(f"window.scrollTo(0, {scroll_level})")
                scroll_level += 100
        time.sleep(np.random.uniform(low=5, high=10))

        # Open up "all time" (best effort), wait until loaded (takes longer here)
        try:
            driver.find_element(
                By.XPATH, '//*[@id=":R55amekl6d1i6:"]/div/ul/li[2]/button'
            ).click()
        except Exception:
            pass
        time.sleep(np.random.uniform(low=2, high=3))

        try:
            # Use the first captured request whose URL mentions the price history.
            # NOTE(review): if the page fires one request on load and another
            # after the "all time" click, this picks the earlier one - confirm
            # that this is the intended response.
            price_history_request = next(
                (
                    request
                    for request in driver.requests
                    if "pdp-price-history" in request.url
                ),
                None,
            )
            if price_history_request is None or price_history_request.response is None:
                # Nothing captured for this product: skip its price history
                continue

            price_history_response = price_history_request.response
            price_history_body = decode(
                price_history_response.body,
                price_history_response.headers.get("Content-Encoding", "identity"),
            )
            parsed_data = json.loads(price_history_body.decode("utf-8"))
            points = parsed_data[0]["data"]["priceHistory"]["points"]

            product_prices = (
                pd.DataFrame(points)
                .drop(["type", "__typename"], axis=1)
                .assign(product_id=product_id)
                .dropna()
            )
        except Exception:
            # Unexpected payload or capture problem: skip this product's prices
            continue

        if price_history is None:
            price_history = product_prices
        else:
            price_history = pd.concat([price_history, product_prices], axis=0)

        # Write price history to csv for interim results
        price_history.to_csv("price_history.csv", index=False)
    finally:
        # quit() ends the whole browser session; close() would only close the
        # current window.
        driver.quit()

### Read in the Raw Data

In [2]:
# Load the long-format specs and keep the first row for each
# (Header, product_id) combination
specs = pd.read_csv("device_specs.csv")
specs = specs.drop_duplicates(subset=["Header", "product_id"])
specs

Unnamed: 0,Header,Content,product_id
0,Titel,Honor View 20,10265229
1,Arbeitsspeicher,6 GB,10265229
2,Speicherkapazität,128 GB,10265229
3,Bildschirmdiagonale (Zoll),"6.40""",10265229
4,Pixelauflösung,2310 x 1080 Pixels,10265229
...,...,...,...
101556,Lieferumfang,"Kurzanleitung, Ladegerät, USB Kabel",9634216
101557,Höhe,4.50 cm,9634216
101558,Breite,10 cm,9634216
101559,Länge,19 cm,9634216


In [3]:
# Long -> wide: one row per product_id, one column per Header value.
# NOTE: `specs` is rebound here, so this cell must run exactly once after the
# cell that loads the long-format frame.
specs_by_product = specs.pivot(index="product_id", columns="Header", values="Content")
specs = specs_by_product.reset_index().rename_axis(None, axis=1)

In [7]:
# Summarise the wide specs frame: row count, column count and dtypes
specs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Columns: 104 entries, product_id to Zustand
dtypes: object(104)
memory usage: 1.5+ MB


In [40]:
# Load the price history CSV written by the scraping loop and display it
prices = pd.read_csv("price_history.csv")
prices

Unnamed: 0,amountIncl,amountExcl,validFrom,product_id
0,429.0,398.33,2022-08-09T12:00:00Z,10265229
1,429.0,398.33,2022-08-10T12:00:00Z,10265229
2,427.0,396.47,2022-08-11T12:00:00Z,10265229
3,427.0,396.47,2022-08-12T12:00:00Z,10265229
4,422.0,391.83,2022-08-16T12:00:00Z,10265229
...,...,...,...,...
147933,465.0,431.75,2023-08-08T12:00:00Z,9581555
147934,653.0,606.31,2023-08-13T12:00:00Z,9581555
147935,461.0,428.04,2023-08-25T12:00:00Z,9581555
147936,337.0,312.91,2023-07-19T12:00:00Z,9634216
