# Imports

In [1]:
import requests
import bs4
import json
import time
from PIL import Image
import io
import pandas as pd
import os

# Extract web content

In [2]:
# Listing page whose anchors are later scanned for product links.
url = 'https://www.tilbudsugen.dk/partner/netto-114?page=100'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
time.sleep(2)  # short pause before the follow-up requests made to the same host
if response.status_code != 200:
    # Raise explicitly instead of `assert False`: assertions are removed when
    # Python runs with -O, which would let a failed request slip through.
    raise RuntimeError(f"Request failed with status code {response.status_code}")
print("Request was successful.")

# Parse the returned HTML with the stdlib-backed parser.
soup = bs4.BeautifulSoup(response.text, 'html.parser')

Request was successful.


# Get product ids

In [8]:
# Every anchor on the listing page that carries an href attribute.
links = soup.find_all('a', href=True)
# Keep only absolute links that point at a single-product page.
product_links = [
    link['href']
    for link in links
    if link['href'].startswith('https://www.tilbudsugen.dk/single/')
]
# The product id is the last path segment of the link. dict.fromkeys drops
# repeated ids while preserving first-seen order, so a product that is linked
# more than once on the page is only requested once later on.
all_ids = list(dict.fromkeys(int(link.split('/')[-1]) for link in product_links))

# Get product info

In [9]:
# Maps product id -> list of fields, in the same order as the column list
# used when the DataFrame is built from this dict.
products = {}
N = len(all_ids)
for i, data_id in enumerate(all_ids):
    # Progress message every tenth product.
    if i % 10 == 0:
        print(f"Processing #{i} of {N}")
    # NOTE(review): the segment after /_next/data/ looks like a site build
    # identifier and may stop working after a redeploy — confirm and refresh
    # it if these requests start failing.
    data_url = f"https://www.tilbudsugen.dk/_next/data/0LbXUdvz48Lb0tgkd4pVT/dk/single/{data_id}.json?id={data_id}"
    # A timeout prevents one stalled request from hanging the whole run.
    response = requests.get(data_url, timeout=30)
    time.sleep(1)  # pause between consecutive requests
    if response.status_code != 200:
        # Explicit raise: `assert` would be stripped under `python -O`.
        raise RuntimeError(f"Request failed with status code {response.status_code}")
    page_props = response.json()['pageProps']
    offer = page_props['offer']

    price        = offer['price']
    brand        = offer['brand']['name']
    category     = offer['productVariant']['category']['name']
    product_name = offer['productName']['productName']
    units        = offer['units']
    # NOTE(review): eval() executes whatever expression the remote server
    # sends. It is kept so existing parsing behaviour does not change, but it
    # should be replaced with a strict numeric parser once the exact format
    # of `quantity` has been confirmed.
    quantity     = int(eval(offer['quantity']))
    unit_type    = offer['unitType']
    store_name   = offer['chain']['name']
    image_url    = offer['imageUrl']
    start_date   = offer['startDate']
    end_date     = offer['endDate']

    products[data_id] = [price, brand, category, product_name, units, quantity, unit_type, store_name, image_url, start_date, end_date]

Processing #0 of 393
Processing #10 of 393
Processing #20 of 393
Processing #30 of 393
Processing #40 of 393
Processing #50 of 393
Processing #60 of 393
Processing #70 of 393
Processing #80 of 393
Processing #90 of 393
Processing #100 of 393
Processing #110 of 393
Processing #120 of 393
Processing #130 of 393
Processing #140 of 393
Processing #150 of 393
Processing #160 of 393
Processing #170 of 393
Processing #180 of 393
Processing #190 of 393
Processing #200 of 393
Processing #210 of 393
Processing #220 of 393
Processing #230 of 393
Processing #240 of 393
Processing #250 of 393
Processing #260 of 393
Processing #270 of 393
Processing #280 of 393
Processing #290 of 393
Processing #300 of 393
Processing #310 of 393
Processing #320 of 393
Processing #330 of 393
Processing #340 of 393
Processing #350 of 393
Processing #360 of 393
Processing #370 of 393
Processing #380 of 393
Processing #390 of 393


# Gather data in table

In [10]:
# One row per product id; the column order mirrors the order of the value
# lists stored in `products`.
df_products = pd.DataFrame.from_dict(products, orient='index', columns=['price', 'brand', 'category', 'product_name', 'units', 'quantity', 'unit_type', 'store_name', 'image_url', 'start_date', 'end_date'])
# Turn the id index into an ordinary column named `data_id`.
df_products.reset_index(inplace=True)
df_products.rename(columns={'index': 'data_id'}, inplace=True)
# The most frequent start date labels the output file.
date = df_products['start_date'].mode()[0]
# to_csv does not create missing parent directories, so create the target
# folder first (the image download step does the same for its folder).
os.makedirs('../data/csv', exist_ok=True)
df_products.to_csv(f'../data/csv/products_{date}.csv', index=False)

# Download resized images

In [11]:
def download_and_resize_image(image_url, data_id, date):
    """Download one image, shrink it to at most 300 px per side, and save it.

    The file is written to ``../data/imgs/<date>/<data_id>.<extension>``,
    where the extension comes from the URL's file suffix (falling back to
    the decoded image format when the URL has none).

    Failures (network error, non-200 response, undecodable or unsavable
    image) are reported with ``print`` and the function returns normally,
    so one bad image does not abort a batch of downloads.

    Args:
        image_url: URL of the image; anything after ``?`` is ignored when
            deriving the file extension.
        data_id: Identifier used as the file name.
        date: Name of the sub-folder under ``../data/imgs``.

    Returns:
        None.
    """
    target_dir = f"../data/imgs/{date}"
    os.makedirs(target_dir, exist_ok=True)

    try:
        # A timeout prevents one stalled download from hanging the batch.
        resp = requests.get(image_url, timeout=30)
    except requests.RequestException as exc:
        print("Failed to download image:", exc)
        return
    if resp.status_code != 200:
        print("Failed to download image:", resp.status_code)
        return

    # splitext only inspects the last path segment, so a URL without a file
    # suffix yields "" rather than a piece of the host name or path.
    extension = os.path.splitext(image_url.split("?")[0])[1].lstrip(".").lower()

    try:
        img = Image.open(io.BytesIO(resp.content))
        if not extension:
            # No suffix in the URL: name the file after the decoded format.
            extension = (img.format or "png").lower()

        w, h = img.size
        max_side = max(w, h)
        if max_side > 300:
            # Scale so the longer side becomes 300 px, keeping aspect ratio.
            scale = 300 / max_side
            new_size = (int(round(w * scale)), int(round(h * scale)))
            # Newer Pillow exposes filters on Image.Resampling; older
            # versions only have them on the Image module itself.
            resample = getattr(Image, "Resampling", Image).LANCZOS
            img = img.resize(new_size, resample)

        # The JPEG writer rejects modes with alpha or a palette (e.g. RGBA,
        # P), so convert those before saving under a .jpg/.jpeg name.
        jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
        if extension in ("jpg", "jpeg") and img.mode not in jpeg_modes:
            img = img.convert("RGB")

        img.save(f"{target_dir}/{data_id}.{extension}")
    except (OSError, ValueError) as exc:
        # OSError covers undecodable data and write failures; ValueError is
        # raised by save() for a file extension Pillow does not recognise.
        print(f"Failed to process image {data_id}:", exc)

# Walk the products table and store a resized copy of each product's image.
for product_image_url, product_id in zip(df_products['image_url'], df_products['data_id']):
    download_and_resize_image(product_image_url, product_id, date)