# Imports

In [11]:
import requests
import bs4
import json
import time
from PIL import Image
import io
import pandas as pd
import os

from dotenv import load_dotenv
load_dotenv()

True

# Extract web content

In [2]:
# Listing page of the Netto partner on tilbudsugen.dk.
# NOTE(review): `page=100` presumably makes the listing include every offer in a
# single response — confirm against the site.
url = 'https://www.tilbudsugen.dk/partner/netto-114?page=100'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
time.sleep(2)
if response.status_code != 200:
    # Raise explicitly instead of `assert False`: assertions are removed when
    # Python runs with -O, which would let a failed request slip through.
    raise RuntimeError(f"Request failed with status code {response.status_code}")
print("Request was successful.")

# Parsed HTML of the listing page; later cells read the product links from it.
soup = bs4.BeautifulSoup(response.text, 'html.parser')

Request was successful.


# Get product ids

In [3]:
# Collect the numeric id of every product linked from the listing page.
# Product links have the form https://www.tilbudsugen.dk/single/<id>.
links = soup.find_all('a', href=True)
product_links = [link['href'] for link in links if link['href'].startswith('https://www.tilbudsugen.dk/single/')]

# The same product can be linked from more than one anchor. dict.fromkeys drops
# the repeats while keeping first-seen order, so each product is fetched once.
all_ids = list(dict.fromkeys(int(link.split('/')[-1]) for link in product_links))

# Get product info

In [4]:
# Fetch the detail JSON of every product id and keep the fields used downstream.
# `products` maps data_id -> [price, brand, category, product_name, units,
# quantity, unit_type, store_name, image_url, start_date, end_date].
#
# NOTE(review): the path segment after `_next/data/` looks like a Next.js build
# id, which would change whenever the site is redeployed — confirm, and refresh
# it if these requests start failing.
products = {}
N = len(all_ids)
for i, data_id in enumerate(all_ids):
    if i % 10 == 0:
        print(f"Processing #{i} of {N}")
    data_url = f"https://www.tilbudsugen.dk/_next/data/0LbXUdvz48Lb0tgkd4pVT/dk/single/{data_id}.json?id={data_id}"
    # A timeout keeps one stalled request from blocking the whole crawl.
    response = requests.get(data_url, timeout=30)
    time.sleep(1)  # throttle: at most one request per second
    if response.status_code != 200:
        # Explicit raise instead of `assert False`, which is stripped under -O.
        raise RuntimeError(f"Request failed with status code {response.status_code}")
    page_props = response.json()['pageProps']
    offer = page_props['offer']

    price        = offer['price']
    brand        = offer['brand']['name']
    category     = offer['productVariant']['category']['name']
    product_name = offer['productName']['productName']
    units        = offer['units']
    # SECURITY: this value comes from a remote server, so it must never be passed
    # to eval(). Parse it as a number and truncate to int, as int(eval(...)) did
    # for numeric strings. Anything that is not a plain number now raises
    # ValueError instead of being executed.
    quantity     = int(float(offer['quantity']))
    unit_type    = offer['unitType']
    store_name   = offer['chain']['name']
    image_url    = offer['imageUrl']
    start_date   = offer['startDate']
    end_date     = offer['endDate']

    products[data_id] = [price, brand, category, product_name, units, quantity, unit_type, store_name, image_url, start_date, end_date]

Processing #0 of 477
Processing #10 of 477
Processing #20 of 477
Processing #30 of 477
Processing #40 of 477
Processing #50 of 477
Processing #60 of 477
Processing #70 of 477
Processing #80 of 477
Processing #90 of 477
Processing #100 of 477
Processing #110 of 477
Processing #120 of 477
Processing #130 of 477
Processing #140 of 477
Processing #150 of 477
Processing #160 of 477
Processing #170 of 477
Processing #180 of 477
Processing #190 of 477
Processing #200 of 477
Processing #210 of 477
Processing #220 of 477
Processing #230 of 477
Processing #240 of 477
Processing #250 of 477
Processing #260 of 477
Processing #270 of 477
Processing #280 of 477
Processing #290 of 477
Processing #300 of 477
Processing #310 of 477
Processing #320 of 477
Processing #330 of 477
Processing #340 of 477
Processing #350 of 477
Processing #360 of 477
Processing #370 of 477
Processing #380 of 477
Processing #390 of 477
Processing #400 of 477
Processing #410 of 477
Processing #420 of 477
Processing #430 of 477

# Gather data in table

In [5]:
# Turn the scraped `products` dict (data_id -> list of fields) into a table with
# one row per product and `data_id` as the first column.
df_products = (
    pd.DataFrame.from_dict(
        products,
        orient='index',
        columns=['price', 'brand', 'category', 'product_name', 'units', 'quantity', 'unit_type', 'store_name', 'image_url', 'start_date', 'end_date'],
    )
    .rename_axis('data_id')
    .reset_index()
)

# Without rows there is no start date to name the output after; fail with a
# clear message rather than an opaque indexing error from `.mode()[0]`.
if df_products.empty:
    raise ValueError("No products were scraped; nothing to save.")

# The most common start date labels this batch (CSV name and image folder).
date = df_products['start_date'].mode()[0]

# to_csv does not create missing parent folders, so make sure it exists first.
os.makedirs('../data/csv', exist_ok=True)
df_products.to_csv(f'../data/csv/products_{date}.csv', index=False)

# Download resized images

In [6]:
def download_and_resize_image(image_url, data_id, date, max_side=300, timeout=30):
    """Download one product image, shrink it if needed, and save it to disk.

    The image is written to ``../data/imgs/{date}/{data_id}.{extension}``.
    Images whose longest side exceeds ``max_side`` pixels are scaled down with
    LANCZOS resampling, keeping the aspect ratio; smaller images are saved at
    their original size.

    Args:
        image_url: URL of the image to fetch.
        data_id: Identifier used as the output file name.
        date: Name of the sub-folder under ``../data/imgs``.
        max_side: Maximum length in pixels of the longest side (default 300).
        timeout: Seconds to wait for the HTTP response (default 30).

    Returns:
        None. A failed download or an undecodable payload is reported with
        ``print`` and the image is skipped.
    """
    out_dir = f"../data/imgs/{date}"
    os.makedirs(out_dir, exist_ok=True)

    try:
        resp = requests.get(image_url, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the best-effort behaviour: one unreachable image must not abort
        # the whole batch.
        print("Failed to download image:", exc)
        return
    if resp.status_code != 200:
        print("Failed to download image:", resp.status_code)
        return

    # Read the extension from the last path segment only. Splitting the whole
    # URL on "." would return part of the host name when the file has no
    # extension.
    last_segment = image_url.split("?")[0].rsplit("/", 1)[-1]
    extension = os.path.splitext(last_segment)[1].lstrip(".").lower()

    try:
        source = Image.open(io.BytesIO(resp.content))
    except OSError as exc:
        # Pillow signals an unrecognised payload with an OSError subclass.
        print("Failed to decode image:", exc)
        return

    with source:
        if not extension:
            # No extension in the URL: fall back to the format Pillow detected.
            extension = (source.format or "png").lower()

        img = source
        w, h = img.size
        longest = max(w, h)
        if longest > max_side:
            scale = max_side / longest
            # Clamp to 1 px so a very elongated image never rounds to size 0.
            new_size = (max(1, int(round(w * scale))), max(1, int(round(h * scale))))
            try:
                resample = Image.Resampling.LANCZOS
            except AttributeError:
                # Older Pillow versions expose the filter on the module itself.
                resample = Image.LANCZOS
            img = img.resize(new_size, resample)

        # JPEG cannot store alpha or palette images; convert so save() succeeds.
        if extension in ("jpg", "jpeg") and img.mode in ("RGBA", "LA", "P", "PA"):
            img = img.convert("RGB")

        img.save(f"{out_dir}/{data_id}.{extension}")

# Fetch and store a resized image for every product row in the table.
for product in df_products.itertuples(index=False):
    download_and_resize_image(product.image_url, product.data_id, date)

# Cloudinary image hosting service

In [7]:
!pip install cloudinary



In [8]:
import cloudinary
import cloudinary.uploader
from cloudinary.utils import cloudinary_url

In [12]:
# Configuration
# The API secret is read from the CLOUDINARY_KEY environment variable (the
# imports cell calls load_dotenv(), so it may come from a .env file).
API_SECRET = os.getenv("CLOUDINARY_KEY")
if not API_SECRET:
    # Fail here with a clear message instead of configuring api_secret=None and
    # only finding out when the first upload is rejected.
    raise RuntimeError("CLOUDINARY_KEY is not set; define it in the environment or in .env.")

# NOTE(review): cloud_name and api_key are hard-coded below; consider loading
# them from the environment as well so account details stay out of the notebook.
cloudinary.config(
    cloud_name = "dfqzmnlga",
    api_key    = "733583949868714",
    api_secret = API_SECRET,
    secure=True
)

<cloudinary.Config at 0x729480cb0940>

In [16]:
# Smoke-test the configuration: upload Cloudinary's public demo picture under
# the public id "shoes" and show the HTTPS URL of the stored asset.
upload_result = cloudinary.uploader.upload(
    "https://res.cloudinary.com/demo/image/upload/getting-started/shoes.jpg",
    public_id="shoes",
)
secure_url = upload_result["secure_url"]
print(secure_url)

https://res.cloudinary.com/dfqzmnlga/image/upload/v1761556503/shoes.jpg


In [None]:
# Upload one locally resized product image into the Cloudinary folder named
# after `date`. The local path is built from `date` too, so the file that is
# read and the folder it is uploaded to always refer to the same batch.
sample_id = 10738668
upload_result = cloudinary.uploader.upload(
    f"../data/imgs/{date}/{sample_id}.jpg",
    public_id=str(sample_id),
    asset_folder=date,
)
print(upload_result["secure_url"])

https://res.cloudinary.com/dfqzmnlga/image/upload/v1761557466/2025-10-25/10738668.jpg


In [27]:
# Report the on-disk size of one resized image, expressed in KB.
import os
file_path = "../data/imgs/2025-10-25/10738667.jpg"
file_size = os.stat(file_path).st_size  # size in bytes
print(f"File size: {file_size / 1024} KB")

File size: 13.9326171875 KB


In [28]:
# Add up the size of every file below the image folder (sub-folders included)
# and report the total in MB.
import os
folder_path = "../data/imgs/2025-10-25/"
total_size = sum(
    os.path.getsize(os.path.join(root, name))
    for root, _subdirs, names in os.walk(folder_path)
    for name in names
)
print(f"Total folder size: {total_size / (1024 * 1024)} MB")

Total folder size: 7.014111518859863 MB
