# SUKL

URL: [https://www.sukl.sk/hlavna-stranka/slovenska-verzia/databazy-a-servis/vyhladavanie-liekov-zdravotnickych-pomocok-a-zmien-v-liekovej-databaze/vyhladavanie-v-databaze-registrovanych-liekov](https://www.sukl.sk/hlavna-stranka/slovenska-verzia/databazy-a-servis/vyhladavanie-liekov-zdravotnickych-pomocok-a-zmien-v-liekovej-databaze/vyhladavanie-v-databaze-registrovanych-liekov)

In [31]:
import logging

# Log at INFO level and above, prefixing each message with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

import base64
import glob
import hashlib
import io
import json
import os
import pandas as pd
import re
import requests
import tarfile
import zipfile

from bs4 import BeautifulSoup
from IPython.display import IFrame, HTML, JSON
from tqdm.notebook import trange, tqdm
from urllib.parse import parse_qs, urlparse, unquote, urljoin

In [2]:
def get_url(page=None):
    """Build the SUKL registered-drug search URL, optionally for one results page.

    Args:
        page: Results page number to request. Falsy values (``None`` or ``0``)
            produce the URL without a ``page`` parameter.

    Returns:
        The search URL as a string, with ``&page=<page>`` appended when
        ``page`` is truthy.
    """
    search_url = 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/databazy-a-servis/vyhladavanie-liekov-zdravotnickych-pomocok-a-zmien-v-liekovej-databaze/vyhladavanie-v-databaze-registrovanych-liekov?page_id=242&lie_nazov=&atc_nazov=&lie_kod=&atc_kod=&lie_rc=&drz_kod='
    # Must be an f-string: a plain literal appends the text "{page}" verbatim
    # instead of the page number.
    return search_url + (f'&page={page}' if page else '')

In [3]:
# Fetch the first (unpaginated) results page; the pagination links on it are
# used further down to find the last page number.
page = 1
url = get_url()
logging.info(url)
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
resp = requests.get(url, timeout=30)
# Fail here instead of parsing an HTTP error page as if it were search results.
resp.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(resp.text, 'html.parser')

2025-07-01 06:37:13,829 - INFO - https://www.sukl.sk/hlavna-stranka/slovenska-verzia/databazy-a-servis/vyhladavanie-liekov-zdravotnickych-pomocok-a-zmien-v-liekovej-databaze/vyhladavanie-v-databaze-registrovanych-liekov?page_id=242&lie_nazov=&atc_nazov=&lie_kod=&atc_kod=&lie_rc=&drz_kod=


In [4]:
base_url = 'https://www.sukl.sk'
# Numbered pagination links; the last one is treated as the final results page.
pages = soup.select("li.page-item.page-item-number > a")
if not pages:
    # Without this guard the failure surfaces as an opaque IndexError on pages[-1].
    raise RuntimeError(
        'No pagination links found; the page layout may have changed '
        'or the request did not return the search results.'
    )
# Resolve the href against the site root in case it is relative.
last_page_url = urljoin(base_url, pages[-1]['href'])
last_page_url

'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/databazy-a-servis/vyhladavanie-liekov-zdravotnickych-pomocok-a-zmien-v-liekovej-databaze/vyhladavanie-v-databaze-registrovanych-liekov?page_id=242&lie_nazov=&atc_nazov=&lie_kod=&atc_kod=&lie_rc=&drz_kod=&page=3580'

In [23]:
# Extract the `page` query parameter from the last-page link as an integer.
last_page_query = parse_qs(urlparse(last_page_url).query)
last_page = int(last_page_query['page'][0])
last_page

3580

In [30]:
# Download every search-results page into ./sukl for offline parsing.
dir_name = os.path.join('.', 'sukl')
os.makedirs(dir_name, exist_ok=True)

# The last pagination link carries page=<last_page>, so pages run 1..last_page.
# range(0, last_page) requested a page-less URL for 0 and never fetched the
# final page.
for page in tqdm(range(1, last_page + 1)):
    url = get_url(page=page)
    try:
        # Timeout so one stalled connection cannot block the whole crawl.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Skip the page instead of saving an error response as if it were results.
        logging.warning('Skipping page %s (%s): %s', page, url, exc)
        continue
    filename = os.path.join(
        dir_name,
        f'sukl_page{page:06d}.html'
    )
    with open(filename, mode='w') as f:
        f.write(resp.text)

  0%|          | 0/3580 [00:00<?, ?it/s]

In [40]:
# Collect the drug-detail links from every saved search-results page.
# Set the directory explicitly so this cell does not depend on which cell
# last assigned dir_name.
dir_name = os.path.join('.', 'sukl')
# glob returns files in arbitrary order; sort so drug_pages is reproducible.
html_files = sorted(glob.glob('*.html', root_dir=dir_name))
pbar = tqdm(html_files)
drug_pages = []
for html_file in pbar:
    with open(os.path.join(dir_name, html_file), mode='r') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Each matching anchor's href is the URL of one drug's detail page.
    drug_pages.extend(
        link['href'] for link in soup.select('span.result-cell__value > a')
    )
    pbar.set_description(f'{html_file} ({len(drug_pages)})')

  0%|          | 0/3580 [00:00<?, ?it/s]

In [41]:
# Preview the first 25 collected drug-detail URLs.
drug_pages[:25]

['https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=66064',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=66065',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=3837E',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=1404D',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=9794B',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=8009D',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=8008D',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=8986E',
 'https://www.sukl.sk/hlavna-stranka/slovenska-verzia/pomocne-stranky/detail-lieku?page_id=386&lie_id=6746E',
 'https://

In [None]:
# Download each drug's detail page into ./sukl_drugs, one file per drug ID.
dir_name = os.path.join('.', 'sukl_drugs')
os.makedirs(dir_name, exist_ok=True)

# Drop repeated URLs while keeping their order: the output filename depends
# only on the drug ID, so a repeated URL would just overwrite the same file.
pbar = tqdm(list(dict.fromkeys(drug_pages)))
for url in pbar:
    pbar.set_description(f'{url}')
    # Parse only the query string. parse_qs on the whole URL folds the
    # scheme/host/path into the first key, and only worked because lie_id
    # happened not to be the first parameter.
    drug_id = parse_qs(urlparse(url).query)['lie_id'][0]
    try:
        # Timeout so one stalled connection cannot block the whole crawl.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Skip this drug instead of saving an error response as its detail page.
        logging.warning('Skipping drug %s (%s): %s', drug_id, url, exc)
        continue
    html = resp.text
    filename = os.path.join(
        dir_name,
        f'sukl_drug_{drug_id}.html'
    )
    with open(filename, mode='w') as f:
        f.write(html)

  0%|          | 0/53700 [00:00<?, ?it/s]

In [None]:
# Prototype: parse the label/value cells of one saved drug page and display them.
dir_name = os.path.join('.', 'sukl_drugs')
html_files = glob.glob('*.html', root_dir=dir_name)
pbar = tqdm(html_files)
drugs = pd.DataFrame()
for html_file in pbar:
    with open(os.path.join(dir_name, html_file), mode='r') as f:
        pbar.set_description(f'{html_file}')
        # Read the file that was just opened. Without this the parser works on
        # whatever `html` an earlier cell left behind, not on this drug's page.
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    drug_metadata = {}
    for td_label in soup.select('td.drug-detail__detail-label'):
        label = td_label.text.strip().strip(':')
        td_value = td_label.find_next_sibling('td')
        if td_value is None:
            # Label cell with no following value cell: nothing to record.
            continue
        drug_metadata[label] = td_value.text.strip()
    display(JSON(drug_metadata))
    # Only the first file is processed; the loop stops after one iteration.
    break