In [46]:
import requests, re, os
from bs4 import BeautifulSoup
import pandas as pd

import warnings
# Silence every warning for the rest of the notebook. NOTE(review): this is a
# blanket filter; it presumably exists to hide the warnings triggered by the
# unverified HTTPS requests (verify=False) made by the scraper functions.
warnings.filterwarnings('ignore')

# Directory that receives the exported CSV files.
OUTPUT_DIR = 'output/'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [47]:
def _parse_contact_info(info):
    """Parse the free-text contact paragraph of one directory entry.

    The paragraph is read line by line. The first line containing "kontakt"
    (case-insensitive) marks the start of the address block; the three lines
    that follow it are taken, in order, as station, street address and
    "postal code + city". Lines containing "tel.:" or "e-mail:" anywhere else
    in the paragraph provide the phone number and e-mail.

    Args:
        info: Text of the contact paragraph.

    Returns:
        dict with keys 'station', 'address', 'city', 'zip_code', 'phone' and
        'email'; values that were not found are None.
    """
    parsed = dict.fromkeys(('station', 'address', 'city', 'zip_code', 'phone', 'email'))

    # 0 = header not seen yet; 1, 2, 3 = the next line is station / street /
    # postal code + city; 4 = address block fully consumed.
    stage = 0
    for line in info.split('\n'):
        # Only the FIRST "kontakt" line is the header. Later lines that merely
        # contain the word (e.g. an address such as kontakt@...) must not
        # restart the address block.
        if stage == 0 and "kontakt" in line.lower():
            stage = 1
        elif stage == 1:
            parsed['station'] = line.strip().rstrip(',')
            stage = 2
        elif stage == 2:
            parsed['address'] = line.strip().rstrip(',')
            stage = 3
        elif stage == 3:
            # The postal code may be written as "110 00" or "11000": consume
            # leading all-digit tokens until five digits have been collected,
            # everything after that is the city.
            tokens = line.split()
            zip_digits = ''
            while tokens and tokens[0].isdigit() and len(zip_digits) < 5:
                zip_digits += tokens.pop(0)
            parsed['zip_code'] = zip_digits
            parsed['city'] = " ".join(tokens)
            stage = 4
        elif "tel.:" in line:
            # Keep digits and '+' only.
            parsed['phone'] = re.sub(r'[^\d\+]', '', line)
        elif "e-mail:" in line:
            parsed['email'] = line.split(':', 1)[1].strip()

    return parsed


def scrappe_adresar_psychiatru_list_page(url, timeout=30):
    """Scrape one list page of adresar-psychiatru.nudz.cz into a DataFrame.

    Every <h2> on the page is treated as an entry name and is paired, in
    document order, with the <p> elements whose text contains "kontakt".

    Args:
        url: Full URL of the list page.
        timeout: Seconds to wait for the server before requests gives up
            (a requests timeout exception is raised in that case).

    Returns:
        pandas.DataFrame with the columns name, station, address, city,
        zip_code, phone, email, data_source; one row per entry. The frame is
        empty (but has the columns) when nothing was found.
    """
    columns = ['name', 'station', 'address', 'city', 'zip_code', 'phone', 'email', 'data_source']

    # NOTE(review): verify=False disables TLS certificate verification; kept
    # as in the original, confirm whether the site still requires it.
    page = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(page.text, "html")

    names = soup.find_all('h2')
    infos = [p for p in soup.find_all('p') if 'kontakt' in p.text.lower()]

    # Collect plain dicts and build the frame once instead of re-copying a
    # growing DataFrame on every iteration.
    records = []
    for name_tag, info_tag in zip(names, infos):
        record = {'name': name_tag.text}
        record.update(_parse_contact_info(info_tag.text))
        record['data_source'] = 'https://adresar-psychiatru.nudz.cz/'
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [48]:
# Number of list pages to fetch per directory. These were hard-coded in the
# original loops (pages 1..12 and 1..19); update them if the site grows.
PSYCHOLOG_LIST_PAGES = 12
PSYCHIATR_LIST_PAGES = 19

# Scrape every page first and concatenate once, instead of repeatedly
# concatenating onto an (initially empty) growing frame.
psycholog_pages = []
for page_number in range(1, PSYCHOLOG_LIST_PAGES + 1):
    url = f"https://adresar-psychiatru.nudz.cz/psycholog/seznam?page={page_number}&do=listChange"
    psycholog_pages.append(scrappe_adresar_psychiatru_list_page(url))
df_psychologove = pd.concat(psycholog_pages, ignore_index=True)

psychiatr_pages = []
for page_number in range(1, PSYCHIATR_LIST_PAGES + 1):
    url = f"https://adresar-psychiatru.nudz.cz/psychiatr/seznam?page={page_number}&do=listChange"
    psychiatr_pages.append(scrappe_adresar_psychiatru_list_page(url))
df_psichiatri = pd.concat(psychiatr_pages, ignore_index=True)

In [49]:
def _parse_vzp_address_cell(text):
    """Split the text of the address cell into (address, zip_code, city).

    Tabs and carriage returns are removed and blank lines are ignored. The
    first remaining line is the street address, the second one holds the
    postal code followed by the city. zip_code and city are None when there
    is no second line; address is None when the cell is empty.
    """
    lines = [seg.strip() for seg in text.replace('\t', '').replace('\r', '').split('\n')]
    lines = [seg for seg in lines if seg]

    address = lines[0] if lines else None
    if len(lines) < 2:
        return address, None, None

    # The postal code may be written as "110 00" or "11000": consume leading
    # all-digit tokens until five digits have been collected.
    tokens = lines[1].split()
    zip_digits = ''
    while tokens and tokens[0].isdigit() and len(zip_digits) < 5:
        zip_digits += tokens.pop(0)
    return address, zip_digits, " ".join(tokens)


def scrappe_dusevnizdravi_vzp_db(base_url, list_page_url, timeout=30):
    """Scrape one list page of the VZP therapist directory, following each row
    to its detail page.

    For every <tr> of the 'ContractsTable-body' table the first link is
    fetched. On the detail page the 'Heading Heading--h2' heading gives the
    name, and the <td> cells of 'ContractsDetailTable-body' are read by
    position: 0 = station, 1 = address block, 4 = phone, 5 = e-mail.

    Args:
        base_url: Site root, prepended to every relative URL and stored in
            the data_source column.
        list_page_url: Path (with query string) of the list page.
        timeout: Seconds to wait for the server on each request before
            requests gives up.

    Returns:
        pandas.DataFrame with the columns name, station, address, city,
        zip_code, phone, email, data_source; one row per table row.

    Raises:
        IndexError: if the list table, a row link, the detail heading or the
            detail table is missing from a fetched page.
    """
    columns = ['name', 'station', 'address', 'city', 'zip_code', 'phone', 'email', 'data_source']

    # NOTE(review): verify=False disables TLS certificate verification; kept
    # as in the original, confirm whether the site still requires it.
    page = requests.get(base_url + list_page_url, verify=False, timeout=timeout)
    soup = BeautifulSoup(page.text, "html")

    table = soup.find_all('tbody', class_='ContractsTable-body')[0]

    records = []
    for table_row in table.find_all('tr'):
        t_url = table_row.find_all('a')[0]['href']
        t_page = requests.get(base_url + t_url, verify=False, timeout=timeout)
        t_soup = BeautifulSoup(t_page.text, "html")

        # Start from a fresh all-None record for every row so that a detail
        # page with missing cells cannot inherit values from the previous one.
        record = dict.fromkeys(columns)
        record['data_source'] = base_url
        record['name'] = t_soup.find_all('h2', class_="Heading Heading--h2")[0].text

        t_table = t_soup.find_all('tbody', class_="ContractsDetailTable-body")[0]
        cells = t_table.find_all('td')

        if len(cells) > 0:
            record['station'] = cells[0].text.strip()
        if len(cells) > 1:
            address, zip_code, city = _parse_vzp_address_cell(cells[1].text)
            record['address'] = address
            record['zip_code'] = zip_code
            record['city'] = city
        if len(cells) > 4:
            record['phone'] = cells[4].text.strip()
        if len(cells) > 5:
            record['email'] = cells[5].text.strip()

        records.append(record)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=columns)


In [50]:
base_url = "https://dusevnizdravi.vzp.cz"

# Number of list pages to fetch. This was hard-coded in the original loop
# (pages 1..74); update it if the directory grows.
VZP_LIST_PAGES = 74

# Scrape every page first and concatenate once, instead of repeatedly
# concatenating onto an (initially empty) growing frame.
vzp_pages = []
for page_number in range(1, VZP_LIST_PAGES + 1):
    list_page_url = f"/seznam-terapeutu/?queryKraj=&queryZamereni=&queryForma=&queryKapacita=&queryText=&stranka={page_number}"
    vzp_pages.append(scrappe_dusevnizdravi_vzp_db(base_url, list_page_url))
df_vzp = pd.concat(vzp_pages, ignore_index=True)

In [51]:
def scrappe_hledampsychologa_db(base_url, list_page_url, timeout=30):
    """Scrape one search-result page of hledampsychologa.cz, following each
    result to its profile page.

    For every <div class="vysledek"> the first link is fetched. On the profile
    page the 'profil__name' heading gives the name; each <div class="item">
    is classified by the file name of its first <img>: 'tel.svg' marks the
    phone number and 'home.svg' the address, both read from the item's first
    <p>. An address "street, city" is split on the comma; a value without a
    comma is treated as the city alone. The placeholder street
    '(ulice) (popisné)' is stored as None.

    station, zip_code and email are not available from this source and are
    always None.

    Args:
        base_url: Site root, prepended to every relative URL and stored in
            the data_source column.
        list_page_url: Path (with query string) of the search-result page.
        timeout: Seconds to wait for the server on each request before
            requests gives up.

    Returns:
        pandas.DataFrame with the columns name, station, address, city,
        zip_code, phone, email, data_source; one row per search result.

    Raises:
        IndexError: if a result has no link, a profile has no name heading,
            or a phone/address item has no <p>.
    """
    columns = ['name', 'station', 'address', 'city', 'zip_code', 'phone', 'email', 'data_source']

    # NOTE(review): verify=False disables TLS certificate verification; kept
    # as in the original, confirm whether the site still requires it.
    page = requests.get(base_url + list_page_url, verify=False, timeout=timeout)
    soup = BeautifulSoup(page.text, "html")

    records = []
    for result in soup.find_all('div', class_='vysledek'):
        t_url = result.find_all('a')[0]['href']
        t_page = requests.get(base_url + t_url, verify=False, timeout=timeout)
        t_soup = BeautifulSoup(t_page.text, "html")

        # Start from a fresh all-None record for every profile so that a
        # profile without phone/address cannot inherit the previous one's.
        record = dict.fromkeys(columns)
        record['data_source'] = base_url
        record['name'] = t_soup.find_all('h2', class_="profil__name")[0].text

        for item in t_soup.find_all('div', class_="item"):
            item_imgs = item.find_all('img')
            if not item_imgs:
                continue

            icon_src = item_imgs[0]['src']
            if 'tel.svg' in icon_src:
                record['phone'] = item.find_all('p')[0].text.strip()
            if 'home.svg' in icon_src:
                address_parts = item.find_all('p')[0].text.strip().split(',')
                if len(address_parts) > 1:
                    street = address_parts[0].strip()
                    # The site shows this placeholder when no street is filled in.
                    record['address'] = None if '(ulice) (popisné)' in street else street
                    record['city'] = address_parts[1].strip()
                else:
                    record['city'] = address_parts[0].strip()

        records.append(record)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=columns)


In [52]:
# The site root does not depend on the loop variable, so define it once.
base_url = "https://hledampsychologa.cz"

# Scrape every search page first and concatenate once, instead of repeatedly
# concatenating onto an (initially empty) growing frame.
hledampsychologa_pages = []
for fieldId in range(1, 18):

    # NOTE(review): the literal "1" after {fieldId} turns the ids 1..17 into
    # 11, 21, ..., 171 - confirm this is intended and not a typo.
    list_page_url = f"/search-result/?fieldId={fieldId}1&regionId=0"

    hledampsychologa_pages.append(scrappe_hledampsychologa_db(base_url, list_page_url))

# NOTE(review): presumably a profile listed under several fields is scraped
# once per field; check the result for duplicate rows.
df_hledampsychologa = pd.concat(hledampsychologa_pages, ignore_index=True)

In [53]:
# Merge the per-source frames: psychologists come from all three sites,
# psychiatrists only from adresar-psychiatru.
df_all_psychologove = pd.concat(
    [df_psychologove, df_vzp, df_hledampsychologa],
    ignore_index=True,
)
df_all_psichiatri = df_psichiatri.reset_index(drop=True)

# Export both frames as semicolon-separated CSV files without the index column.
for frame, file_name in (
    (df_all_psychologove, 'psychologove.csv'),
    (df_all_psichiatri, 'psichiatri.csv'),
):
    frame.to_csv(OUTPUT_DIR + file_name, index=False, sep=';')