In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import clear_output
from urllib.parse import quote

In [2]:
# Define the base URL
base_url = "https://finstat.sk/databaza-financnych-udajov?SalesFrom=0&Sort=sales&page={}&Activity=zdravotn%C3%ADctvo"


In [3]:
# Function to scrape a single page
def scrape_page(page_number):
    url = base_url.format(page_number)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='table table-condensed table-striped data-table-main')
    
    # Extract header
    headers = [header.text.strip() for header in table.find('thead').find_all('th')]
    # Insert new header for href column
    if 'Názov' in headers:
        headers.insert(headers.index('Názov') + 1, 'URL')
    
    # Extract rows
    rows = []
    for row in table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        row_data = []
        for idx, cell in enumerate(cells):
            if idx == 0:  # First column
                company_link = cell.find('a', href=True)
                company_name = company_link.text.strip()
                company_href = int(company_link['href'][1:])  # Remove first char and convert to int
                badge_div = cell.find('span', class_='badge badge-xs badge-outline-red m-r-xs')
                badge_text = badge_div.text.strip() if badge_div else ''
                row_data.append(company_name + (' ' + badge_text if badge_text else ''))
                row_data.append(company_href)
            else:
                row_data.append(cell.text.strip())
        rows.append(row_data)
    
    return headers, rows

In [4]:
# Initialize list to store all rows
all_rows = []
page_limit = 626

# Iterate over each page
for page in range(1, page_limit + 1):
    headers, rows = scrape_page(page)
    all_rows.extend(rows)
    clear_output(wait=True)
    print(f"Scraped page {page}/{page_limit}")

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_rows, columns=headers)

Scraped page 626/626


In [5]:
# Filter the DataFrame for rows where 'Nazov' contains "zrusena"
df_filter = df[~df['Názov'].str.contains("zrušená", case=False, na=False)]
df_filter = df_filter[df_filter['Názov'].str.contains("s. r. o.|a.s.", case=False, na=False)]
# Filter the data for 'Rok' column (year) for 2021 and younger
df_filter = df_filter[df_filter['Rok'].astype(int) <= 2021]
# Filter out rows with "v likvidácii" or "v konkurze" in the 'Company' column
df_filter = df_filter[~df_filter['Názov'].str.contains("v likvidácii|v konkurze|n.o.", case=False, na=False)]
# Convert 'Tržby' column to numeric, removing ' €' and filtering out values bigger than 5000
df_filter['Tržby'] = df_filter['Tržby'].str.replace('\xa0', '').str.replace(' €', '').str.replace(' ', '').astype(float)
df_filter = df_filter[df_filter['Tržby'] <= 5000]
df_filter

Unnamed: 0,Názov,URL,Rok,Tržby,Zisk,Aktíva,Splatná daň,Zverejnené
19,ELEMENT KOC s. r. o.,52668282,2021,0.0,0 €,5 000 €,-,8.4.2022
151,Neurológia SNV s.r.o.,44198990,2021,0.0,0 €,0 €,-,24.3.2022
167,"INTRAMED, s. r. o.",44968841,2016,0.0,-480 €,10 153 €,480 €,15.7.2017
439,MM DENTAL s. r. o.,45982031,2017,0.0,0 €,122 180 €,-,30.11.2018
461,"TILIA spol. s r.o., Vranov nad Topľou, Hlovíko...",31715231,2020,0.0,,88 723 €,-,3.11.2020
591,ESTEA s.r.o.,36765261,2021,0.0,-1 681 €,75 780 €,-,21.6.2022
780,EU Progress s. r. o.,45274347,2021,0.0,0 €,89 698 €,-,3.4.2022
900,ALKATA s.r.o.,36429724,2018,0.0,,,-,5.7.2019
1009,CrANIUM s. r. o.,46134476,2021,0.0,-25 471 €,40 777 €,-,5.7.2022
1021,Prevencia zdravia s. r. o.,44369239,2020,0.0,0 €,6 634 €,-,3.4.2021


In [6]:
# Function to get "Predmet podnikania (činnosti)" from the website using href as ID
def get_cinnosti(company_name):
    # Prepare the search URL
    # Truncate company name to 35 characters if it exceeds that length
    truncated_name = company_name[:35]
    
    # Encode the truncated company name using iso-8859-1 encoding
    encoded_name = truncated_name.encode('iso-8859-1')
    search_name = quote(encoded_name).replace('%20', '+')
    
    search_url = f"https://www.orsr.sk/hladaj_subjekt.asp?OBMENO={search_name}"
     # Make the request to the search URL
    response = requests.get(search_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the first href link in the div with class 'bmk'
    bmk_div = soup.find('div', class_='bmk')
    if not bmk_div:
        return None
    
    first_link = bmk_div.find('a', href=True)
    if not first_link:
        return None
    
    # Construct the new URL from the href
    url = f"https://www.orsr.sk/{first_link['href']}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table row containing "Predmet podnikania (činnosti)" substring
    def contains_text(tag):
        return tag.name == 'td' and "Predmet podnikania" in tag.text
    
    target_row = soup.find(contains_text)
    if target_row:
        next_td = target_row.find_next_sibling('td')
        if next_td and "Výskum a vývoj" in next_td.text:
            return next_td.text.strip()
    return None

In [9]:
# Add a new column for "cinnosti"
print("Starting to apply get_cinnosti function...")

total_rows = df_filter.shape[0]
j=1
for i, row in df_filter.iterrows():
    company_name = row['Názov']
    print(str(j) +": "+ company_name)
    cinnosti = get_cinnosti(company_name)
    df_filter.at[i, 'cinnosti'] = cinnosti
    clear_output(wait=True)
    print(f"Processed {j}/{total_rows} rows...")
    j+=1



Processed 4/39 rows...
5: TILIA spol. s r.o., Vranov nad Topľou, Hlovíkova 187/1


UnicodeEncodeError: 'latin-1' codec can't encode character '\u013e' in position 34: ordinal not in range(256)

In [None]:
# Remove rows where "cinnosti" is None
df_none = df_filter.dropna(subset=['cinnosti'])
df_none

In [None]:
df_filter

In [None]:
df.to_csv('finstat_data.csv', index=False)

print("Scraping completed and data saved to finstat_data.csv")