In [1]:
from src.utils.coordinates_util import get_coordinates_for_city, haversine_distance
from src.scrapers.scrapers import BasicScraper
from src.types.types import Coordinates, URL, CSSSelector
from typing import List


import folium
import pandas as pd

In [2]:
def assert_gps_coordinates_equal(
    coord1: Coordinates, coord2: Coordinates, tolerance: float = 1e-2
):
    """Assert that two coordinate pairs agree on both axes within ``tolerance``.

    Args:
        coord1: First point; read through its ``latitude`` and ``longitude``
            attributes.
        coord2: Second point, read the same way.
        tolerance: Exclusive upper bound on the absolute per-axis difference
            (presumably in degrees — TODO confirm against ``Coordinates``).

    Raises:
        AssertionError: If the latitudes differ by ``tolerance`` or more, or,
            checked second, if the longitudes do.
    """
    latitude_gap = abs(coord1.latitude - coord2.latitude)
    assert (
        latitude_gap < tolerance
    ), f"Latitude difference exceeds tolerance: {coord1.latitude} vs {coord2.latitude}"

    longitude_gap = abs(coord1.longitude - coord2.longitude)
    assert (
        longitude_gap < tolerance
    ), f"Longitude difference exceeds tolerance: {coord1.longitude} vs {coord2.longitude}"


# Smoke-test the geocoding helper before relying on it: look up "Trenčín" and
# compare the result with a hard-coded reference point, using the assertion
# helper's default tolerance.
expected_coordinates: Coordinates = Coordinates(48.89452, 18.04436)
actual_coordinates: Coordinates = get_coordinates_for_city("Trenčín")

assert_gps_coordinates_equal(actual_coordinates, expected_coordinates)

In [3]:
# Load the population-per-municipality table.
df: pd.DataFrame = pd.read_csv("data/obyvatelia_na_obec.csv")

# Fail early if the file does not hold the expected number of rows.
assert len(df.index) == 2927, "Number of municipalities doesn't match"

df.head(7)

Unnamed: 0,Kód,Územná jednotka,Spolu,muži (abs.),muži (%),ženy (abs.),ženy (%)
0,SK0101528595,Bratislava - mestská časť Staré Mesto,46080,22210,48.2,23870,51.8
1,SK0102529311,Bratislava - mestská časť Podunajské Biskupice,23464,11193,47.7,12271,52.3
2,SK0102529320,Bratislava - mestská časť Ružinov,81004,37575,46.39,43429,53.61
3,SK0102529338,Bratislava - mestská časť Vrakuňa,20711,9945,48.02,10766,51.98
4,SK0103529346,Bratislava - mestská časť Nové Mesto,44458,20981,47.19,23477,52.81
5,SK0103529354,Bratislava - mestská časť Rača,25733,12717,49.42,13016,50.58
6,SK0103529362,Bratislava - mestská časť Vajnory,6079,3056,50.27,3023,49.73


In [4]:
# Keep only the municipality name and the total population, under English names.
# The rename runs first and the selection uses the *new* names, so the cell can
# be re-run safely: `rename` ignores labels that are no longer present, whereas
# selecting the original Slovak labels a second time would raise a KeyError.
selected_columns = ["Územná jednotka", "Spolu"]
renamed_columns = {"Územná jednotka": "City", "Spolu": "Population"}
df = df.rename(columns=renamed_columns)[list(renamed_columns.values())]

In [5]:
# Preview the trimmed, renamed table.
df.head(7)

Unnamed: 0,City,Population
0,Bratislava - mestská časť Staré Mesto,46080
1,Bratislava - mestská časť Podunajské Biskupice,23464
2,Bratislava - mestská časť Ružinov,81004
3,Bratislava - mestská časť Vrakuňa,20711
4,Bratislava - mestská časť Nové Mesto,44458
5,Bratislava - mestská časť Rača,25733
6,Bratislava - mestská časť Vajnory,6079


In [6]:
# No cell of the table may be null.
assert not df.isna().any().any(), "data contains null values"

In [7]:
# Bratislava appears as one row per city district in the data (see the preview
# above); Košice is presumably split the same way. Sum every row whose name
# starts with the city name to obtain one figure per city.
is_bratislava = df["City"].str.startswith("Bratislava")
is_kosice = df["City"].str.startswith("Košice")

bratislava_population = df.loc[is_bratislava, "Population"].sum()
kosice_population = df.loc[is_kosice, "Population"].sum()

df_grouped = pd.DataFrame(
    {
        "City": ["Bratislava", "Košice"],
        "Population": [bratislava_population, kosice_population],
    }
)

df_grouped

Unnamed: 0,City,Population
0,Bratislava,475503
1,Košice,229040


In [8]:
# Remove every row belonging to Bratislava or Košice, then append the two
# city-level totals in their place.
is_split_city_row = df["City"].str.startswith("Bratislava") | df[
    "City"
].str.startswith("Košice")
df_dummy = df[~is_split_city_row]

df_cities = pd.concat([df_dummy, df_grouped], ignore_index=True)

# Largest cities first.
df_cities.sort_values("Population", ascending=False).head()

Unnamed: 0,City,Population
2888,Bratislava,475503
2889,Košice,229040
2060,Prešov,84824
1215,Žilina,82656
729,Nitra,78489


In [9]:
martinus_website: URL = "https://www.martinus.sk/knihkupectva"

martinus_cities_path: CSSSelector = "div.card__content.mt-none h3"
martinus_streets_path: CSSSelector = "div.card__content.mt-none p:first-of-type"

# Bound to `martinus_scraper`, not `martinus_map`: a later cell assigns the
# folium map to `martinus_map`, and sharing one name for a scraper and a map
# means re-running this cell would silently replace the map with a scraper.
martinus_scraper = BasicScraper(martinus_website)

martinus_cities: List[str] = martinus_scraper.scrape(martinus_cities_path)

# City headings: keep the text before the first "-" when there is one,
# otherwise the text before the first "(".
martinus_cities_clean: List[str] = [
    city.split("-")[0].strip() if "-" in city else city.split("(")[0].strip()
    for city in martinus_cities
]

martinus_streets: List[str] = martinus_scraper.scrape(martinus_streets_path)

# Address paragraphs: when a comma is present keep the second comma-separated
# field, otherwise the whole (stripped) text.
martinus_streets_clean: List[str] = [
    street.split(",")[1].strip() if "," in street else street.strip()
    for street in martinus_streets
]

# Both selectors must yield one entry per shop.
assert len(martinus_cities_clean) == len(
    martinus_streets_clean
), "Number of streets does not match number of cities"

# Hard-coded snapshot of the shop count; this fails once the website changes.
assert (
    len(martinus_streets_clean) == 31
), "Scraped number of shops does not match the real number as of 7.1.2024"

In [10]:
# One (city, street) tuple per shop, city first.
martinus_adresses = list(zip(martinus_cities_clean, martinus_streets_clean))

for adress in martinus_adresses:
    print(adress)

('Bratislava', 'Obchodná 26')
('Bratislava', 'Staré Grunty 24')
('Bratislava', 'Autobusová stanica Mlynské Nivy')
('Banská Bystrica', 'Ul. 29 augusta 37')
('Lučenec', 'Námestie republiky 5994/32')
('Zvolen', 'OC Klokan - Obchodná 10325/21')
('Banská Štiavnica', 'Radničné námestie 19')
('Revúca', 'Muránska ulica 1337/16')
('Rimavská Sobota', 'Francisciho 2')
('Košice', 'Toryská 5')
('Košice', 'Hlavná 111')
('Trebišov', 'M. R. Štefánika 2329')
('Levice', 'Ľ. Štúra 1A')
('Nitra', 'Akademická 1/A')
('Nitra', 'Napervillská 4837/5')
('Poprad', 'Námestie svätého Egídia 3290/124')
('Prešov', 'Námestie legionárov 1')
('Humenné', 'Nám. slobody 62')
('Kežmarok', 'Hviezdoslavova 10')
('Sabinov', 'Námestie slobody 32')
('Svidník', 'Sovietskych hrdinov 165/62')
('Považská Bystrica', 'Centrum 8')
('Ilava', 'Moyzesova 1871/123')
('Myjava', 'M.R.Štefánika 929')
('Nové Mesto nad Váhom', 'Weisseho 15/2615')
('Trnava', 'Dolné Bašty 8833/14')
('Liptovský Mikuláš', 'Kamenné pole 4449/3')
('Martin', 'Námesti

In [11]:
# Turn the (city, street) tuples into a two-column frame.
df_cities_w_Martinus = pd.DataFrame(martinus_adresses).set_axis(
    ["City", "Street"], axis=1
)
df_cities_w_Martinus.head()

Unnamed: 0,City,Street
0,Bratislava,Obchodná 26
1,Bratislava,Staré Grunty 24
2,Bratislava,Autobusová stanica Mlynské Nivy
3,Banská Bystrica,Ul. 29 augusta 37
4,Lučenec,Námestie republiky 5994/32


In [12]:
def get_coordinates_for_dataframe(df):
    """Geocode every row of ``df`` and count the rows left without a result.

    The lookup text for a row is its ``Street`` value, one space, and its
    ``City`` value, passed to ``get_coordinates_for_city``.

    Note:
        ``df`` is modified in place: a ``Coordinates`` column is added or
        overwritten. The same object is also returned.

    Args:
        df: Frame with ``Street`` and ``City`` columns.

    Returns:
        A ``(df, missing_coordinates)`` pair, where ``missing_coordinates`` is
        the number of rows whose ``Coordinates`` value pandas treats as null.
    """
    df["Coordinates"] = df.apply(
        lambda row: get_coordinates_for_city(f"{row['Street']} {row['City']}"),
        axis=1,
    )

    unresolved_rows = df["Coordinates"].isna()
    missing_coordinates = unresolved_rows.sum()

    return df, missing_coordinates

In [13]:
# First geocoding pass over the shop addresses; the helper also reports how
# many rows came back without coordinates.
df_cities_w_Martinus, missing_coordinates = get_coordinates_for_dataframe(
    df_cities_w_Martinus
)

print(f"Number of missing coordinates: {missing_coordinates}")

df_cities_w_Martinus.head()

Coordinates not found for OC Klokan - Obchodná 10325/21 Zvolen
Coordinates not found for Stred 157- Srdce Turzovky Turzovka
Number of missing coordinates: 2


Unnamed: 0,City,Street,Coordinates
0,Bratislava,Obchodná 26,"(48.14718, 17.11001)"
1,Bratislava,Staré Grunty 24,"(48.1581155, 17.06974372915665)"
2,Bratislava,Autobusová stanica Mlynské Nivy,"(48.1468327, 17.128482)"
3,Banská Bystrica,Ul. 29 augusta 37,"(48.7363849, 19.1632147)"
4,Lučenec,Námestie republiky 5994/32,"(48.3307838, 19.6611366)"


In [14]:
# Two addresses were reported as not found in the previous pass. Remove the
# venue-name fragments ("OC Klokan - " prefix, "- Srdce Turzovky" suffix) so
# that only the street part is left for the retry.
df_cities_w_Martinus["Street"] = (
    df_cities_w_Martinus["Street"]
    .str.replace(r"\bOC\sKlokan\s-\s", "", regex=True)
    .str.replace(r"-\sSrdce Turzovky", "", regex=True)
)

In [15]:
# Second geocoding pass, run after the street clean-up; every row is looked up
# again and the missing count is recomputed.
df_cities_w_Martinus, missing_coordinates = get_coordinates_for_dataframe(
    df_cities_w_Martinus
)

print(f"Number of missing coordinates: {missing_coordinates}")

df_cities_w_Martinus.head()

Number of missing coordinates: 0


Unnamed: 0,City,Street,Coordinates
0,Bratislava,Obchodná 26,"(48.14718, 17.11001)"
1,Bratislava,Staré Grunty 24,"(48.1581155, 17.06974372915665)"
2,Bratislava,Autobusová stanica Mlynské Nivy,"(48.1468327, 17.128482)"
3,Banská Bystrica,Ul. 29 augusta 37,"(48.7363849, 19.1632147)"
4,Lučenec,Námestie republiky 5994/32,"(48.3307838, 19.6611366)"


In [16]:
def add_marker_to_map(coordinates_list, color, map_object):
    """Add one coloured folium marker to ``map_object`` per coordinate pair.

    Args:
        coordinates_list: Iterable of indexable pairs; item ``[0]`` is used as
            the latitude and item ``[1]`` as the longitude.
        color: Colour name handed to ``folium.Icon``.
        map_object: Target of the markers; must provide ``add_child``.

    Raises:
        ValueError: If reading a pair, building its marker or attaching it
            fails. The original exception is chained as the cause.
    """
    for point in coordinates_list:
        try:
            latitude, longitude = point[0], point[1]

            marker = folium.Marker(
                location=[latitude, longitude],
                icon=folium.Icon(color=color),
            )
            map_object.add_child(marker)
        except Exception as error:
            raise ValueError(f"Error adding marker: {error}") from error

In [17]:
# Base map; centre [48, 20] and zoom level 8 presumably frame Slovakia —
# TODO confirm.
martinus_map = folium.Map(location=[48, 20], zoom_start=8)

# Existing Martinus shops are drawn in red.
add_marker_to_map(df_cities_w_Martinus["Coordinates"], "red", martinus_map)

# Uncomment to render the interactive map inline:
# martinus_map

![Slovakia Map](data/martinus.png)

In [18]:
# Attach each shop's city population.
# A `Population` column left over from an earlier run is dropped first, which
# keeps the cell re-runnable: merging a second time would otherwise produce
# `Population_x` / `Population_y` and break the later `["Population"]` lookups.
# NOTE(review): with how="left", a shop whose city name has no exact match in
# `df_cities` gets a null population — verify that every shop matched.
df_cities_w_Martinus = pd.merge(
    df_cities_w_Martinus.drop(columns="Population", errors="ignore"),
    df_cities,
    on="City",
    how="left",
)

df_cities_w_Martinus.head()

Unnamed: 0,City,Street,Coordinates,Population
0,Bratislava,Obchodná 26,"(48.14718, 17.11001)",475503
1,Bratislava,Staré Grunty 24,"(48.1581155, 17.06974372915665)",475503
2,Bratislava,Autobusová stanica Mlynské Nivy,"(48.1468327, 17.128482)",475503
3,Banská Bystrica,Ul. 29 augusta 37,"(48.7363849, 19.1632147)",76018
4,Lučenec,Námestie republiky 5994/32,"(48.3307838, 19.6611366)",25902


In [19]:
# Row label of the shop whose city has the lowest population.
min_population_index = df_cities_w_Martinus["Population"].idxmin()

# Full row for that shop; its "City" and "Population" are reused further down.
min_population_city = df_cities_w_Martinus.loc[min_population_index]

# The bare f-string is the cell's displayed value.
f"{min_population_city['City']} is the smallest city ({min_population_city['Population']}) with Martinus"

'Ilava is the smallest city (5572) with Martinus'

In [20]:
# Candidate cities: no Martinus shop yet, and more inhabitants than the
# smallest city that already has one.
has_no_shop = ~df_cities["City"].isin(martinus_cities_clean)
is_bigger = df_cities["Population"] > min_population_city["Population"]

# `.copy()` gives an independent frame, so the columns added to it in later
# cells are not written to a slice of `df_cities` (SettingWithCopyWarning).
df_cities_wo_Martinus = df_cities[has_no_shop & is_bigger].copy()

# The reference city is taken from the computed row instead of being spelled
# out, so the message stays correct if the scraped shop list changes.
print(
    f"{df_cities_wo_Martinus.shape[0]} cities are bigger than {min_population_city['City']} and do not have Martinus"
)
df_cities_wo_Martinus.sort_values("Population", ascending=False).head()

93 cities are bigger than Ilava and do not have Martinus


Unnamed: 0,City,Population
562,Trenčín,54740
489,Prievidza,45017
791,Nové Zámky,37791
2584,Michalovce,36704
2770,Spišská Nová Ves,35431


In [21]:
# Geocode each candidate city by its bare name.
# NOTE(review): a bare name can be ambiguous. In the stored output Modra sits
# at longitude ~22.04 while its neighbouring rows are all near 17, which looks
# like a different place with the same or a similar name was matched — verify
# before trusting the distances derived from this column.
df_cities_wo_Martinus["Coordinates"] = df_cities_wo_Martinus["City"].apply(
    get_coordinates_for_city
)

df_cities_wo_Martinus.head()

Unnamed: 0,City,Population,Coordinates
17,Malacky,18935,"(48.4362992, 17.0200447)"
21,Stupava,12595,"(48.273044, 17.0329528)"
32,Svätý Jur,5941,"(48.2541879, 17.2127253)"
34,Modra,9346,"(48.9450527, 22.0425486)"
35,Pezinok,24900,"(48.2854539, 17.270194)"


In [22]:
# Coordinates of every existing shop, extracted once up front.
shop_coordinates = df_cities_w_Martinus["Coordinates"].tolist()

# For each candidate city take the smallest haversine distance to any shop,
# rounded to two decimals. The column name says km; the actual unit is
# whatever `haversine_distance` returns — TODO confirm.
df_cities_wo_Martinus["Nearest_Martinus_km"] = df_cities_wo_Martinus.apply(
    lambda row: min(
        haversine_distance(row["Coordinates"], shop) for shop in shop_coordinates
    ),
    axis=1,
).round(2)

df_cities_wo_Martinus.head(5)

Unnamed: 0,City,Population,Coordinates,Nearest_Martinus_km
17,Malacky,18935,"(48.4362992, 17.0200447)",31.15
21,Stupava,12595,"(48.273044, 17.0329528)",13.07
32,Svätý Jur,5941,"(48.2541879, 17.2127253)",13.47
34,Modra,9346,"(48.9450527, 22.0425486)",9.69
35,Pezinok,24900,"(48.2854539, 17.270194)",18.65


In [23]:
# Most populous candidate cities first.
df_cities_wo_Martinus.sort_values("Population", ascending=False).head()

Unnamed: 0,City,Population,Coordinates,Nearest_Martinus_km
562,Trenčín,54740,"(48.8922719, 18.0387465)",18.6
489,Prievidza,45017,"(48.7718361, 18.6234916)",37.91
791,Nové Zámky,37791,"(47.9861843, 18.1631415)",36.25
2584,Michalovce,36704,"(48.7514383, 21.9211949)",20.07
2770,Spišská Nová Ves,35431,"(48.9435344, 20.562964)",22.83


In [24]:
# Candidate cities farthest from any existing shop first.
df_cities_wo_Martinus.sort_values("Nearest_Martinus_km", ascending=False).head()

Unnamed: 0,City,Population,Coordinates,Nearest_Martinus_km
599,Komárno,32967,"(47.7574079, 18.1298249)",61.36
80,Veľký Meder,8446,"(47.8557806, 17.7693671)",56.05
609,Hurbanovo,7467,"(47.8761227, 18.198313)",47.68
842,Štúrovo,9777,"(47.7978002, 18.7158785)",46.97
615,Kolárovo,10572,"(47.9182347, 17.9966591)",44.14


In [25]:
# The five candidates with the largest distance to an existing shop, ordered
# from nearest to farthest. `reset_index()` keeps the old row labels in an
# "index" column.
potential_shops = (
    df_cities_wo_Martinus.sort_values("Nearest_Martinus_km").tail(5).reset_index()
)

In [26]:
# Candidate locations are added to the existing Martinus map in green.
add_marker_to_map(potential_shops["Coordinates"], "green", martinus_map)
# Uncomment to render the interactive map inline:
# martinus_map

![Candidates](data/candidates.png)

In [27]:
coordinates = potential_shops["Coordinates"].tolist()

# Unweighted arithmetic mean, taken separately over the first components and
# over the second components of the candidate coordinates.
mean_coordinates = tuple(sum(axis) / len(axis) for axis in zip(*coordinates))

print(mean_coordinates)

(47.84106922, 18.16200852)


In [28]:
# The mean point of the candidates is added to the same map in black.
add_marker_to_map([mean_coordinates], "black", martinus_map)
# Uncomment to render the interactive map inline:
# martinus_map

![Average](data/GPS_average.png)

In [29]:
# Haversine distance from the mean point to every candidate city.
df_cities_wo_Martinus["Distance_to_Mean"] = [
    haversine_distance(mean_coordinates, coord)
    for coord in df_cities_wo_Martinus["Coordinates"]
]

# The candidate city closest to the mean point.
nearest_index = df_cities_wo_Martinus["Distance_to_Mean"].idxmin()
nearest_city = df_cities_wo_Martinus.loc[nearest_index]

print(nearest_city["City"])

Hurbanovo


In [30]:
panta_rhei_website: URL = "https://www.pantarhei.sk/predajne/"
panta_rhei_shops_path: CSSSelector = "#html-body > main > div.container.overflow-hidden > div.row.mt-2.mt-md-3.fz-90-p.gutter-25px-sm > div > address"

panta_rhei = BasicScraper(panta_rhei_website)

# One whitespace-trimmed text per scraped <address> element.
panta_rhei_shops = [
    shop.strip() for shop in panta_rhei.scrape(panta_rhei_shops_path)
]

# Hard-coded snapshot of the shop count; this fails once the website changes.
assert len(panta_rhei_shops) == 62, "Scraped number does not match real number of shops"

In [31]:
# Despite its name, this list holds geocoded coordinates, one per shop text.
# NOTE(review): earlier in the notebook two Martinus lookups came back without
# coordinates; a failed lookup here would leave a null entry that
# `add_marker_to_map` rejects with a ValueError — verify all shops resolved.
panta_rhei_adresses = list(map(get_coordinates_for_city, panta_rhei_shops))

In [32]:
# Fresh map for the combined view of both bookshop chains.
all_map = folium.Map(location=[48, 20], zoom_start=8)

# Coordinates of the city selected earlier as closest to the candidates' mean
# point. The variable is named after the city printed in the stored run; the
# value itself always follows `nearest_city`.
hurbanovo: Coordinates = nearest_city["Coordinates"]

# Red: Martinus shops; blue: Panta Rhei shops; black: the proposed location.
add_marker_to_map(df_cities_w_Martinus["Coordinates"], "red", all_map)
add_marker_to_map(panta_rhei_adresses, "blue", all_map)
add_marker_to_map([hurbanovo], "black", all_map)
# Uncomment to render the interactive map inline:
# all_map

<img src="data/map_w_panta_rhei.png" alt="Map with all shops" width="1800"/>