In [None]:
# Cell 1 — Read cluster table & prepare for GIS merge
from pyspark.sql import SparkSession
import geopandas as gpd
import pandas as pd
import os

# JDBC params
# Connection settings are taken from the standard libpq environment variables
# (PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE) so that real credentials do
# not have to be stored in the notebook. Each fallback is the value that used
# to be hard-coded here, so the notebook behaves exactly as before when none
# of the variables are exported.
DB_CONFIG = {
    "host":     os.environ.get("PGHOST", "localhost"),
    "port":     os.environ.get("PGPORT", "5432"),
    "user":     os.environ.get("PGUSER", "postgres"),
    # NOTE(review): the fallback is a local-development placeholder only;
    # export PGPASSWORD for any shared or non-local database.
    "password": os.environ.get("PGPASSWORD", "1234"),
    "db":       os.environ.get("PGDATABASE", "tourism"),
    # JDBC driver class name passed through to Spark's JDBC reader.
    "driver":   "org.postgresql.Driver"
}

# Build the Spark session step by step: name the application, request the
# PostgreSQL JDBC driver via spark.jars.packages, then get or create the session.
_spark_builder = SparkSession.builder.appName("09_map")
_spark_builder = _spark_builder.config(
    "spark.jars.packages", "org.postgresql:postgresql:42.6.0"
)
spark = _spark_builder.getOrCreate()

# Limit Spark's console logging to warnings and above.
spark.sparkContext.setLogLevel("WARN")

# JDBC connection URL assembled from the host, port and database name in DB_CONFIG.
jdbc_url = "jdbc:postgresql://{host}:{port}/{db}".format(**DB_CONFIG)

# Connection properties handed to the Spark JDBC reader.
props = {key: DB_CONFIG[key] for key in ("user", "password", "driver")}

# Load cluster assignments
# Read the "tourism_clusters" table over JDBC, then pull it into pandas.
_clusters_sdf = spark.read.jdbc(
    url=jdbc_url,
    table="tourism_clusters",
    properties=props,
)
clusters_df = _clusters_sdf.toPandas()

# Quick preview of the first rows.
print("Loaded clusters:", clusters_df.head(), sep="\n")

# Map cluster IDs → names
# The position in this list is the cluster ID (0, 1, 2, 3).
cluster_map = dict(enumerate([
    "Emerging Markets",
    "Mid‑Tier Markets",
    "High‑Volume Markets",
    "Island Specialists",
]))
# Cluster IDs that are not keys of cluster_map become NaN in "cluster_name".
clusters_df["cluster_name"] = clusters_df["cluster"].map(cluster_map)

# EU states list
# Every value of the "geo" column, in table order.
eu_states = clusters_df["geo"].to_list()


In [None]:
# Cell 2 — Load GeoJSON & merge with cluster info
# using GISCO 10m 2024 boundaries
url = (
    "https://gisco-services.ec.europa.eu/"
    "distribution/v2/countries/geojson/"
    "CNTR_RG_10M_2024_4326.geojson"
)
gdf = gpd.read_file(url)

# The join key is the English country name. Any name in the cluster table that
# has no exact NAME_ENGL match is dropped by the filter and inner merge below,
# so report those names instead of letting them vanish from the map silently.
gisco_names = set(gdf["NAME_ENGL"])
unmatched_states = [name for name in eu_states if name not in gisco_names]
if unmatched_states:
    print(f"WARNING: no GISCO boundary matched for: {unmatched_states}")

# Keep only the boundaries whose name appears in the cluster table.
gdf_eu = gdf[gdf["NAME_ENGL"].isin(eu_states)]

# merge on country name (inner join: only countries present on both sides)
gdf_eu = gdf_eu.merge(
    clusters_df[["geo", "cluster_name"]],
    left_on="NAME_ENGL",
    right_on="geo",
    how="inner",
)


In [None]:
# Cell 3 — Choropleth of EU clusters
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# choose a color palette
# Four fixed hex colours wrapped in a discrete colormap.
colors = ["#377eb8", "#e41a1c", "#4daf4a", "#984ea3"]
cmap = mcolors.ListedColormap(colors)

# Single-panel figure for the categorical map.
fig, ax = plt.subplots(figsize=(10, 8))

# Colour each country by its cluster name; thin white borders between countries.
gdf_eu.plot(
    ax=ax,
    column="cluster_name",
    categorical=True,
    cmap=cmap,
    edgecolor="white",
    linewidth=0.5,
    legend=True,
    legend_kwds={"title": "Market Segment", "loc": "lower left"},
)

ax.set_title("EU Country Segments by Occupancy/Capacity", fontsize=16, pad=12)
# Hide the axis frame, ticks and labels.
ax.set_axis_off()
fig.tight_layout()
plt.show()


In [None]:
# Cell 4 — Choropleth + Labels for Major Countries
import matplotlib.patheffects as pe

# Countries that receive a text label on the map.
majors = ["Ireland","France","Spain","Germany","Finland","Italy","Poland","Greece"]

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
gdf_eu.plot(
    column="cluster_name",
    categorical=True,
    cmap=cmap,
    legend=True,
    ax=ax,
    edgecolor="white",
    linewidth=0.5,
    legend_kwds={"title": "Market Segment", "loc": "lower left"}
)

# annotate majors
# Select the labelled rows once instead of scanning every row of the frame.
major_rows = gdf_eu[gdf_eu["NAME_ENGL"].isin(majors)]
for name, geom in zip(major_rows["NAME_ENGL"], major_rows.geometry):
    if geom is None or geom.is_empty:
        # Nothing to anchor a label to.
        continue
    # representative_point() always lies inside the geometry. A centroid can
    # fall outside it for multi-part shapes (e.g. a country with distant
    # islands or territories), which would put the label over sea or over a
    # neighbouring country.
    anchor = geom.representative_point()
    ax.text(
        anchor.x, anchor.y, name,
        ha="center", va="center",
        fontsize=9, fontweight="bold", color="white",
        # Black outline keeps the white text readable on any fill colour.
        path_effects=[pe.withStroke(linewidth=2, foreground="black")]
    )

ax.set_title("EU Market Segments (labeled)", fontsize=16, pad=12)
ax.axis("off")
plt.tight_layout()
plt.show()
