In [None]:
# Cell 1 — SparkSession & Read Merged Yearly Data
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
import os

# JDBC connection parameters for the Postgres database.
# Every value except the driver class can be overridden through an environment
# variable, so credentials do not have to live in the notebook. The fallbacks
# are the values that used to be hard-coded here, so a run with no environment
# set behaves exactly as before.
# NOTE(review): the DB_* variable names are a proposal — align them with
# whatever the scheduler / deployment actually exports.
def load_db_config(env=None):
    """Build the JDBC connection settings.

    Parameters
    ----------
    env : mapping, optional
        Where overrides are looked up. Defaults to ``os.environ``.

    Returns
    -------
    dict
        Keys ``host``, ``port``, ``user``, ``password``, ``db`` and ``driver``.
    """
    if env is None:
        env = os.environ
    return {
        "host":     env.get("DB_HOST", "localhost"),
        "port":     env.get("DB_PORT", "5432"),
        "user":     env.get("DB_USER", "postgres"),
        # Fallback kept for local development only; set DB_PASSWORD elsewhere.
        "password": env.get("DB_PASSWORD", "1234"),
        "db":       env.get("DB_NAME", "tourism"),
        "driver":   "org.postgresql.Driver",
    }


DB_CONFIG = load_db_config()

# Create (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver is requested through spark.jars.packages.
session_builder = SparkSession.builder.appName("08_cluster")
session_builder = session_builder.config(
    "spark.jars.packages", "org.postgresql:postgresql:42.6.0"
)
spark = session_builder.getOrCreate()

# Limit Spark's own logging to warnings and above.
spark.sparkContext.setLogLevel("WARN")

# JDBC endpoint and connection properties, both derived from DB_CONFIG.
jdbc_url = "jdbc:postgresql://{host}:{port}/{db}".format(**DB_CONFIG)
props = {key: DB_CONFIG[key] for key in ("user", "password", "driver")}

# Read the merged yearly table (occupancy + capacity) into a Spark DataFrame.
df = spark.read.jdbc(
    url=jdbc_url,
    table="tourism_merged_yearly",
    properties=props,
)

# Names of the 27 EU member states, in alphabetical order (one initial letter
# per line). Used to filter the "geo" column.
eu_states = [
    "Austria",
    "Belgium", "Bulgaria",
    "Croatia", "Cyprus", "Czechia",
    "Denmark",
    "Estonia",
    "Finland", "France",
    "Germany", "Greece",
    "Hungary",
    "Ireland", "Italy",
    "Latvia", "Lithuania", "Luxembourg",
    "Malta",
    "Netherlands",
    "Poland", "Portugal",
    "Romania",
    "Slovakia", "Slovenia", "Spain", "Sweden",
]


In [None]:
# Cell 2 — Build country_summary for clustering
# One row per EU country: the mean of occupancy_rate and the mean of
# log_capacity_sum over that country's rows in the merged yearly table.
country_summary = (
    df.filter(col("geo").isin(eu_states))
      .groupBy("geo")
      .agg(
        avg("occupancy_rate").alias("mean_occupancy"),
        avg("log_capacity_sum").alias("mean_log_capacity")
      )
      # Spark does not guarantee the row order of an aggregation result, and
      # the seeded k-means fit downstream depends on row order. Sorting here
      # gives every run the same input order, so cluster ids are reproducible.
      .orderBy("geo")
      .toPandas()
)

print("Country summary:")
print(country_summary.head())


In [None]:
# Cell 3 — Silhouette search & KMeans clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Feature columns and the k-means settings shared by the model-selection loop
# and the final fit.
feature_cols = ["mean_occupancy", "mean_log_capacity"]
kmeans_params = {
    "random_state": 42,
    # Pin the number of restarts. scikit-learn 1.4 changed the default from 10
    # to "auto", so leaving it unset makes the result depend on the installed
    # version (and raises a FutureWarning on 1.2/1.3).
    "n_init": 10,
}

# Fail early, naming the affected countries, if an aggregate is missing;
# otherwise the error only appears later as a generic NaN complaint from KMeans.
rows_missing = country_summary[feature_cols].isna().any(axis=1)
if rows_missing.any():
    raise ValueError(
        "Missing feature values for: "
        f"{sorted(country_summary.loc[rows_missing, 'geo'])}"
    )

# scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(country_summary[feature_cols])

# find best k
# silhouette_score is only defined for 2 <= k <= n_samples - 1, so cap the
# search at the number of countries actually present (normally 2..6).
n_countries = X_scaled.shape[0]
max_k = min(6, n_countries - 1)
if max_k < 2:
    raise ValueError(
        f"Need at least 3 countries to run the silhouette search, got {n_countries}"
    )

scores = []
for k in range(2, max_k + 1):
    km = KMeans(n_clusters=k, **kmeans_params)
    labels = km.fit_predict(X_scaled)
    scores.append((k, silhouette_score(X_scaled, labels)))

# max() returns the first maximum, so ties resolve to the smallest k.
best_k, best_score = max(scores, key=lambda x: x[1])
print(f"Silhouette scores by k: {scores}")
print(f"→ Selecting k = {best_k} (score={best_score:.3f})")

# fit final model (same settings as the search, so the labels match that run)
kmeans = KMeans(n_clusters=best_k, **kmeans_params)
country_summary["cluster"] = kmeans.fit_predict(X_scaled)

print("\nCluster assignments:")
print(country_summary.sort_values("cluster")[["geo","mean_occupancy","mean_log_capacity","cluster"]])


In [None]:
# Cell 4 — Persist clusters to Postgres
# The (geo, cluster) pairs are written to the "tourism_clusters" table in
# overwrite mode so the mapping notebook can read them.
assignments = country_summary.loc[:, ["geo", "cluster"]]
cluster_sdf = spark.createDataFrame(assignments)

cluster_sdf.write.jdbc(
    url=jdbc_url,
    table="tourism_clusters",
    mode="overwrite",
    properties=props,
)

print("Written cluster assignments to table 'tourism_clusters'")
