# In [None]:
# Cell 1 — SparkSession & Read Data
from pyspark.sql import SparkSession
import os

# Connection settings for the PostgreSQL source database.
# Each value can be overridden through a TOURISM_DB_* environment variable so
# that credentials do not have to live in the notebook; the literals are the
# previous hard-coded values, kept as defaults for local development.
DB_CONFIG = {
    "host":     os.environ.get("TOURISM_DB_HOST", "localhost"),
    "port":     os.environ.get("TOURISM_DB_PORT", "5432"),
    "user":     os.environ.get("TOURISM_DB_USER", "postgres"),
    "password": os.environ.get("TOURISM_DB_PASSWORD", "1234"),
    "db":       os.environ.get("TOURISM_DB_NAME", "tourism"),
    "driver":   "org.postgresql.Driver",  # JDBC driver class name
}

# Build (or reuse) the Spark session; the PostgreSQL JDBC driver is requested
# as a package so that spark.read.jdbc can reach the database.
session_builder = SparkSession.builder.appName("07_model")
session_builder = session_builder.config(
    "spark.jars.packages", "org.postgresql:postgresql:42.6.0"
)
spark = session_builder.getOrCreate()

# Keep the console output down to warnings and errors.
spark.sparkContext.setLogLevel("WARN")

# Assemble the JDBC URL and the connection properties from DB_CONFIG.
jdbc_url = "jdbc:postgresql://{host}:{port}/{db}".format(**DB_CONFIG)
props = {key: DB_CONFIG[key] for key in ("user", "password", "driver")}

# load the merged table and pull it into a pandas DataFrame on the driver
merged_sdf = spark.read.jdbc(jdbc_url, "tourism_merged_yearly", properties=props)
pdf = merged_sdf.toPandas()

# Cell 2 — Modeling with XGBoost & RF
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

# prepare features
FEATURE_COLS = ["lag1_occ", "log_capacity_sum", "year"]
TARGET_COL = "occupancy_rate"

# Lagged target: the previous observation of occupancy_rate within each geo.
# shift(1) runs on rows ordered by (geo, year); the assignment then aligns the
# result back onto pdf by index label, so pdf itself is not reordered here.
# NOTE(review): this equals "last year's occupancy" only if every geo has
# consecutive years with no gaps — confirm against the merged table.
pdf["lag1_occ"] = (
    pdf.sort_values(["geo", "year"]).groupby("geo")[TARGET_COL].shift(1)
)

# Drop only rows missing a value the models actually use. A bare dropna()
# would also discard rows whose NaN sits in an unrelated column.
pdf = pdf.dropna(subset=FEATURE_COLS + [TARGET_COL])

# Rows read over JDBC carry no ordering guarantee. Order them chronologically
# so that an unshuffled train/test split is reproducible and holds out the
# most recent years instead of an arbitrary tail of the result set.
pdf = pdf.sort_values(["year", "geo"]).reset_index(drop=True)

X = pdf[FEATURE_COLS]
y = pdf[TARGET_COL]

# split & train XGBoost
# shuffle=False preserves row order: the trailing 20% of rows become the
# test set and everything before them is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, random_state=42
)
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# metrics on the held-out rows
rmse, mae, r2 = (
    sqrt(mean_squared_error(y_test, y_pred)),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
print(f"XGB → RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}")

# Fit a random forest on the same training rows and score it on the test rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)

# Root-mean-squared error of the forest's test predictions.
mse_rf = mean_squared_error(y_test, y_rf)
rmse_rf = sqrt(mse_rf)
print(f"RF  → RMSE: {rmse_rf:.2f}")

# feature importances
import matplotlib.pyplot as plt
import numpy as np
# Horizontal bar chart of the fitted forest's feature importances,
# one bar per training column.
feat_imp = rf.feature_importances_
feat_names = X_train.columns
ypos = np.arange(len(feat_names))

fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(ypos, feat_imp, edgecolor='k')
ax.set_yticks(ypos)
ax.set_yticklabels(feat_names)
ax.set_xlabel("Importance")
ax.set_title("RF Feature Importances")
plt.show()

# Residuals (actual minus predicted) of the random forest on the test rows,
# plotted against the predictions; the dashed line marks zero error.
residuals = y_test - y_rf

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_rf, residuals, alpha=0.7, edgecolor='k')
ax.axhline(0, linestyle='--')
ax.set_xlabel("Predicted Occupancy")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs Predicted (RF)")
plt.show()

# Cell 3 — Save Model
import joblib
# Serialize the fitted random forest; the relative path resolves against the
# current working directory.
model_path = "tourism_rf_model.pkl"
joblib.dump(rf, model_path)
print("RF model saved")
