# Gradient-Boosted Decision Trees Model

In [0]:
# ===
# PySpark Gradient-Boosted Trees with Hyperparameter Tuning
# Hyperparameter tuning + 5-fold CV + Feature Importances
# ===

# Step 1: Install and import dependencies
# The lines starting with `!` are notebook shell escapes, not Python, so this
# cell only runs inside a Jupyter/Colab-style environment.
!pip install pyspark matplotlib
# Refresh the apt package index, then install a headless Java 11 JDK.
# Command output is redirected to /dev/null to keep the cell output short.
!apt-get update -qq > /dev/null
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
import os
# Export JAVA_HOME and put its bin/ directory at the front of PATH so the JDK
# installed above is the one picked up when Spark is started later on.
# NOTE(review): the path assumes the amd64 Debian/Ubuntu install location for
# this package -- confirm if the notebook is run on a different image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Step 1.5: Load modules
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import pandas as pd

# Step 2: Mount Google Drive
# The dataset is read from a Drive path, so Drive must be mounted first.
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Start Spark session
# Local-mode session ("local[*]"); reuses an existing session if one is
# already running in this runtime.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GBTTuning")
    .getOrCreate()
)

# Step 4: Load dataset
# The first CSV row is treated as the header and column types are inferred
# from the data rather than declared up front.
data_path = "/content/drive/MyDrive/ait614_rutting2/data/processed/rutting_climate_traffic.csv"
df = spark.read.csv(
    path=data_path,
    inferSchema=True,
    header=True,
)

# Quick sanity check of what was loaded: row count and inferred schema.
total_rows = df.count()
print(f"Total rows: {total_rows}")
df.printSchema()

# Step 5: Define features and target
# Input columns, in the order they are assembled into the feature vector.
# The order matters: feature importances are matched back to these names by
# position later in the script.
features = [
    "REL_HUM_AVG_AVG",
    "PRECIPITATION",
    "EVAPORATION",
    "PRECIP_DAYS",
    "CLOUD_COVER_AVG",
    "SHORTWAVE_SURFACE_AVG",
    "TEMP_AVG",
    "FREEZE_INDEX",
    "FREEZE_THAW",
    "WIND_VELOCITY_AVG",
    # AADTT_VEH_CLASS_<n>_TREND for n = 4 .. 13 (ten columns).
    *(f"AADTT_VEH_CLASS_{vehicle_class}_TREND" for vehicle_class in range(4, 14)),
]

# Column the regressor is trained to predict.
target = "MAX_MEAN_DEPTH_1_8"

# Step 6: Train/test split (80/20)
# The seed fixes the random assignment of rows to the two splits.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# train_df is scanned many times: by count() just below and again every time
# the cross-validation step fits on it. Caching keeps the parsed rows after
# the first scan instead of re-reading and re-parsing the CSV per action.
train_df = train_df.cache()

print(f"Training set size: {train_df.count()}")
print(f"Test set size: {test_df.count()}")
print(f"Number of features: {len(features)}")
# There is a single target column, so this always reports 1.
print(f"Number of targets: {len([target])}")

# Step 7: Vector Assembler
# Combines the individual input columns into the single "features" vector
# column that the regressor reads.
# NOTE(review): no handleInvalid setting is given, so null inputs are handled
# by the assembler's default policy -- confirm the processed CSV has no
# missing values in these columns.
assembler = VectorAssembler(outputCol="features", inputCols=features)

# Step 8: Define GBT Regressor
# Only the column wiring and the seed are fixed here; depth, iteration count
# and step size are chosen by the hyperparameter search.
gbt = GBTRegressor(featuresCol="features", labelCol=target, seed=42)

# Step 9: Pipeline with assembler + model
# Stage order matters: the vector column must exist before the model runs.
pipeline_stages = [assembler, gbt]
pipeline = Pipeline(stages=pipeline_stages)

# Step 10: Hyperparameter Grid
# Two values per parameter -> 2 x 2 x 2 = 8 candidate configurations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(gbt.maxDepth, [3, 5])      # depth
grid_builder.addGrid(gbt.maxIter, [20, 50])     # iterations
grid_builder.addGrid(gbt.stepSize, [0.1, 0.2])  # learning rate
paramGrid = grid_builder.build()

# Step 11: Evaluator
# Scores predictions against the target column using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=target,
    predictionCol="prediction",
)

# Step 12: Cross-validation
# Every grid entry is scored with 5-fold CV on the training split. Up to two
# candidate models are fitted concurrently (parallelism=2) and the fold
# assignment is seeded.
cv_settings = {
    "estimator": pipeline,
    "estimatorParamMaps": paramGrid,
    "evaluator": evaluator,
    "numFolds": 5,
    "parallelism": 2,
    "seed": 42,
}
cv = CrossValidator(**cv_settings)

print("Training Gradient-Boosted Trees with 5-fold CV (this may take a while)...")
cv_model = cv.fit(train_df)

# Step 13: Best RMSE from CV
# avgMetrics has one cross-validated RMSE per grid entry. RMSE is
# lower-is-better, so the minimum is the score of the selected configuration.
best_cv_rmse = min(cv_model.avgMetrics)
print("\n=== Gradient-Boosted Trees Results ===")
print(f"5-Fold CV RMSE (best hyperparameters): {best_cv_rmse:.3f}")

# Step 14: Extract best model
# bestModel is the fitted pipeline; its last stage is the GBT model itself.
best_model = cv_model.bestModel
best_gbt = best_model.stages[-1]

# Score the selected pipeline on the 20% hold-out split. The split was
# created earlier but never used, so without this step the only reported
# error comes from the data that was also used to pick the hyperparameters.
test_rmse = evaluator.evaluate(best_model.transform(test_df))
print(f"Test RMSE (held-out 20%): {test_rmse:.3f}")

# Step 15: Print best hyperparameters
# Read the tuned values back from the fitted GBT stage.
print("\n=== Best Hyperparameters ===")
print(f"maxDepth: {best_gbt.getOrDefault('maxDepth')}")
print(f"maxIter: {best_gbt.getOrDefault('maxIter')}")
print(f"stepSize: {best_gbt.getOrDefault('stepSize')}")

# Step 16: Feature Importances
# Pair every input column with its importance score (scores are indexed in
# the same order as `features`), then rank from most to least important.
importances = best_gbt.featureImportances
feature_importances = sorted(
    ((name, importances[idx]) for idx, name in enumerate(features)),
    key=lambda pair: pair[1],
    reverse=True,
)

fi_df = pd.DataFrame(feature_importances, columns=["Feature", "Importance"])

# Horizontal bar chart; the y-axis is flipped so the top-ranked feature is
# drawn at the top of the figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(fi_df["Feature"], fi_df["Importance"])
ax.invert_yaxis()
ax.set_title("GBT Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()

# Stop Spark session
# Must stay the last Spark-related statement in the script: `spark` cannot be
# used for further DataFrame or model work once it has been stopped.
spark.stop()
