# Random Forest Model

In [0]:
# ===
# PySpark Random Forest Regressor with Hyperparameter Tuning
# Hyperparameter tuning + 5-fold CV + Feature Importances + Testing
# ===

# Step 1: Install and import dependencies
!pip install pyspark matplotlib
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Step 1.5: Load modules
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import pandas as pd

# Step 2: Mount Google Drive (the dataset path below lives under /content/drive)
from google.colab import drive
drive.mount("/content/drive")

# Step 2.5: Start Spark session (getOrCreate reuses an already-running session)
spark = (
    SparkSession.builder
    .appName("RandomForestTuning")
    .getOrCreate()
)

# Step 3: Load dataset
# The first CSV row is used as the header and column types are inferred.
data_path = "/content/drive/MyDrive/ait614_rutting2/data/processed/rutting_climate_traffic.csv"
df = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)

# Sanity check: total row count followed by the inferred schema
print(f"Total rows: {df.count()}")
df.printSchema()

# Step 4: Define features and target
# `target` is the column the regressor is trained to predict.
target = "MAX_MEAN_DEPTH_1_8"

# `features` lists the input columns in the order they are fed to the model:
# first the weather/climate-style columns, then one trend column per vehicle
# class (4 through 13).
features = [
    # Weather / climate columns
    "REL_HUM_AVG_AVG",
    "PRECIPITATION",
    "EVAPORATION",
    "PRECIP_DAYS",
    "CLOUD_COVER_AVG",
    "SHORTWAVE_SURFACE_AVG",
    "TEMP_AVG",
    "FREEZE_INDEX",
    "FREEZE_THAW",
    "WIND_VELOCITY_AVG",
] + [
    # Per-vehicle-class trend columns (classes 4-13)
    "AADTT_VEH_CLASS_4_TREND",
    "AADTT_VEH_CLASS_5_TREND",
    "AADTT_VEH_CLASS_6_TREND",
    "AADTT_VEH_CLASS_7_TREND",
    "AADTT_VEH_CLASS_8_TREND",
    "AADTT_VEH_CLASS_9_TREND",
    "AADTT_VEH_CLASS_10_TREND",
    "AADTT_VEH_CLASS_11_TREND",
    "AADTT_VEH_CLASS_12_TREND",
    "AADTT_VEH_CLASS_13_TREND",
]

# Step 6: Split dataset into train/test (80/20)
# Spark DataFrames are lazy: without caching, every action that touches a
# split (the two counts below, each model fit, the test-set scoring) would
# re-read the CSV and redo the random split. Caching materialises each split
# once and reuses it; the fixed seed keeps the split itself repeatable.
train_df, test_df = (
    split.cache() for split in df.randomSplit([0.8, 0.2], seed=42)
)

# Report split sizes and problem dimensions (the counts also populate the cache)
print(f"Training set size: {train_df.count()}")
print(f"Test set size: {test_df.count()}")
print(f"Number of features: {len(features)}")
print(f"Number of targets: {len([target])}")

# Step 7: Assemble feature vector
# Combines the columns named in `features` into one vector column, "features".
assembler = VectorAssembler(outputCol="features", inputCols=features)

# Step 8: Define Random Forest Regressor
# Consumes the assembled "features" column and predicts `target`;
# the seed is fixed at 42.
rf = RandomForestRegressor(seed=42, featuresCol="features", labelCol=target)

# Step 9: Build pipeline
# Stage order matters: the assembler must run before the regressor so that
# the "features" column exists when the forest is fitted.
pipeline = Pipeline(stages=[assembler, rf])

# Step 10: Define hyperparameter grid for tuning
# Four parameters with two candidate values each -> 2*2*2*2 = 16 combinations.
grid_builder = ParamGridBuilder()
# default=20, try 100 for improvement
grid_builder.addGrid(rf.numTrees, [20, 100])
# default=5, test deeper trees
grid_builder.addGrid(rf.maxDepth, [5, 10])
# default=1, stricter leaf requirement
grid_builder.addGrid(rf.minInstancesPerNode, [1, 4])
# common options
grid_builder.addGrid(rf.featureSubsetStrategy, ["auto", "sqrt"])
paramGrid = grid_builder.build()

# Step 11: Define evaluator
# Scores the "prediction" column against the `target` column using RMSE.
evaluator = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol=target)

# Step 12: Cross-validation for hyperparameter tuning
# Every parameter map in `paramGrid` is fitted on the full pipeline and scored
# with `evaluator` over 5 folds; up to 2 models are trained in parallel.
cv = CrossValidator(
    numFolds=5,  # 5-fold CV
    parallelism=2,
    seed=42,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)

print("Training Random Forest with 5-fold CV (this may take several minutes)...")
# Only the training split is used for tuning; the test split stays held out.
cv_model = cv.fit(train_df)

# Step 13: Get 5-fold CV RMSE for best model configuration
# avgMetrics holds one fold-averaged RMSE per parameter combination; RMSE is
# lower-is-better, so the minimum is the score of the winning combination.
best_cv_rmse = min(cv_model.avgMetrics)

# Step 14: Get best model (the fitted pipeline for the winning combination)
best_model = cv_model.bestModel

# The regressor is the last stage of the pipeline.
best_rf = best_model.stages[-1]

print("\n=== Random Forest Results ===")
print(f"5-Fold CV RMSE (best hyperparameters): {best_cv_rmse:.3f}")

# Step 15: Show best hyperparameters
print("\n=== Best Hyperparameters ===")
for param_name in ("numTrees", "maxDepth", "minInstancesPerNode", "featureSubsetStrategy"):
    print(f"{param_name}: {best_rf.getOrDefault(param_name)}")

# Step 16: Feature Importance Plot
# Pair each importance with the feature name at the same position in
# `features` (the list the assembler was built from), then rank them from
# most to least important.
importances = best_rf.featureImportances
feature_importances = sorted(
    zip(features, importances.toArray()),
    key=lambda pair: pair[1],
    reverse=True,
)

fi_df = pd.DataFrame(feature_importances, columns=["Feature", "Importance"])

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 6))
plt.barh(fi_df["Feature"], fi_df["Importance"], color="steelblue")
# barh draws the first row at the bottom; invert so the top-ranked feature is on top.
plt.gca().invert_yaxis()
plt.title("Random Forest Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

In [0]:
# --- Evaluate best Random Forest model on the test set ---
# Run the fitted pipeline on the held-out split to add a "prediction" column.
test_predictions = best_model.transform(test_df)

# Score the predictions with the same RMSE evaluator used during tuning
rmse_test = evaluator.evaluate(test_predictions)

print(
    "\n=== Random Forest Test Set Evaluation ===",
    f"Test RMSE: {rmse_test:.3f}",
    sep="\n",
)

In [0]:
# Step 17: Stop Spark session
# Run this last: `spark` and every DataFrame/model derived from it are
# unusable once the session has been stopped.
spark.stop()