# Average Baseline Model

In [0]:
# ===
#   BASELINE MODEL (PySpark + MLlib)
#   Predicts mean target value from training set
#   Evaluates with RMSE using 5-fold CV and test set
# ===

# Step 1: Install and import PySpark
!pip install pyspark matplotlib
!apt-get update -qq > /dev/null
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Step 1.5: Load libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, lit
import numpy as np

# Step 2: Mount Google Drive so the dataset under /content/drive is readable
from google.colab import drive

drive.mount('/content/drive')

# Step 3: Start Spark session
# local[*] runs Spark inside this machine using all available cores;
# getOrCreate() reuses an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("BaselineModel")
    .getOrCreate()
)

# Step 4: Define file path
# Make sure the file is in *your own Drive*
data_path = "/content/drive/MyDrive/ait614_rutting2/data/processed/rutting_climate_traffic.csv"

# Step 5: Load dataset
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data.
# The DataFrame is cached because it is reused by many actions (the counts
# below and every later aggregation, evaluation and model fit on df or its
# splits). Without caching, each of those actions re-reads and re-parses the
# CSV from Drive. The count() call below materializes the cache.
df = spark.read.csv(data_path, header=True, inferSchema=True).cache()
print("Data loaded successfully!")
print(f"Total rows: {df.count()}")
df.printSchema()

# Step 6: Split dataset into train/test (80/20)
# The fixed seed keeps the split reproducible between runs.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
print("\nData Split:")
print(f"Training set size: {train_df.count()}")
print(f"Test set size: {test_df.count()}")

# Step 7: Define target variable
target = "MAX_MEAN_DEPTH_1_8"

# Step 8: Compute mean target from training set (baseline prediction)
# The aggregate returns exactly one row. F.mean ignores nulls and yields None
# when there is no non-null value to average (empty split or all-null target),
# so fail with a clear message instead of a formatting TypeError further down.
train_mean = train_df.select(F.mean(col(target))).first()[0]
if train_mean is None:
    raise ValueError(
        f"Cannot compute baseline: no non-null '{target}' values in the training set"
    )
# Force a plain Python float so the literal prediction column is double-typed,
# which is what RegressionEvaluator expects for predictions.
train_mean = float(train_mean)
print(f"\nAverage target value (baseline prediction): {train_mean:.3f}")

# Step 9: Add prediction column to test set
# Rows whose target is missing cannot contribute to an error metric and would
# break the evaluation, so they are dropped before the constant prediction is
# attached.
test_pred_df = (
    test_df
    .na.drop(subset=[target])
    .withColumn("prediction", lit(train_mean))
)

# Step 10: Evaluate RMSE on test set
evaluator = RegressionEvaluator(
    labelCol=target,
    predictionCol="prediction",
    metricName="rmse"
)

rmse_test = evaluator.evaluate(test_pred_df)
print("\n=== Baseline Model Results ===")
print(f"Average target value (baseline prediction): {train_mean:.3f}")
print(f"Test RMSE: {rmse_test:.3f}")

# Step 11: Compute 5-fold CV RMSE on training set
# Trick: a LinearRegression fitted on an empty feature vector can only learn
# its intercept, which mimics the constant-prediction baseline while still
# letting CrossValidator handle the fold splitting and metric averaging.
no_features = VectorAssembler(inputCols=[], outputCol="features")  # zero-length vector
intercept_only_lr = LinearRegression(
    labelCol=target,
    featuresCol="features",
    predictionCol="prediction",
    fitIntercept=True,
)
baseline_pipeline = Pipeline(stages=[no_features, intercept_only_lr])

# One empty param map: nothing is tuned, CV is used purely for the fold-averaged RMSE.
single_config = [{}]
cv = CrossValidator(
    estimator=baseline_pipeline,
    estimatorParamMaps=single_config,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
    seed=42,
)

cv_model = cv.fit(train_df)
# avgMetrics has one entry per param map; with a single map that entry is the result.
best_cv_rmse = cv_model.avgMetrics[0]
print(f"Average 5-Fold CV RMSE (baseline): {best_cv_rmse:.3f}")

# Step 12: Stop Spark session
spark.stop()
