# MODEL PERFORMANCE EVALUATION


## 1. Prepare the dataset

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType, DoubleType
from pyspark.ml.feature import FeatureHasher, VectorAssembler

# Load the raw readings and normalise the types of the columns used below.
raw_readings = spark.table("data.energy_volume.energy_power_data")

typed = raw_readings.withColumn(
    "timestamp", F.col("timestamp").cast(TimestampType())
)
for metric in ("energy_kWh", "carbon_kg"):
    typed = typed.withColumn(metric, F.col(metric).cast(DoubleType()))

# Derive the hour of day, then drop every row with a null in any column.
df = typed.withColumn("hour", F.hour("timestamp")).dropna()

# Hash the categorical columns into a fixed-width (32 slot) vector instead of
# one-hot encoding them, which keeps the feature vector small.
hasher = FeatureHasher(
    numFeatures=32,
    inputCols=["device_type", "location"],
    outputCol="hashed_features",
)
hashed = hasher.transform(df)

# Append the numeric columns to the hashed categoricals so that the model
# sees a single combined "features" vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["hashed_features", "hour", "energy_kWh"],
)
assembled = assembler.transform(hashed)
model_data = assembled.select("features", "carbon_kg")

# Reproducible 80/20 train/test split.
# NOTE: section 2 rebinds `assembler`, `train` and `test`, so the models
# fitted afterwards use that later split rather than this one.
train, test = model_data.randomSplit([0.8, 0.2], seed=42)

## 2. Create modeling dataset (only numeric features)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Build a feature vector that contains only the numeric "hour" column.
# NOTE: this rebinds `assembler`, `train` and `test` from section 1; every
# model fitted after this cell is trained on the hour-only features.
assembler = VectorAssembler(outputCol="features", inputCols=["hour"])
hour_features = assembler.transform(df)
data = hour_features.select("features", "carbon_kg")

# Reproducible 80/20 train/test split.
train, test = data.randomSplit([0.8, 0.2], seed=42)

## 3. Generalized linear regression

In [0]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Fit a generalized linear model (estimator defaults for everything except
# the column names) on the training split, then predict on the test split.
glm = GeneralizedLinearRegression(
    labelCol="carbon_kg",
    featuresCol="features",
)
glm_model = glm.fit(train)
pred_glm = glm_model.transform(test)

## 4. Decision trees

In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a shallow (depth <= 3) regression tree on the training split, then
# predict on the test split.
dt = DecisionTreeRegressor(
    labelCol="carbon_kg",
    featuresCol="features",
    maxDepth=3,
)
dt_model = dt.fit(train)
pred_dt = dt_model.transform(test)

## 5. Small sample GBT

In [0]:
from pyspark.ml.regression import GBTRegressor

# Draw an 8% sample (without replacement, fixed seed) from each split so the
# boosted trees are trained and scored on a much smaller dataset.
# NOTE(review): predictions are made on the sampled test split, so the GBT
# metrics are not computed over the same rows as the GLM / decision tree.
small_train = train.sample(withReplacement=False, fraction=0.08, seed=42)
small_test = test.sample(withReplacement=False, fraction=0.08, seed=42)

# Small ensemble: 5 boosting iterations of depth-3 trees.
gbt = GBTRegressor(
    labelCol="carbon_kg",
    featuresCol="features",
    maxDepth=3,
    maxIter=5,
)
gbt_model = gbt.fit(small_train)
pred_gbt = gbt_model.transform(small_test)

## 6. Isolation forest to detect anomalies

In [0]:
import numpy as np
# IsolationForest was used below without ever being imported, which made this
# cell fail with a NameError.
from sklearn.ensemble import IsolationForest

# Collect both splits onto the driver as pandas DataFrames; scikit-learn
# works on in-memory arrays, not on Spark DataFrames.
train_pd = train.toPandas()
test_pd = test.toPandas()

# Stack the per-row Spark ML vectors into 2-D numpy matrices.
# Vector.toArray() is used because it handles both dense and sparse vectors.
train_features = np.vstack([vec.toArray() for vec in train_pd["features"]])
test_features = np.vstack([vec.toArray() for vec in test_pd["features"]])

# Fit on the training rows only, expecting roughly 3% of them to be outliers.
iso = IsolationForest(
    contamination=0.03,
    random_state=42
)
iso.fit(train_features)

# predict() labels outliers as -1 and inliers as 1; decision_function()
# returns a score where lower (negative) values are more anomalous.
test_pd["anomaly_label"] = iso.predict(test_features)
test_pd["anomaly_score"] = iso.decision_function(test_features)

## 7. Evaluate

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
# Named `evaluator` rather than `eval` so that the built-in eval() is not
# shadowed for the rest of the notebook session.
evaluator = RegressionEvaluator(labelCol="carbon_kg", predictionCol="prediction")


def score(pred, name: str) -> None:
    """Print the RMSE and R² of a prediction DataFrame.

    Args:
        pred: DataFrame containing the "carbon_kg" label column and the
            "prediction" column that the shared evaluator reads.
        name: Label printed in front of the metrics to identify the model.
    """
    # The metric is overridden per call so one evaluator serves both metrics.
    rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})
    print(f"{name} → RMSE: {rmse:.4f}, R²: {r2:.4f}")


score(pred_glm, "GLM")
score(pred_dt, "Decision Tree")
score(pred_gbt, "GBT (sampled)")

GLM → RMSE: 0.3919, R²: -0.0005
Decision Tree → RMSE: 0.3921, R²: -0.0014
GBT (sampled) → RMSE: 0.3634, R²: -0.0170
