In [1]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GeneralizedLinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, TimestampType, DoubleType, IntegerType, DateType
import pyspark.sql.functions as f
import os
from custom_utils import project_base_dir

In [None]:
# Obtain the Spark session used by every cell below (getOrCreate returns the
# builder's session for the "ml_training-notebook" application).
spark = (
    SparkSession.builder
    .appName("ml_training-notebook")
    .getOrCreate()
)

In [3]:
# Explicit schema for the training CSV; every column is declared nullable.
training_schema = (
    StructType()
    .add("station_uuid", StringType(), nullable=True)
    .add("deviation", DoubleType(), nullable=True)
    .add("cloudcover", IntegerType(), nullable=True)
    .add("rain", DoubleType(), nullable=True)
    .add("temperature_2m", DoubleType(), nullable=True)
    .add("hour_sin", DoubleType(), nullable=True)
    .add("hour_cos", DoubleType(), nullable=True)
    .add("weekday_sin", DoubleType(), nullable=True)
    .add("weekday_cos", DoubleType(), nullable=True)
)

In [4]:
# Load the training CSV (first row is a header) using the explicit
# `training_schema` rather than letting Spark infer column types.
training_dataframe = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(training_schema)
    .load(os.path.join(project_base_dir, "outputs/training_data.csv"))
)

In [5]:
# Predictor columns that get packed into the single "features" vector column.
input_columns = [
    "cloudcover",
    "rain",
    "temperature_2m",
    "hour_sin",
    "hour_cos",
    "weekday_sin",
    "weekday_cos",
]
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")

# Keep only the assembled feature vector and the regression label.
data = assembler.transform(training_dataframe)
final_data = data.select("features", "deviation")

# 90/10 train/test split; the seed is fixed so the split is repeatable.
train_data, test_data = final_data.randomSplit([0.9, 0.1], seed=42)

In [6]:
def evaluate_predictions(predictions, label_col="deviation", prediction_col="predicted_deviation"):
    """Print RMSE and R-squared for a DataFrame of predictions.

    Args:
        predictions: DataFrame containing both the label column and the
            prediction column.
        label_col: Name of the ground-truth column. Defaults to the column
            name used throughout this notebook.
        prediction_col: Name of the model-output column. Defaults to the
            column name used throughout this notebook.

    Returns:
        None. Both metrics are printed, RMSE first and then R2.
    """
    # One (metric name, report line) pair per metric, so adding a metric is a
    # one-line change instead of another copy of the evaluator boilerplate.
    reports = (
        ("rmse", "Root Mean Squared Error (RMSE) on test data: {:.3f}"),
        ("r2", "R-squared (R2) on test data: {:.3f}"),
    )
    for metric_name, report_line in reports:
        evaluator = RegressionEvaluator(
            labelCol=label_col,
            predictionCol=prediction_col,
            metricName=metric_name,
        )
        print(report_line.format(evaluator.evaluate(predictions)))

In [7]:
def train_and_evaluate_model(model, training_data, test_data):
    """Fit ``model`` on ``training_data`` and report its metrics on ``test_data``.

    The fitted model transforms ``test_data`` and the resulting predictions
    are handed to ``evaluate_predictions``, which prints the metrics.

    Returns:
        The fitted model returned by ``model.fit``.
    """
    fitted_model = model.fit(training_data)
    evaluate_predictions(fitted_model.transform(test_data))
    return fitted_model

In [8]:
def show_lr_feature_importance(feature_importances, input_columns):
    """Print one signed weight per feature, largest absolute value first.

    Args:
        feature_importances: Iterable of numeric weights (the notebook passes
            both model coefficients and tree feature importances),
            positionally aligned with ``input_columns``.
        input_columns: Feature names, in the same order as the weights.

    Raises:
        ValueError: If the number of weights differs from the number of
            feature names.
    """
    names = list(input_columns)
    weights = list(feature_importances)
    # zip() would silently drop the unmatched tail, hiding a mismatch between
    # the model's feature vector and the column list; fail loudly instead.
    if len(names) != len(weights):
        raise ValueError(
            "Got {} feature importances for {} input columns".format(len(weights), len(names))
        )

    # Rank by magnitude but keep the sign for display; sorted() is stable, so
    # ties stay in their original column order.
    ranked = sorted(zip(names, weights), key=lambda pair: abs(pair[1]), reverse=True)

    print("Feature Importance:")
    for feature, weight in ranked:
        print("  {}: {:.5f}".format(feature, weight))

### Linear regressor

In [9]:
# Linear regression estimator with default hyperparameters; only the column
# names are configured.
lr = LinearRegression(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
)

In [10]:
# Fit the linear regression on the training split; RMSE and R2 on the test
# split are printed by the helper.
lr_model = train_and_evaluate_model(lr, train_data, test_data)

23/10/21 01:50:50 WARN Instrumentation: [e6820aa7] regParam is zero, which might cause numerical instability and overfitting.
23/10/21 01:50:53 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/10/21 01:50:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

Root Mean Squared Error (RMSE) on test data: 0.046
R-squared (R2) on test data: 0.350


[Stage 3:>                                                          (0 + 4) / 4]                                                                                

In [11]:
# Signed regression coefficients, ordered by absolute magnitude.
# NOTE(review): the inputs are assembled without any rescaling step in this
# notebook, so coefficient magnitudes of differently scaled features may not
# be directly comparable as "importance" — confirm before drawing conclusions.
show_lr_feature_importance(lr_model.coefficients, input_columns)

Feature Importance:
  hour_sin: 0.04549
  hour_cos: -0.00577
  weekday_sin: -0.00040
  weekday_cos: -0.00033
  rain: -0.00032
  temperature_2m: 0.00010
  cloudcover: 0.00001


### Decision tree regressor

In [12]:
# Decision tree estimator with default hyperparameters; only the column names
# are configured.
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
)

In [13]:
# Fit the decision tree on the training split; RMSE and R2 on the test split
# are printed by the helper.
dtr_model = train_and_evaluate_model(dtr, train_data, test_data)

                                                                                

Root Mean Squared Error (RMSE) on test data: 0.039
R-squared (R2) on test data: 0.523


In [14]:
# The tree's feature importances are converted to a plain array before being
# ranked and printed with the shared helper.
show_lr_feature_importance(dtr_model.featureImportances.toArray(), input_columns)

Feature Importance:
  hour_sin: 0.74776
  temperature_2m: 0.13620
  hour_cos: 0.11288
  cloudcover: 0.00190
  rain: 0.00087
  weekday_cos: 0.00038
  weekday_sin: 0.00000


### Random forest regressor

In [15]:
# Random forest estimator with default hyperparameters apart from the seed.
# The seed is pinned (same value as the train/test split) so that the forest's
# random sampling, and therefore the reported metrics, are repeatable across
# kernel restarts.
rfr = RandomForestRegressor(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
    seed=42,
)

In [16]:
# Fit the random forest on the training split; RMSE and R2 on the test split
# are printed by the helper.
rfr_model = train_and_evaluate_model(rfr, train_data, test_data)



Root Mean Squared Error (RMSE) on test data: 0.040
R-squared (R2) on test data: 0.510


In [17]:
# The forest's feature importances are converted to a plain array before being
# ranked and printed with the shared helper.
show_lr_feature_importance(rfr_model.featureImportances.toArray(), input_columns)

Feature Importance:
  hour_sin: 0.67783
  hour_cos: 0.21119
  temperature_2m: 0.09909
  cloudcover: 0.00904
  weekday_sin: 0.00157
  rain: 0.00088
  weekday_cos: 0.00039


### Generalised linear regression

In [18]:
# Generalised linear model: Gaussian family with identity link, regularised
# with regParam=0.3 and limited to 10 iterations.
glm = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)

In [19]:
# Fit the GLM on the training split; RMSE and R2 on the test split are printed
# by the helper.
glm_model = train_and_evaluate_model(glm, train_data, test_data)

                                                                                

Root Mean Squared Error (RMSE) on test data: 0.053
R-squared (R2) on test data: 0.112


In [20]:
# Signed GLM coefficients, ordered by absolute magnitude.
# NOTE(review): the model is fitted with regParam=0.3, which presumably shrinks
# these coefficients relative to the unregularised linear regression — confirm
# before comparing the two printouts.
show_lr_feature_importance(glm_model.coefficients, input_columns)

Feature Importance:
  hour_sin: 0.00729
  hour_cos: -0.00301
  rain: 0.00020
  weekday_sin: -0.00006
  temperature_2m: -0.00003
  weekday_cos: -0.00003
  cloudcover: 0.00001
