In [1]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GeneralizedLinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, DateType
import pyspark.sql.functions as f
import os
from custom_utils import project_base_dir

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ml_training-notebook")
    .getOrCreate()
)

23/10/21 11:37:08 WARN Utils: Your hostname, DIC resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/10/21 11:37:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/21 11:37:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/21 11:37:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/21 11:37:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/10/21 11:37:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# Explicit schema for the training CSV. The third argument marks every
# column as nullable.
training_schema = (
    StructType()
    # Identification / time columns
    .add("station_uuid", StringType(), True)
    .add("date", DateType(), True)
    .add("hour", IntegerType(), True)
    # Regression label
    .add("deviation", DoubleType(), True)
    # Weather features
    .add("cloudcover", IntegerType(), True)
    .add("rain", DoubleType(), True)
    .add("temperature_2m", DoubleType(), True)
    # Sine/cosine encoded hour and weekday features
    .add("hour_sin", DoubleType(), True)
    .add("hour_cos", DoubleType(), True)
    .add("weekday_sin", DoubleType(), True)
    .add("weekday_cos", DoubleType(), True)
)

In [4]:
# Read the prepared training set. The explicit schema is applied rather than
# letting Spark infer column types, and the CSV is read with the header option on.
training_dataframe = (
    spark.read
    .format("csv")
    .schema(training_schema)
    .option("header", True)
    .load(os.path.join(project_base_dir, "outputs/training_data.csv"))
)

In [5]:
# Columns used as model inputs; "deviation" is kept separately as the label.
input_columns = [
    "cloudcover",
    "rain",
    "temperature_2m",
    "hour_sin",
    "hour_cos",
    "weekday_sin",
    "weekday_cos",
]
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")

# Pack the input columns into a single "features" vector column and keep only
# the two columns the regressors need.
data = assembler.transform(training_dataframe)
final_data = data.select("features", "deviation")

# 90% / 10% train/test split with a fixed seed.
train_data, test_data = final_data.randomSplit([0.9, 0.1], seed=42)

In [6]:
def evaluate_predictions(predictions, label_col="deviation", prediction_col="predicted_deviation"):
    """Print RMSE and R-squared for a set of predictions and return both values.

    Args:
        predictions: Dataset containing the label and prediction columns; it
            is passed unchanged to ``RegressionEvaluator.evaluate``.
        label_col: Name of the ground-truth column. Defaults to the column
            this notebook trains on.
        prediction_col: Name of the predicted-value column. Defaults to the
            prediction column configured on the notebook's regressors.

    Returns:
        Tuple ``(rmse, r2)`` so callers can compare models programmatically
        instead of reading the printed text.
    """
    def _score(metric_name):
        # One evaluator per metric, each bound to the same pair of columns.
        evaluator = RegressionEvaluator(
            labelCol=label_col,
            predictionCol=prediction_col,
            metricName=metric_name,
        )
        return evaluator.evaluate(predictions)

    rmse = _score("rmse")
    print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

    r2 = _score("r2")
    print("R-squared (R2) on test data: {:.3f}".format(r2))

    return rmse, r2

In [7]:
def train_and_evaluate_model(model, training_data, test_data):
    """Fit ``model``, report its metrics on ``test_data`` and return the fit.

    Args:
        model: Unfitted estimator exposing ``fit``.
        training_data: Data passed to ``model.fit``.
        test_data: Data passed to the fitted model's ``transform``; the result
            is handed to ``evaluate_predictions``.

    Returns:
        The fitted model returned by ``model.fit``.
    """
    fitted = model.fit(training_data)
    evaluate_predictions(fitted.transform(test_data))
    return fitted

In [8]:
def show_lr_feature_importance(feature_importances, input_columns):
    """Print one line per feature, ordered by descending absolute weight.

    The signed value is printed; the absolute value is used only for ordering.
    Features with equal absolute weight keep their input order.

    Args:
        feature_importances: Iterable of numeric weights, one per feature
            (the notebook passes model coefficients or tree importances).
        input_columns: Feature names, in the same order as the weights.

    Raises:
        ValueError: If the number of weights differs from the number of
            names. Without this check ``zip`` would silently drop the surplus
            entries and print an incomplete table.
    """
    # Materialise once so one-shot iterators are handled and lengths can be checked.
    weights = list(feature_importances)
    names = list(input_columns)
    if len(weights) != len(names):
        raise ValueError(
            "Got {} importance values for {} input columns".format(len(weights), len(names))
        )

    ranked = sorted(zip(names, weights), key=lambda pair: abs(pair[1]), reverse=True)

    print("Feature Importance:")
    for name, weight in ranked:
        print("  {}: {:.5f}".format(name, weight))

### Linear regressor

In [9]:
# Linear regression on the assembled feature vector; only the column names
# are set, every other parameter is left at its default.
lr = LinearRegression(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
)

In [10]:
# Fit on the training split, print the test-split metrics, keep the fitted model.
lr_model = train_and_evaluate_model(
    model=lr,
    training_data=train_data,
    test_data=test_data,
)

23/10/21 11:37:15 WARN Instrumentation: [f3aa1f0a] regParam is zero, which might cause numerical instability and overfitting.
23/10/21 11:37:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/10/21 11:37:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

Root Mean Squared Error (RMSE) on test data: 0.046
R-squared (R2) on test data: 0.349


In [11]:
# List the fitted coefficients, largest absolute value first.
# NOTE(review): the features are assembled without any scaling step, so the
# coefficient magnitudes also reflect each column's units — confirm before
# reading this ordering as feature importance.
show_lr_feature_importance(
    feature_importances=lr_model.coefficients,
    input_columns=input_columns,
)

Feature Importance:
  hour_sin: 0.04551
  hour_cos: -0.00566
  weekday_sin: -0.00055
  rain: -0.00030
  weekday_cos: -0.00019
  temperature_2m: 0.00009
  cloudcover: 0.00001


### Decision tree regressor

In [12]:
# Single decision tree on the same feature vector; only the column names are
# set, every other parameter is left at its default.
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
)

In [13]:
# Fit on the training split, print the test-split metrics, keep the fitted model.
dtr_model = train_and_evaluate_model(
    model=dtr,
    training_data=train_data,
    test_data=test_data,
)



Root Mean Squared Error (RMSE) on test data: 0.039
R-squared (R2) on test data: 0.522


In [14]:
# The helper is reused for the tree: its featureImportances vector is
# converted with toArray() and listed largest first.
show_lr_feature_importance(
    feature_importances=dtr_model.featureImportances.toArray(),
    input_columns=input_columns,
)

Feature Importance:
  hour_sin: 0.75694
  temperature_2m: 0.13485
  hour_cos: 0.10451
  cloudcover: 0.00320
  weekday_cos: 0.00049
  rain: 0.00000
  weekday_sin: 0.00000


### Random forest regressor

In [15]:
# Random forest on the same feature vector. The forest is a randomised
# learner, so the seed is pinned (same value as the train/test split) to keep
# the reported metrics and importances repeatable across kernel restarts.
rfr = RandomForestRegressor(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
    seed=42,
)

In [16]:
# Fit on the training split, print the test-split metrics, keep the fitted model.
rfr_model = train_and_evaluate_model(
    model=rfr,
    training_data=train_data,
    test_data=test_data,
)



Root Mean Squared Error (RMSE) on test data: 0.040
R-squared (R2) on test data: 0.501


In [17]:
# The helper is reused for the forest: its featureImportances vector is
# converted with toArray() and listed largest first.
show_lr_feature_importance(
    feature_importances=rfr_model.featureImportances.toArray(),
    input_columns=input_columns,
)

Feature Importance:
  hour_sin: 0.67903
  hour_cos: 0.21086
  temperature_2m: 0.10058
  cloudcover: 0.00544
  rain: 0.00213
  weekday_sin: 0.00112
  weekday_cos: 0.00085


### Generalised linear regression

In [18]:
# Generalised linear model: Gaussian family with identity link, capped at
# 10 iterations and regularised with regParam=0.3.
# NOTE(review): the recorded R2 for this model (0.111) is well below the
# LinearRegression cell (0.349); the regParam value is a likely cause — confirm.
glm = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="deviation",
    predictionCol="predicted_deviation",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)

In [19]:
# Fit on the training split, print the test-split metrics, keep the fitted model.
glm_model = train_and_evaluate_model(
    model=glm,
    training_data=train_data,
    test_data=test_data,
)

                                                                                

Root Mean Squared Error (RMSE) on test data: 0.053
R-squared (R2) on test data: 0.111


In [20]:
# List the fitted GLM coefficients, largest absolute value first.
# NOTE(review): the features are assembled without any scaling step, so the
# coefficient magnitudes also reflect each column's units — confirm before
# reading this ordering as feature importance.
show_lr_feature_importance(
    feature_importances=glm_model.coefficients,
    input_columns=input_columns,
)

Feature Importance:
  hour_sin: 0.00729
  hour_cos: -0.00300
  rain: 0.00020
  weekday_sin: -0.00007
  temperature_2m: -0.00003
  cloudcover: 0.00001
  weekday_cos: -0.00000
