In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, TimestampType, DoubleType, IntegerType, DateType
import pyspark.sql.functions as f
import os
from custom_utils import *
import math

In [2]:
# Build (or reuse) the Spark session for this notebook. The MongoDB Spark
# connector is requested through spark.jars.packages, and the read and write
# URIs are set to the same MongoDB address.
spark = (
    SparkSession.builder
    .appName("merge_weather_and_price_data-notebook")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

23/10/21 12:27:12 WARN Utils: Your hostname, DIC resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/10/21 12:27:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/vboxuser/anaconda3/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vboxuser/.ivy2/cache
The jars for the packages stored in: /home/vboxuser/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-41ce6a1c-ced9-484c-af40-bd3cd6ca9258;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 172ms :: artifacts dl 9ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifa

In [3]:
# Explicit schema for the preprocessed price CSV. Every field is nullable,
# which is the default of StructType.add, so the flag is not repeated per field.
price_schema = (
    StructType()
    .add("dateTime", TimestampType())
    .add("station_uuid", StringType())
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add("date", DateType())
    .add("hour", IntegerType())
    .add("weekday", IntegerType())
    .add("deviation", DoubleType())
)

In [4]:
# Load the preprocessed price CSV (first row is a header) using the explicit
# schema rather than letting Spark infer column types.
price_data = spark.read.csv(
    os.path.join(project_base_dir, "outputs/preprocessed_price_data.csv"),
    schema=price_schema,
    header=True,
)

In [5]:
# Read the weather collection through the MongoDB connector (the URI comes
# from the session config). The "date" column is converted with to_date via a
# temporary "date2" column, so the converted column ends up last; the Mongo
# "_id" column is dropped.
# NOTE(review): presumably the conversion is there so "date" matches the
# DateType of the price data for the join -- confirm.
weather_data = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .withColumn("date2", f.to_date(f.col("date")))
    .drop("_id", "date")
    .withColumnRenamed("date2", "date")
)

In [6]:
# Optional sanity check of the weather frame: first rows, column dtypes, and
# the row count (count() triggers a full read of the source).
if display_intermediate_dataframes:
    weather_data.show(n=10)
    print(weather_data.dtypes, weather_data.count(), sep="\n")

In [7]:
# Inner-join prices with weather; only rows with matching station, date and
# hour on both sides are kept.
joined_data = price_data.join(
    weather_data,
    on=["station_uuid", "date", "hour"],
    how="inner",
)

In [8]:
# Optional sanity check of the join result: first rows and the row count
# (count() executes the join).
if display_intermediate_dataframes:
    joined_data.show(n=10)
    print(
        joined_data.count()
    )

In [9]:
# Cyclical encoding: map each periodic feature onto the unit circle so that
# the last value of a cycle is adjacent to the first (period 24 for "hour",
# period 7 for "weekday").
hour_angle = 2 * math.pi * f.col("hour") / 24
weekday_angle = 2 * math.pi * f.col("weekday") / 7

# Add the sin/cos pairs, then drop the raw timestamp and coordinate columns.
cyclical_encoded_data = (
    joined_data
    .withColumn("hour_sin", f.sin(hour_angle))
    .withColumn("hour_cos", f.cos(hour_angle))
    .withColumn("weekday_sin", f.sin(weekday_angle))
    .withColumn("weekday_cos", f.cos(weekday_angle))
    .drop("dateTime", "latitude", "longitude")
)

In [10]:
# Optional preview of the encoded training frame.
if display_intermediate_dataframes:
    cyclical_encoded_data.show(n=10)

In [11]:
# Persist the training set as CSV with a header row.
# mode("overwrite") replaces any previous export. Without it Spark uses its
# default save mode, which raises an error when the target path already
# exists, so this cell failed on every re-run of the notebook.
(
    cyclical_encoded_data.write
    .mode("overwrite")
    .option("header", True)
    .csv(os.path.join(project_base_dir, "outputs/training_data.csv"))
)

                                                                                