In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, TimestampType, DoubleType, IntegerType, DateType
import pyspark.sql.functions as f
import os
from custom_utils import *
import math

In [None]:
# Build (or reuse) the Spark session for this notebook. The MongoDB
# connector's input and output URIs both point at `dic.weather` on the
# local MongoDB instance, and the connector itself is requested through
# `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("merge_weather_and_price_data-notebook")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [None]:
# Explicit schema for the preprocessed price CSV, so Spark does not have to
# infer column types. Every column is declared nullable.
price_schema = (
    StructType()
    .add("dateTime", TimestampType(), True)
    .add("station_uuid", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("date", DateType(), True)
    .add("hour", IntegerType(), True)
    .add("weekday", IntegerType(), True)
    .add("deviation", DoubleType(), True)
)

In [None]:
# Read the preprocessed price data as CSV with a header row, applying the
# explicit `price_schema`.
# NOTE(review): `project_base_dir` is not defined in the visible cells; it
# presumably comes from the `custom_utils` star import -- confirm.
price_data = (
    spark.read
    .format("csv")
    .schema(price_schema)
    .option("header", True)
    .load(os.path.join(project_base_dir, "outputs/preprocessed_price_data.csv"))
)

In [None]:
# Load the weather records through the MongoDB Spark connector (the source
# URI is taken from the session configuration) and convert `date` to a
# date-typed column. The conversion goes through a temporary `date2` column
# that is renamed back after the original `date` and Mongo's `_id` are
# dropped, so the converted `date` ends up as the last column.
weather_data = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .withColumn("date2", f.to_date("date"))
    .drop("_id")
    .drop("date")
    .withColumnRenamed("date2", "date")
)

In [None]:
# Quick inspection of the weather data: first 10 rows, the column
# name/type pairs, and the total row count (each call triggers a Spark job).
weather_data.show(n=10)
print(weather_data.dtypes)
print(weather_data.count())

In [None]:
# Inner-join prices with weather on station, calendar date and hour; rows
# without a match on either side are dropped.
joined_data = price_data.join(
    weather_data,
    on=["station_uuid", "date", "hour"],
    how="inner",
)

In [None]:
# Inspect the join result: first 10 rows and the number of joined rows.
joined_data.show(n=10)
print(joined_data.count())

In [None]:
# Cyclical (sin/cos) encoding of the time features: `hour` is mapped onto a
# circle with period 24 and `weekday` onto one with period 7.
# The arithmetic order (2 * pi * col / period) is kept so the resulting
# Spark expressions are unchanged.
hour_angle = 2 * math.pi * f.col("hour") / 24
weekday_angle = 2 * math.pi * f.col("weekday") / 7

# Append the four encoded columns, then drop the raw time columns and the
# coordinates so they are not part of the output.
cyclical_encoded_data = (
    joined_data
    .withColumn("hour_sin", f.sin(hour_angle))
    .withColumn("hour_cos", f.cos(hour_angle))
    .withColumn("weekday_sin", f.sin(weekday_angle))
    .withColumn("weekday_cos", f.cos(weekday_angle))
    .drop("hour", "weekday", "date", "dateTime", "latitude", "longitude")
)

In [None]:
# Preview the first 10 rows of the encoded data set.
cyclical_encoded_data.show(n=10)

In [None]:
# Persist the encoded data as CSV with a header row. Spark writes a
# directory of part files at the given path.
# The save mode is set to "overwrite" explicitly: the writer's default mode
# raises an error when the target path already exists, which made every
# re-run of this notebook fail at this cell once the output had been
# produced.
# NOTE(review): `project_base_dir` is not defined in the visible cells; it
# presumably comes from the `custom_utils` star import -- confirm.
(
    cyclical_encoded_data.write
    .mode("overwrite")
    .option("header", True)
    .csv(os.path.join(project_base_dir, "outputs/training_data.csv"))
)