In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType
import pyspark.sql.functions as f
import os
from custom_utils import *
import requests
import json
from io import StringIO
import pandas as pd
from calendar import monthrange

In [None]:
# Build (or reuse) the Spark session for this notebook. Both the MongoDB
# input and output URIs point at the `dic.weather` namespace on a local
# MongoDB instance, and `spark.jars.packages` requests the MongoDB Spark
# connector (Scala 2.12 build, version 3.0.1).
spark = (
    SparkSession.builder
    .appName("load_weather_data-notebook")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/dic.weather")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [None]:
# Explicit schema for the stations CSV: a station id, its coordinates and
# its city. Every column is declared nullable (third argument to `add`).
stations_schema = (
    StructType()
    .add("station_uuid", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("city", StringType(), True)
)

In [None]:
# Read the selected stations from CSV using the explicit schema.
# `project_base_dir` is not defined in this notebook; presumably it is
# provided by `from custom_utils import *`.
# NOTE(review): no `header` option is set, so this assumes the file has no
# header line - confirm against the CSV.
stations_data = (
    spark.read
    .format("csv")
    .schema(stations_schema)
    .load(os.path.join(project_base_dir, "outputs/selected_stations_unique.csv"))
)

In [None]:
# Convert the Spark DataFrame of stations to a pandas DataFrame so the
# stations can be iterated row by row when calling the weather API.
stations_df = stations_data.toPandas()

In [None]:
def retrieve_weather_data(latitude, longitude, start_date, end_date, timeout=30):
    """Fetch hourly weather data for one location from the Open-Meteo archive API.

    Requests the hourly variables ``temperature_2m``, ``rain`` and
    ``cloudcover`` for the period from ``start_date`` to ``end_date``.

    Args:
        latitude: Latitude of the location, interpolated into the query string.
        longitude: Longitude of the location, interpolated into the query string.
        start_date: First day to request, as a ``YYYY-MM-DD`` string.
        end_date: Last day to request, as a ``YYYY-MM-DD`` string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The ``"hourly"`` section of the decoded JSON response.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with an HTTP error status.
        KeyError: If the decoded response contains no ``"hourly"`` section.
    """
    url = f"https://archive-api.open-meteo.com/v1/archive?latitude={latitude}&longitude={longitude}&start_date={start_date}&end_date={end_date}&hourly=temperature_2m,rain,cloudcover"
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of with an opaque KeyError further down.
    response.raise_for_status()
    payload = response.json()
    try:
        return payload["hourly"]
    except KeyError:
        # Error payloads presumably carry a "reason" field - surface it if present.
        reason = payload.get("reason", "no reason given")
        raise KeyError(
            f"Open-Meteo response for ({latitude}, {longitude}) has no 'hourly' section: {reason}"
        ) from None

In [None]:
def weather_data_to_df(uuid, weather_data):
    """Turn one station's hourly weather mapping into a pandas DataFrame.

    Args:
        uuid: Station identifier stored in every row as ``station_uuid``.
        weather_data: Mapping of column name to a list of hourly values. It
            must contain a ``"time"`` entry whose strings follow the
            ``%Y-%m-%dT%H:%M`` format.

    Returns:
        A DataFrame holding the original value columns (without ``time``)
        followed by ``station_uuid``, ``date`` and ``hour``.
    """
    frame = pd.DataFrame.from_dict(weather_data)
    frame["station_uuid"] = uuid
    # Take the raw timestamp column out of the frame and split it into a
    # calendar date and an hour of day.
    timestamps = pd.to_datetime(frame.pop("time"), format="%Y-%m-%dT%H:%M")
    frame["date"] = timestamps.dt.date
    frame["hour"] = timestamps.dt.hour
    return frame

In [None]:
def create_date_strings(year, start_month, end_month):
    """Build the first and last day of a month range as ``YYYY-MM-DD`` strings.

    Args:
        year: Calendar year shared by both dates.
        start_month: Month number of the first date; its day is always ``01``.
        end_month: Month number of the last date; its day is the final day
            of that month in ``year``.

    Returns:
        A ``(start_date, end_date)`` tuple of strings.
    """
    first_day = f"{year}-{start_month:02d}-01"
    end_prefix = f"{year}-{end_month:02d}"
    # monthrange returns (weekday of the 1st, number of days in the month).
    days_in_end_month = monthrange(year, end_month)[1]
    return first_day, f"{end_prefix}-{days_in_end_month:02d}"

In [None]:
def create_dataframe(pd_dfs):
    """Concatenate pandas DataFrames and convert the result to a Spark DataFrame.

    Args:
        pd_dfs: Iterable of pandas DataFrames to stack on top of each other.

    Returns:
        A Spark DataFrame created from the concatenated pandas frame via the
        notebook-level ``spark`` session.
    """
    return spark.createDataFrame(pd.concat(pd_dfs))

In [None]:
def get_weather_dataframe(stations):
    """Download weather data for every station and combine it into one Spark DataFrame.

    The date range spans the first day of ``min(months)`` to the last day of
    ``max(months)`` in ``year``. Both ``year`` and ``months`` are read from
    the notebook namespace; they are not defined in this notebook and are
    presumably provided by ``from custom_utils import *``.

    Args:
        stations: pandas DataFrame with ``station_uuid``, ``latitude`` and
            ``longitude`` columns; one API request is made per row.

    Returns:
        The Spark DataFrame produced by ``create_dataframe`` from all
        per-station frames.
    """
    first_day, last_day = create_date_strings(year, min(months), max(months))

    def _station_frame(station):
        # Fetch the hourly values first, then tag them with the station id.
        hourly = retrieve_weather_data(station["latitude"], station["longitude"], first_day, last_day)
        return weather_data_to_df(station["station_uuid"], hourly)

    return create_dataframe([_station_frame(station) for _, station in stations.iterrows()])

In [None]:
# Fetch the weather data for all stations (one API request per station row)
# and hold the combined result as a Spark DataFrame.
weather_data_per_station = get_weather_dataframe(stations_df)

In [None]:
# Sanity check of the combined data: preview the first 10 rows, then print
# the total number of rows and the (column name, type) pairs.
weather_data_per_station.show(10)
print(weather_data_per_station.count())
print(weather_data_per_station.dtypes)

In [None]:
# Persist the weather rows through the MongoDB Spark connector. No options
# are passed to save(), so the destination is presumably taken from the
# session's `spark.mongodb.output.uri` setting.
# NOTE(review): mode("append") adds to existing data, so re-running this
# cell will likely store the same rows again - confirm this is intended.
(
    weather_data_per_station.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)