In [None]:
import logging, traceback
import os

import pyspark
import requests
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, IntegerType, DateType, StructField, StringType, TimestampType

In [None]:
# Base address that the dataset URLs in this notebook are built from.
URL_PREFIX = 'https://noaa-ghcn-pds.s3.amazonaws.com'
# Local directory that downloaded files are written to. Set the
# GHCN_TEMP_STORAGE_PATH environment variable to run the notebook on another
# machine; when it is unset or empty the original location is used.
TEMP_STORAGE_PATH = os.environ.get('GHCN_TEMP_STORAGE_PATH') or '/home/marcos/ghcn-d/spark/data'

In [None]:
# Obtain the Spark session used by every cell below: an already running
# session is reused, otherwise a local-mode one named 'test' is started.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [None]:
def download_file(url, local_file_path, timeout=60, chunk_size=8192):
    """Stream the resource at ``url`` to ``local_file_path`` and return that path.

    The response body is written to a sibling ``.part`` file and renamed into
    place only once it has been received completely, so an interrupted
    download never leaves a truncated file at ``local_file_path``.

    Args:
        url: Address of the resource to fetch with an HTTP GET.
        local_file_path: Destination path; a missing parent directory is created.
        timeout: Seconds forwarded to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.
        chunk_size: Number of bytes read from the response per iteration.

    Returns:
        ``local_file_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the destination cannot be created or written.
    """
    parent_dir = os.path.dirname(local_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f'{local_file_path}.part'
    try:
        # stream=True keeps the body out of memory; it is consumed chunk by chunk.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Atomic rename: the destination only ever holds a complete download.
        os.replace(partial_path, local_file_path)
    finally:
        # After a successful rename the partial file is gone; on any failure
        # it is removed here so no half-written data is left behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return local_file_path

In [None]:
# Year of data to process; the remote URL and the local path both derive from it.
year = 2020
# The leading slash lets the name be appended directly to a URL or directory prefix.
csv_file_name = '/' + str(year) + '.csv'
dataset_url = f'{URL_PREFIX}/csv{csv_file_name}'
csv_file_path = f'{TEMP_STORAGE_PATH}{csv_file_name}'

In [None]:
# Fetch the selected year's CSV into local storage; the cell displays the saved path.
download_file(url=dataset_url, local_file_path=csv_file_path)

In [None]:
# Explicit column layout for the CSV, listed in file order; every field is nullable.
# NOTE(review): obs_time is parsed as an integer, so a value written as 0700
# would lose its leading zero - confirm that this is acceptable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", StringType()),
        ("date", IntegerType()),
        ("element", StringType()),
        ("value", IntegerType()),
        ("m_flag", StringType()),
        ("q_flag", StringType()),
        ("s_flag", StringType()),
        ("obs_time", IntegerType()),
    )
])

In [None]:
# Load the downloaded CSV with the explicit schema; the file is read without a header row.
df = spark.read.csv(csv_file_path, schema=schema, header=False)

In [None]:
# Replace the integer date column with a real date, parsed from its
# string form using the yyyyMMdd pattern.
df = df.withColumn(
    "date",
    F.to_date(F.col("date").cast("string"), "yyyyMMdd"),
)

In [None]:
def _element_value(element, divisor=None, valid_range=None):
    """Build a column holding the reading of one element type, null elsewhere.

    Args:
        element: Value of the ``element`` column to select (e.g. ``"TMAX"``).
        divisor: When given, the reading is divided by it after the cast to double.
        valid_range: Optional inclusive ``(low, high)`` bounds on the raw
            ``value``; readings outside them become null.

    Returns:
        A Column that is the (optionally scaled) double ``value`` on matching
        rows and SQL NULL on every other row.
    """
    matches = F.col("element") == element
    if valid_range is not None:
        low, high = valid_range
        matches = matches & F.col("value").between(low, high)
    reading = F.col("value").cast("double")
    if divisor is not None:
        reading = reading / divisor
    # when() without otherwise() yields a genuine NULL for non-matching rows.
    # The previous otherwise("None") injected the text "None" instead, which
    # made the averages depend on implicit string-to-number casts.
    return F.when(matches, reading)


# Raw temperature readings outside this inclusive range are discarded and the
# rest divided by 10 (presumably tenths of a degree - confirm against the
# dataset documentation).
TEMPERATURE_RANGE = (-700, 700)

# One row per (id, date): each element type is spread into its own column and
# averaged. The result is bound to its own name because show() returns None;
# assigning the chain ending in show() back to df used to wipe out the frame.
daily_df = (
    df
    .drop("q_flag")
    .withColumn("tmax", _element_value("TMAX", divisor=10, valid_range=TEMPERATURE_RANGE))
    .withColumn("tmin", _element_value("TMIN", divisor=10, valid_range=TEMPERATURE_RANGE))
    .withColumn("prcp", _element_value("PRCP"))
    .withColumn("snow", _element_value("SNOW"))
    .withColumn("snwd", _element_value("SNWD"))
    .groupBy("id", "date")
    .agg(
        # Aliases keep the plain column names instead of e.g. "avg(tmax)".
        F.avg("tmax").alias("tmax"),
        F.avg("tmin").alias("tmin"),
        F.avg("prcp").alias("prcp"),
        F.avg("snow").alias("snow"),
        F.avg("snwd").alias("snwd"),
        F.first("m_flag").alias("m_flag"),
        F.first("s_flag").alias("s_flag"),
    )
)
daily_df.show()
   