In [1]:
import logging, traceback
import os

import pyspark
import requests
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, IntegerType, DateType, StructField, StringType, TimestampType

In [2]:
# Base URL of the public S3 bucket the yearly CSV files are downloaded from.
URL_PREFIX = 'https://noaa-ghcn-pds.s3.amazonaws.com'
# Local directory where downloaded CSV files are staged before Spark reads them.
# Set the GHCN_TEMP_STORAGE_PATH environment variable to relocate it; when the
# variable is unset the original location is used.
TEMP_STORAGE_PATH = os.environ.get('GHCN_TEMP_STORAGE_PATH', '/home/marcos/ghcn-d/spark/data')

In [3]:
# Local-mode Spark session named 'test'; getOrCreate() hands back an already
# running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/04 10:24:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def download_file(url, local_file_path, timeout=60):
    """Stream the resource at `url` into `local_file_path` and return that path.

    The body is written to a sibling ``<local_file_path>.part`` file and only
    renamed onto the final path once the whole response has been received, so
    an interrupted download never leaves a truncated file where a reader
    expects a complete one.

    Args:
        url: HTTP(S) URL to fetch.
        local_file_path: Destination path; its directory must already exist.
        timeout: Passed to ``requests.get``. Seconds to wait for the connection
            and for each read from the socket; without it a stalled server
            would block the call indefinitely.

    Returns:
        `local_file_path`, unchanged.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the local file cannot be written or renamed.
    """
    partial_path = f'{local_file_path}.part'
    try:
        # stream=True keeps the body out of memory; it is consumed chunk by chunk.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # Skip empty chunks so nothing pointless is written.
                    if chunk:
                        f.write(chunk)
        # Atomic on the same filesystem: readers see the old file or the new one.
        os.replace(partial_path, local_file_path)
    except BaseException:
        # Drop the incomplete download, then let the original error propagate.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
        raise
    return local_file_path

In [52]:
# Year to process. The same '/<year>.csv' suffix is appended to the remote
# '/csv' prefix and to the local staging directory.
year = 1888
csv_file_name = '/' + str(year) + '.csv'
dataset_url = f'{URL_PREFIX}/csv{csv_file_name}'
csv_file_path = f'{TEMP_STORAGE_PATH}{csv_file_name}'

In [53]:
# Fetch the yearly CSV into the staging directory; the returned path is the cell output.
download_file(url=dataset_url, local_file_path=csv_file_path)

'/home/marcos/ghcn-d/spark/data/1888.csv'

In [54]:
# Station lookup table: drop the columns that are not carried into the result,
# rename `name`/`id` so they do not collide with observation or country
# columns, and derive the country code from the station id.
df_stations = (
    spark.read.parquet('./data/ghcnd-stations.parquet')
    .drop('state', 'gsn_flag', 'hcn_crn_flag', 'wmo_id')
    .withColumnRenamed('name', 'station_name')
    .withColumnRenamed('id', 'station_id')
    # Spark substring positions are 1-based: take the first two characters.
    .withColumn('country_code', F.substring('station_id', 1, 2))
)

# Country lookup table; `name` is renamed so it stays distinguishable from the
# station name after the join.
df_countries = (
    spark.read.parquet('./data/ghcnd-countries.parquet')
    .withColumnRenamed('name', 'country_name')
)

In [55]:
# Explicit schema for the yearly CSV, which is read without a header row.
# Columns are listed in file order and every one of them is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", StringType()),
        ("date", IntegerType()),
        ("element", StringType()),
        ("value", IntegerType()),
        ("m_flag", StringType()),
        ("q_flag", StringType()),
        ("s_flag", StringType()),
        ("obs_time", IntegerType()),
    )
])

In [56]:
# Load the downloaded CSV with the explicit schema; the file has no header row.
df = spark.read.csv(csv_file_path, schema=schema, header=False)

In [57]:
# Parse the integer yyyymmdd `date` column into a date column.
# The dtype guard makes the cell safe to re-run: an already parsed date casts
# to the string "yyyy-MM-dd", which no longer matches the "yyyyMMdd" pattern.
if dict(df.dtypes)["date"] != "date":
    df = df.withColumn("date", F.to_date(F.col("date").cast("string"), "yyyyMMdd"))

In [58]:
print(f'processing year {year}...')

# `date` is expected to be a date column at this point. When the input is the
# CSV file it is converted in an earlier cell; per the original note, input
# read from BQ already arrives converted.


def _temperature(element_code):
    """Column holding `value` / 10 on rows of `element_code`, null elsewhere.

    Readings outside [-700, 700] are discarded as null (presumably tenths of
    a degree Celsius, i.e. +/-70 degrees - confirm against the dataset docs).
    A real null is produced for non-matching rows, so the column is a double.
    """
    value = F.col("value")
    is_valid_reading = (
        (F.col("element") == element_code) & (value >= -700) & (value <= 700)
    )
    # No otherwise(): rows that do not satisfy the condition become null.
    return F.when(is_valid_reading, value.cast("double") / 10)


def _measurement(element_code):
    """Column holding `value` as a double on rows of `element_code`, null elsewhere."""
    return F.when(F.col("element") == element_code, F.col("value").cast("double"))


# One row per observation: spread `value` into one column per element type.
df = (
    df
    .drop("q_flag")
    .withColumn("tmax", _temperature("TMAX"))
    .withColumn("tmin", _temperature("TMIN"))
    .withColumn("prcp", _measurement("PRCP"))
    .withColumn("snow", _measurement("SNOW"))
    .withColumn("snwd", _measurement("SNWD"))
)

# Dates are truncated to the first day of their year before grouping, so each
# output row is one station-year. avg() ignores the nulls produced above.
# first() picks whichever flag the group yields first; it is not a summary.
df_daily = (
    df
    .withColumn("date", F.trunc("date", "year"))
    .groupBy("id", "date")
    .agg(
        # Aliases keep "()" out of the column names (BQ rejects such names).
        F.avg("tmax").alias("tmax"),
        F.avg("tmin").alias("tmin"),
        F.avg("prcp").alias("prcp"),
        F.avg("snow").alias("snow"),
        F.avg("snwd").alias("snwd"),
        F.first("m_flag").alias("m_flag"),
        F.first("s_flag").alias("s_flag"),
    )
    .join(df_stations, df.id == df_stations.station_id, "inner")
    .join(df_countries, df_stations.country_code == df_countries.code, "inner")
    .drop('station_id', 'code')
    # Note: toDF must stay after the joins, otherwise the joins raise an error.
    # It renames positionally, so the list must match the column order above.
    .toDF('id','date','tmax','tmin','prcp','snow','snwd','m_flag','s_flag','latitude','longitude','elevation','station_name','country_code','country_name')
)
 


processing year 1888...


In [59]:
# Preview the first 20 rows, with long cell values truncated (the defaults).
df_daily.show(n=20, truncate=True)



+-----------+----------+------------------+-------------------+------------------+----+----+------+------+--------+---------+---------+--------------------+------------+------------+
|         id|      date|              tmax|               tmin|              prcp|snow|snwd|m_flag|s_flag|latitude|longitude|elevation|        station_name|country_code|country_name|
+-----------+----------+------------------+-------------------+------------------+----+----+------+------+--------+---------+---------+--------------------+------------+------------+
|AGE00135039|1888-01-01| 22.00986301369863| 13.602203856749313|12.603825136612022|null|null|  null|     E| 35.7297|     0.65|     50.0|ORAN-HOPITAL MILI...|          AG|     Algeria|
|AGE00147705|1888-01-01|22.432417582417592| 14.934615384615386|22.713498622589533|null|null|  null|     E|   36.78|     3.07|     59.0|ALGIERS-VILLE/UNI...|          AG|     Algeria|
|AGE00147708|1888-01-01|23.526315789473685| 11.220221606648199|19.390581717451525|nul

                                                                                

In [60]:
# Display the resulting schema to check the column names and types of the
# aggregated frame.
df_daily.schema

StructType(List(StructField(id,StringType,true),StructField(date,DateType,true),StructField(tmax,DoubleType,true),StructField(tmin,DoubleType,true),StructField(prcp,DoubleType,true),StructField(snow,DoubleType,true),StructField(snwd,DoubleType,true),StructField(m_flag,StringType,true),StructField(s_flag,StringType,true),StructField(latitude,DoubleType,true),StructField(longitude,DoubleType,true),StructField(elevation,DoubleType,true),StructField(station_name,StringType,true),StructField(country_code,StringType,true),StructField(country_name,StringType,true)))