In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, IntegerType, DateType, StructField, StringType, TimestampType
import logging, traceback
import requests
import sys

In [2]:
"""
# Used when submitting job to spark master with parameters
start_year = int(sys.argv[1])
end_year = int(sys.argv[2])
"""
start_year = 2022
end_year = 2022

In [3]:
# Local-ingestion settings, only needed while developing: the base URL the
# yearly CSV files are fetched from and the directory they are staged in.
URL_PREFIX = "https://noaa-ghcn-pds.s3.amazonaws.com"
TEMP_STORAGE_PATH = "/home/marcos/ghcn-d/spark/data"

In [4]:
# Session against a local Spark master. The BigQuery connector and the GCS
# connector are requested as Maven packages through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .config(
        "spark.jars.packages",
        ",".join((
            "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2",
            "com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.0",
        )),
    )
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/marcos/.ivy2/cache
The jars for the packages stored in: /home/marcos/.ivy2/jars
com.google.cloud.spark#spark-bigquery-with-dependencies_2.12 added as a dependency
com.google.cloud.bigdataoss#gcs-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bd66f191-3b8e-4c44-a828-b298b464faa6;1.0
	confs: [default]
	found com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.24.2 in central
	found com.google.cloud.bigdataoss#gcs-connector;hadoop3-2.2.0 in central
	found com.google.api-client#google-api-client-jackson2;1.31.1 in central
	found com.google.api-client#google-api-client;1.31.1 in central
	found com.google.oauth-client#google-oauth-client;1.31.2 in central
	found com.google.http-client#google-http-client;1.38.0 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.2 in central
	foun

22/04/12 21:38:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Cloud Storage bucket the BigQuery connector uses for its temporary export
# data; registered once on the session configuration.
bucket = "ghcnd_raw"
spark.conf.set("temporaryGcsBucket", bucket)

In [6]:
# Hadoop-level settings for gs:// access, applied to the running context.
gcs_hadoop_conf = spark._jsc.hadoopConfiguration()
# File system implementation to use for gs:// URIs.
gcs_hadoop_conf.set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
# Must be 'true' when authenticating with a service account key file.
gcs_hadoop_conf.set('fs.gs.auth.service.account.enable', 'true')
gcs_hadoop_conf.set("google.cloud.auth.service.account.json.keyfile", "/home/marcos/.google/credentials/google_credentials.json")

In [7]:
# Used only when developing with local spark master
def download_file(url, local_file_path, timeout=60):
    """Stream the resource at `url` into `local_file_path`.

    The body is written in 8 KiB chunks so the whole file never has to fit
    in memory.

    Args:
        url: Address of the file to download.
        local_file_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up. Passed straight to `requests.get`; `None`
            restores the previous wait-forever behaviour.

    Returns:
        `local_file_path`, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    # stream=True defers fetching the body until iter_content is consumed.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_file_path

In [8]:
# Final column names, applied positionally to the aggregated and joined frame.
_FACT_COLUMNS = (
  'id', 'date', 'tmax', 'tmin', 'prcp', 'snow', 'snwd', 'm_flag', 's_flag',
  'latitude', 'longitude', 'elevation', 'station_name', 'country_code', 'country_name',
)


def _temperature_column(df, element):
  """Column holding `value / 10` for in-range rows of `element`, null elsewhere."""
  # Raw readings above 700 or below -700 are discarded (set to null).
  # NOTE(review): the division by 10 suggests `value` is stored in tenths of a
  # degree - confirm against the source table.
  is_valid = (df.element == element) & df.value.between(-700, 700)
  return F.when(is_valid, df.value.cast("double") / 10)


def _measurement_column(df, element):
  """Column holding `value` as a double for rows of `element`, null elsewhere."""
  return F.when(df.element == element, df.value.cast("double"))


def _aggregate_and_enrich(df_obs, df_stations, df_countries):
  """Average the measures per (id, date) and attach station and country columns."""
  return (
    df_obs
    .groupBy("id", "date")
    .agg(
      F.avg("tmax"),
      F.avg("tmin"),
      F.avg("prcp"),
      F.avg("snow"),
      F.avg("snwd"),
      # F.first keeps whichever flag Spark happens to see first in the group.
      F.first("m_flag"),
      F.first("s_flag"),
    )
    .join(df_stations, df_obs.id == df_stations.station_id, "inner")
    .join(df_countries, df_stations.country_code == df_countries.code, "inner")
    .drop('station_id', 'code')
    # Rename only after the joins: the join condition refers to columns of
    # df_obs, and renaming first makes the join raise an error.
    # The rename itself is needed because BigQuery does not accept field
    # names containing parentheses, such as the generated "avg(tmax)".
    .toDF(*_FACT_COLUMNS)
  )


def process_year(year, mode, df_stations, df_countries):
  """Build the daily and yearly fact tables for one year and write them to BigQuery.

  Reads the BigQuery table named after `year`, spreads the `element`/`value`
  pairs into tmax/tmin/prcp/snow/snwd columns, averages them per station and
  day (and per station and year), joins station and country attributes and
  saves both results.

  Args:
    year: Year to process; also the name of the source table.
    mode: Spark save mode used for both BigQuery writes.
    df_stations: Station frame with a `station_id` and a `country_code` column.
    df_countries: Country frame with a `code` column.
  """
  # Development alternative - read the raw CSV straight from the origin:
  #   csv_file_name = f'/{year}.csv'
  #   dataset_url = URL_PREFIX + '/csv' + csv_file_name
  #   csv_file_path = TEMP_STORAGE_PATH + csv_file_name
  #   download_file(dataset_url, csv_file_path)
  #   schema = StructType([
  #       StructField("id", StringType(), True),
  #       StructField("date", IntegerType(), True),
  #       StructField("element", StringType(), True),
  #       StructField("value", IntegerType(), True),
  #       StructField("m_flag", StringType(), True),
  #       StructField("q_flag", StringType(), True),
  #       StructField("s_flag", StringType(), True),
  #       StructField("obs_time", IntegerType(), True),
  #   ])
  #   df_raw = spark.read.options(header=False).schema(schema).csv(csv_file_path)
  #   # CSV dates arrive as integers and must be converted:
  #   df_raw = df_raw.withColumn("date", F.to_date(df_raw.date.cast("string"), "yyyyMMdd"))
  #
  # Alternative - read from GCS:
  #   df_raw = spark.read.parquet(f'gs://ghcnd_raw/{year}.parquet')

  # Default: read from BigQuery, where `date` is already a date.
  df_raw = (
    spark.read.format('bigquery')
    .option('project', 'ghcn-d')
    .option('dataset', 'ghcnd')
    .option('table', f'{year}')
    .load()
  )

  print(f'processing year {year}...')

  # One nullable double column per element; rows of other elements get a real
  # null (not the string "None"), so every measure column stays numeric.
  df_obs = (
    df_raw
    .drop("q_flag")
    .withColumn("tmax", _temperature_column(df_raw, "TMAX"))
    .withColumn("tmin", _temperature_column(df_raw, "TMIN"))
    .withColumn("prcp", _measurement_column(df_raw, "PRCP"))
    .withColumn("snow", _measurement_column(df_raw, "SNOW"))
    .withColumn("snwd", _measurement_column(df_raw, "SNWD"))
  )

  df_daily = _aggregate_and_enrich(df_obs, df_stations, df_countries)

  # Yearly figures: collapse every date to the first day of its year first.
  df_yearly = _aggregate_and_enrich(
    df_obs.withColumn("date", F.trunc("date", "year")),
    df_stations,
    df_countries,
  )

  # TODO: for some reason, partition by date does not work after
  # F.year("date"). This has to be fixed; partition is also needed for
  # clustering.
  # The field list is written without a space after the comma so that no
  # field name can pick up leading whitespace.
  (
    df_yearly.write
    .format("bigquery")
    .mode(mode)
    .option("clusteredFields", "date,country_code")
    .option('project', 'ghcn-d')
    .option('dataset', 'production')
    .option('table', 'fact_observations_spark_yearly')
    .save()
  )

  (
    df_daily.write
    .format("bigquery")
    .mode(mode)
    .option("partitionField", "date")
    .option("partitionType", "YEAR")
    .option("clusteredFields", "country_code")
    .option('project', 'ghcn-d')
    .option('dataset', 'production')
    .option('table', 'fact_observations_spark')
    .save()
  )

  print(f'process {year} done')

In [9]:

# Dimension tables, read from BigQuery (the GCS alternative reads parquet files).

# Stations: drop the columns that are not used, rename `name` and `id`, and
# derive the country code from the first two characters of the station id.
# Spark's substring positions are 1-based, so the prefix starts at position 1.
df_stations = (
  spark.read.format('bigquery')
  .option('project', 'ghcn-d')
  .option('dataset', 'ghcnd')
  .option('table', 'stations')
  .load()
  .drop('state', 'gsn_flag', 'hcn_crn_flag', 'wmo_id')
  .withColumnRenamed('name', 'station_name')
  .withColumnRenamed('id', 'station_id')
  .withColumn('country_code', F.substring('station_id', 1, 2))
)

# Countries: the `name` column is renamed to `country_name`.
df_countries = (
  spark.read.format('bigquery')
  .option('project', 'ghcn-d')
  .option('dataset', 'ghcnd')
  .option('table', 'countries')
  .load()
  .withColumnRenamed('name', 'country_name')
)
  

In [10]:
# Quick look at the country lookup: first 20 rows, long values truncated.
df_countries.show(n=20, truncate=True)

[Stage 0:>                                                          (0 + 1) / 1]

+----+--------------------+
|code|        country_name|
+----+--------------------+
|  FK|Falkland Islands ...|
|  JO|              Jordan|
|  PK|            Pakistan|
|  RQ|Puerto Rico [Unit...|
|  UK|      United Kingdom|
|  BK|Bosnia and Herzeg...|
|  LA|                Laos|
|  LQ|Palmyra Atoll [Un...|
|  PL|              Poland|
|  ST|         Saint Lucia|
|  SX|South Georgia and...|
|  VM|             Vietnam|
|  GV|              Guinea|
|  TE|Tromelin Island [...|
|  EK|   Equatorial Guinea|
|  GA|         Gambia, The|
|  IO|British Indian Oc...|
|  MF|    Mayotte [France]|
|  RP|         Philippines|
|  TS|             Tunisia|
+----+--------------------+
only showing top 20 rows



                                                                                

In [11]:
"""df_stations = spark.read.parquet('gs://ghcnd_raw/ghcnd-stations.parquet') \
  .drop('state', 'gsn_flag', 'hcn_crn_flag', 'wmo_id') \
  .withColumnRenamed('name', 'station_name') \
  .withColumnRenamed('id', 'station_id') \
  .withColumn('country_code', F.substring('station_id', 0, 2))

df_countries = spark.read.parquet('gs://ghcnd_raw/ghcnd-countries.parquet') \
  .withColumnRenamed('name', 'country_name')
"""

"df_stations = spark.read.parquet('gs://ghcnd_raw/ghcnd-stations.parquet')   .drop('state', 'gsn_flag', 'hcn_crn_flag', 'wmo_id')   .withColumnRenamed('name', 'station_name')   .withColumnRenamed('id', 'station_id')   .withColumn('country_code', F.substring('station_id', 0, 2))\n\ndf_countries = spark.read.parquet('gs://ghcnd_raw/ghcnd-countries.parquet')   .withColumnRenamed('name', 'country_name')\n"

In [12]:
# The first year is written with 'overwrite'; every following year with 'append'.
for year in range(start_year, end_year + 1):
  process_year(
    year,
    'overwrite' if year == start_year else 'append',
    df_stations,
    df_countries,
  )

processing year 2022...


Py4JJavaError: An error occurred while calling o206.save.
: java.lang.RuntimeException: java.lang.NoSuchMethodException: com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS.<init>()
	at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:137)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3467)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at com.google.cloud.spark.bigquery.SparkBigQueryUtil.getUniqueGcsPath(SparkBigQueryUtil.java:112)
	at com.google.cloud.spark.bigquery.SparkBigQueryUtil.createGcsPath(SparkBigQueryUtil.java:95)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.<init>(BigQueryWriteHelper.scala:44)
	at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.insert(BigQueryInsertableRelation.scala:42)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:112)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.NoSuchMethodException: com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS.<init>()
	at java.base/java.lang.Class.getConstructor0(Class.java:3349)
	at java.base/java.lang.Class.getDeclaredConstructor(Class.java:2553)
	at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:131)
	... 52 more
