In [184]:
from datetime import datetime

from pyspark import Row
from pyspark.sql import SparkSession, Window, functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType

In [67]:
spark = (SparkSession
         .builder
         .master("local[*]")
         .appName("data_exploration")
         .config("spark.driver.memory", "2g")
         .getOrCreate())

sc = spark.sparkContext

In [68]:
sc.uiWebUrl

'http://192.168.0.16:4040'

To start the exploration, we use a few month worth of data from 2003 that know to contain a heatwave from the assignment description.

Each file has several lines of headers that we need to strip.



In [69]:
def parse_float(string_to_parse: str):
    string = string_to_parse.rstrip()
    if string == "":
        return None
    else:
        return float(string_to_parse)


def parse_timestamp(string_to_parse: str):
    string = string_to_parse.rstrip()
    return datetime.strptime(string, "%Y-%m-%d %H:%M:%S")


def parse_row(row: str):
    return Row(
        timestamp=parse_timestamp(row[:21]),
        location=row[21:41].rstrip(),
        location_name=row[41:89].rstrip(),
        latitude=parse_float(row[89:109]),
        longitude=parse_float(row[109:129]),
        altitude=parse_float(row[129:149]),
        u_bool_10=parse_float(row[149:169]),
        t_dryb_10=parse_float(row[169:189]),
        tn_10cm_past_6h_10=parse_float(row[189:209]),
        t_dewp_10=parse_float(row[209:229]),
        t_dewp_sea_10=parse_float(row[229:249]),
        t_dryb_sea_10=parse_float(row[249:269]),
        tn_dryb_10=parse_float(row[269:289]),
        t_wetb_10=parse_float(row[289:309]),
        tx_dryb_10=parse_float(row[309:329]),
        u_10=parse_float(row[329:349]),
        u_sea_10=parse_float(row[349:]),
    )

In [70]:
parse_input_rdd = (sc
                   .textFile("../data/kis_tot_20030*", 3)
                   .map(lambda x: x.split(",")[0])
                   .filter(lambda x: not x.startswith("#"))
                   .map(lambda x: parse_row(x)))

Create a schema based on the files' schema.

| Field                | Description                                                         |
|----------------------|---------------------------------------------------------------------|
| `DTG`                | date of measurement                                                 |
| `LOCATION`           | location of the meteorological station                              |
| `NAME`               | name of the meteorological station                                  |
| `LATITUDE`           | in degrees (WGS84)                                                  |
| `LONGITUDE`          | in degrees (WGS84)                                                  |
| `ALTITUDE`           | in 0.1 m relative to Mean Sea Level (MSL)                           |
| `U_BOOL_10`          | air humidity code boolean 10' unit                                  |
| `T_DRYB_10`          | air temperature 10' unit Celcius degrees                            |
| `TN_10CM_PAST_6H_10` | air temperature minimum 0.1m 10' unit Celcius degrees               |
| `T_DEWP_10`          | air temperature derived dewpoint - 10' unit Celcius degrees         |
| `T_DEWP_SEA_10`      | air temperature derived dewpoint- sea 10' unit Celcius degrees      |
| `T_DRYB_SEA_10`      | air temperature height oil platform 10 minutes unit Celcius degrees |
| `TN_DRYB_10`         | air temperature minimum 10' unit Celcius degrees                    |
| `T_WETB_10`          | air temperature derived wet bulb- 10' unit Celcius degrees          |
| `TX_DRYB_10`         | air temperature maximum 10' unit Celcius degrees                    |
| `U_10`               | relative air humidity 10' unit %                                    |
| `U_SEA_10`           | is relative sea air humidity 10' unit %                             |

In [71]:
schema = StructType([
    StructField("timestamp", TimestampType(), False),
    StructField("location", StringType(), True),
    StructField("location_name", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("altitude", DoubleType(), True),
    StructField("u_bool_10", DoubleType(), True),
    StructField("t_dryb_10", DoubleType(), True),
    StructField("tn_10cm_past_6h_10", DoubleType(), True),
    StructField("t_dewp_10", DoubleType(), True),
    StructField("t_dewp_sea_10", DoubleType(), True),
    StructField("t_dryb_sea_10", DoubleType(), True),
    StructField("tn_dryb_10", DoubleType(), True),
    StructField("t_wetb_10", DoubleType(), True),
    StructField("tx_dryb_10", DoubleType(), True),
    StructField("u_10", DoubleType(), True),
    StructField("u_sea_10", DoubleType(), True),
])

In [72]:
df = (spark
      .createDataFrame(parse_input_rdd, schema=schema)
      .where(f.col("location") == "260_T_a")
      .select("timestamp",
              "location_name",
              f.col("T_DRYB_10").alias("temperature"),
              f.col("TN_DRYB_10").alias("temperature_minimum"),
              f.col("TX_DRYB_10").alias("temperature_maximum"),
              )
      )

In [163]:
max_df = df.groupBy(f.to_date("timestamp").alias("date")).agg(
    f.max(f.coalesce("temperature_maximum", "temperature")).alias("temperature"))

In [76]:
max_df.count()

                                                                                

184

In [174]:
max_df.cache()

DataFrame[location_name: string, date: date, temperature: double]

In [82]:
filtered_df = max_df.where(f.col("temperature") >= 25).orderBy("date")

In [168]:
# There's nothing to partition on in this case. All data will be moved to a single partition and be processed on a single worker.
# This only works as long as the dataframe is sufficiently small (which it is at this point).
window_spec = Window.orderBy("date")

is_start_of_sequence = f.col("number_of_days_to_preceding_rows_date") > 1

window_df = (max_df
             # Only keep days which might be part of a heat wave.
             .where(f.col("temperature") >= 25)
             # Ordering by date, calculate the date difference between each row and its preceding row.
             .withColumn("number_of_days_to_preceding_rows_date", f.datediff("date", f.lag("date").over(window_spec)))
             # If the number of days to the preceding row is greater than 1 in a row, the date is the start of a new sequence.
             .withColumn("sequence_start_date", f.when(is_start_of_sequence, f.col("date")))
             # For all other rows, find the last non-null value (going upwards in the column/backwards in time).
             .withColumn("sequence_start_date",
                         f.last(f.col("sequence_start_date"), ignorenulls=True).over(window_spec))
             )

include_partial_sequence_at_beginning_of_date_range = False

# Only required for bootstrapping. When processing a new batch of data for month n + 1 together with month n to recalculate potential heat waves lasting from month n to month + 1, we don't want to include partial data at the beginning of the month.
# if include_partial_sequence_at_beginning_of_daterange:
#TODO set start date where number_of_days_to_preceding_rows_date is null
# else:
#TODO filter rows where sequence_start_date is null

window_df = window_df.where(f.col("sequence_start_date").isNotNull())

In [169]:
window_df.show(200)

23/07/15 22:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:18:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+-------------+----------+-----------+-------------------------------------+-------------------+
|location_name|      date|temperature|number_of_days_to_preceding_rows_date|sequence_start_date|
+-------------+----------+-----------+-------------------------------------+-------------------+
|      De Bilt|2003-06-04|       28.9|                                    2|         2003-06-04|
|      De Bilt|2003-06-06|       25.0|                                    2|         2003-06-06|
|      De Bilt|2003-06-07|       26.5|                                    1|         2003-06-06|
|      De Bilt|2003-06-08|       25.4|                                    1|         2003-06-06|
|      De Bilt|2003-06-10|       25.4|                                    2|         2003-06-10|
|      De Bilt|2003-06-16|       25.0|                                    6|         2003-06-16|
|      De Bilt|2003-06-17|       25.4|                                    1|         2003-06-16|
|      De Bilt|2003-06-22|    

23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:19:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 2

In [188]:
is_tropical_day = f.col("temperature") >= 30.0

result_df = (
    window_df
    .withColumn("tropical_day", f.when(is_tropical_day, 1))
    .groupBy(f.col("sequence_start_date").alias("From date"))
    .agg(
        f.count("*").alias("Duration (in days)"),
        f.sum("tropical_day").alias("Number of tropical days"),
        f.max("temperature").alias("Max temperature")
    )
    .withColumn("To date (inc)", f.date_add("From date", f.col("Duration (in days)").cast(IntegerType())))
    .where((f.col("Duration (in days)") >= 5) & (f.col("Number of tropical days") >= 3))
    .select("From date", "To date (inc)", "Duration (in days)", "Number of tropical days", "Max temperature")
)

result_df.show()

23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-------------+------------------+-----------------------+---------------+
| From date|To date (inc)|Duration (in days)|Number of tropical days|Max temperature|
+----------+-------------+------------------+-----------------------+---------------+
|2003-07-31|   2003-08-14|                14|                      7|           35.0|
+----------+-------------+------------------+-----------------------+---------------+



23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/07/15 22:47:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
