In [97]:
from datetime import datetime

from pyspark import Row
from pyspark.sql import SparkSession, Window, functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType

In [98]:
# Create (or reuse) a local Spark session for this exploration:
#   - master "local[*]": run Spark locally, using all available cores
#   - spark.driver.memory "2g": memory setting requested for the driver
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("data_exploration")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Handle to the underlying SparkContext, needed for the RDD API used below.
sc = spark.sparkContext

In [99]:
# Display the URL of the Spark web UI that belongs to this SparkContext.
sc.uiWebUrl

To start the exploration, we use a few months' worth of data from 2003, which is known from the assignment description to contain a heatwave.

Each file has several lines of headers that we need to strip (starting with "#").



In [100]:
# Read the raw data file as lines of text (asking for at least 2 partitions),
# keep only the text before the first comma of every line, and drop the
# header lines, which start with "#".
parsed_input_rdd = (
    sc
    .textFile("../data/kis_tot_200304", 2)
    .map(lambda line: line.split(",")[0])
    .filter(lambda line: not line.startswith("#"))
)

In [101]:
# Peek at the first five remaining (non-header) lines.
parsed_input_rdd.take(5)

The file appears to be in a fixed width format. The columns can be parsed by extracting partial strings using the column width given by a text editor. This is brittle and will only work if all files have the same column widths.

The fields in the data that we need to parse are:

| Field                | Description                                                         |
|----------------------|---------------------------------------------------------------------|
| `DTG`                | date of measurement                                                 |
| `LOCATION`           | location of the meteorological station                              |
| `NAME`               | name of the meteorological station                                  |
| `LATITUDE`           | in degrees (WGS84)                                                  |
| `LONGITUDE`          | in degrees (WGS84)                                                  |
| `ALTITUDE`           | in 0.1 m relative to Mean Sea Level (MSL)                           |
| `U_BOOL_10`          | air humidity code boolean 10' unit                                  |
| `T_DRYB_10`          | air temperature 10' unit Celsius degrees                            |
| `TN_10CM_PAST_6H_10` | air temperature minimum 0.1m 10' unit Celsius degrees               |
| `T_DEWP_10`          | air temperature derived dewpoint - 10' unit Celsius degrees         |
| `T_DEWP_SEA_10`      | air temperature derived dewpoint- sea 10' unit Celsius degrees      |
| `T_DRYB_SEA_10`      | air temperature height oil platform 10 minutes unit Celsius degrees |
| `TN_DRYB_10`         | air temperature minimum 10' unit Celsius degrees                    |
| `T_WETB_10`          | air temperature derived wet bulb- 10' unit Celsius degrees          |
| `TX_DRYB_10`         | air temperature maximum 10' unit Celsius degrees                    |
| `U_10`               | relative air humidity 10' unit %                                    |
| `U_SEA_10`           | relative sea air humidity 10' unit %                                |

In [102]:
def parse_float(string_to_parse: str):
    """Convert one fixed-width text column into a float.

    Trailing whitespace (the column padding) is ignored.

    :param string_to_parse: raw text of the column, possibly padded or blank
    :return: the parsed ``float``, or ``None`` when the column is empty or
        contains only whitespace
    :raises ValueError: if the column holds text that is not a valid float
    """
    trimmed = string_to_parse.rstrip()
    # A blank column means the measurement is missing.
    if not trimmed:
        return None
    return float(trimmed)


def parse_datetime(string_to_parse: str):
    """Convert one fixed-width text column into a ``datetime``.

    Trailing whitespace (the column padding) is removed before parsing.

    :param string_to_parse: timestamp text in ``YYYY-MM-DD HH:MM:SS`` form,
        optionally followed by whitespace
    :return: the parsed (naive) ``datetime``
    :raises ValueError: if the text does not match the expected format
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(string_to_parse.rstrip(), timestamp_format)


def parse_row(row: str):
    """Split one fixed-width data line into a ``Row`` of typed fields.

    Column layout (character offsets into the line):

    * ``[0:21]``   ``dtg`` - timestamp, parsed with ``parse_datetime``
    * ``[21:41]``  ``location`` - text, trailing whitespace removed
    * ``[41:89]``  ``name`` - text, trailing whitespace removed
    * ``[89:349]`` thirteen numeric columns of 20 characters each, parsed
      with ``parse_float``
    * ``[349:]``   ``u_sea_10`` - numeric, runs to the end of the line

    :param row: a single data (non-header) line of the input file
    :return: a ``Row`` with one field per column; blank numeric columns
        become ``None``
    """
    # Names of the consecutive 20-character numeric columns, in file order.
    numeric_fields = (
        "latitude",
        "longitude",
        "altitude",
        "u_bool_10",
        "t_dryb_10",
        "tn_10cm_past_6h_10",
        "t_dewp_10",
        "t_dewp_sea_10",
        "t_dryb_sea_10",
        "tn_dryb_10",
        "t_wetb_10",
        "tx_dryb_10",
        "u_10",
    )
    first_numeric_offset = 89
    numeric_width = 20

    values = {
        "dtg": parse_datetime(row[:21]),
        "location": row[21:41].rstrip(),
        "name": row[41:89].rstrip(),
    }
    for position, field in enumerate(numeric_fields):
        start = first_numeric_offset + numeric_width * position
        values[field] = parse_float(row[start:start + numeric_width])
    # The last column is not sliced to a fixed width: it takes the rest of the line.
    values["u_sea_10"] = parse_float(row[349:])

    return Row(**values)

# Turn every raw text line into a typed Row.
# NOTE: this rebinds `parsed_input_rdd`; re-running this cell without first
# re-running the cell that loads the file would feed Row objects to parse_row.
parsed_input_rdd = parsed_input_rdd.map(parse_row)

In [103]:
# Peek at the first five parsed rows to sanity-check the column offsets.
parsed_input_rdd.take(5)

As we are only interested in data for the weather station `De Bilt` indicated by the Location `260_T_a`, we filter other records out.

In [104]:
# Keep only the records whose location code is "260_T_a".
parsed_input_rdd = parsed_input_rdd.filter(
    lambda record: record.location == "260_T_a"
)

In [105]:
# Peek at the first five rows that remain after filtering on location.
parsed_input_rdd.take(5)

For calculating heat and cold waves, we only care about the fields:
1. datetime (DTG)
2. temperature (T_DRYB_10)
3. minimum temperature (TN_DRYB_10)
4. maximum temperature (TX_DRYB_10)

We can disregard all other fields for further exploration.

In [106]:
# Reduce every record to the four fields used from here on, renaming them to
# more descriptive names (dtg -> dt, t_dryb_10 -> temperature,
# tn_dryb_10 -> min_temperature, tx_dryb_10 -> max_temperature).
parsed_input_rdd = parsed_input_rdd.map(
    lambda record: Row(
        dt=record.dtg,
        temperature=record.t_dryb_10,
        min_temperature=record.tn_dryb_10,
        max_temperature=record.tx_dryb_10,
    )
)

In [107]:
# Peek at the first five reduced rows.
parsed_input_rdd.take(5)

To make our life a bit easier for a further dive into the data, we create a DataFrame from the RDD to access functions at a higher level of abstraction.

In [108]:
# Explicit schema for the reduced records. StructField is nullable by default,
# so missing measurements (None) are accepted in every column.
schema = StructType([
    StructField("dt", TimestampType()),
    StructField("temperature", DoubleType()),
    StructField("min_temperature", DoubleType()),
    StructField("max_temperature", DoubleType()),
])

# Build a DataFrame from the RDD of Rows using that schema.
df = spark.createDataFrame(parsed_input_rdd, schema=schema)

In [109]:
# Print the first five rows of the DataFrame.
df.show(5)

We aggregate the data to identify missing values, and identify potential outliers.

In [110]:
# Summary statistics over the whole DataFrame. f.count(<column>) counts only
# non-null values, so comparing it with num_rows reveals missing data; the
# min/max pairs help to spot outliers.
# To break the statistics down per year instead, group by f.year("dt")
# (aliased as "year") before aggregating and order the result by "year".
stats_df = df.agg(
    # Row and timestamp counts
    f.count("*").alias("num_rows"),
    f.count("dt").alias("num_dates"),
    # Temperature
    f.count("temperature").alias("num_temps"),
    f.min("temperature").alias("min_temp"),
    f.max("temperature").alias("max_temp"),
    # Minimum temperature
    f.count("min_temperature").alias("num_min_temps"),
    f.min("min_temperature").alias("min_min_temp"),
    f.max("min_temperature").alias("max_min_temp"),
    # Maximum temperature
    f.count("max_temperature").alias("num_max_temps"),
    f.min("max_temperature").alias("min_max_temp"),
    f.max("max_temperature").alias("max_max_temp"),
)

In [111]:
# Print the aggregated statistics.
stats_df.show()