In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [2]:
# Start (or reuse, if one is already running) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("MinTemperatures")
    .getOrCreate()
)

In [3]:
# Explicit schema for the input CSV: one StructField per column, in file order.
# The third argument (True) marks every column as nullable.
schema = StructType([
    StructField("stationID", StringType(), True),
    StructField("date", IntegerType(), True),
    StructField("measure_type", StringType(), True),
    StructField("temperature", FloatType(), True),
])

Read the CSV file into a DataFrame using an explicit schema

In [4]:
# Load the CSV with the explicit schema rather than letting Spark infer column types,
# then print the resulting schema as a quick sanity check.
df = spark.read.csv("1800.csv", schema=schema)
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)



Filter out all but TMIN entries

In [5]:
# Keep only the rows whose measure_type is exactly "TMIN"; every other measurement is dropped.
minTemps = df.where(func.col("measure_type") == "TMIN")

Select only stationID and temperature

In [6]:
# Project down to the two columns the aggregation needs.
stationTemps = minTemps.select(
    func.col("stationID"),
    func.col("temperature"),
)

Aggregate to find the minimum temperature for every station

In [7]:
# One output row per station holding its lowest recorded temperature.
# Spark names the aggregated column "min(temperature)"; the conversion cell
# further down refers to it by that generated name.
minTempsByStation = (
    stationTemps
    .groupBy("stationID")
    .min("temperature")
)
minTempsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



Convert the temperature to Fahrenheit and sort the dataset by temperature

In [8]:
# Build the final two-column result: station ID plus its minimum temperature in Fahrenheit,
# rounded to 2 decimals and sorted ascending by that value.
# NOTE(review): the 0.1 factor presumably rescales raw readings stored in tenths of a
# degree Celsius before the usual C -> F formula (x * 9/5 + 32) - confirm against the dataset docs.
minTempsByStationF = (
    minTempsByStation
    .select(
        "stationID",
        func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2).alias("temperature"),
    )
    .sort("temperature")
)
                                                  
# Collect, format, and print the results
# collect() brings every row of the sorted DataFrame back to the driver as a Python list.
results = minTempsByStationF.collect()