In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Start (or reuse) the Spark session used by this cell.
spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()

# Column names and types are declared explicitly and handed to the CSV reader.
# NOTE(review): the second field is named "data"; possibly a typo for "date" - confirm.
schema = (
    StructType()
    .add("stationID", StringType(), True)
    .add("data", IntegerType(), True)
    .add("measure_type", StringType(), True)
    .add("temperature", FloatType(), True)
)

csv_path = r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv"
df = spark.read.csv(csv_path, schema=schema)
df.printSchema()

# Keep only the rows whose measure type is TMIN.
minTemps = df.where(func.col("measure_type") == "TMIN")

# Narrow the frame to the two columns the aggregation needs.
stationTemps = minTemps.select("stationID", "temperature")

# Lowest temperature per station; the result column is named "min(temperature)".
stations = stationTemps.groupBy("stationID")
minTempsByStation = stations.min("temperature")
minTempsByStation.show()

root
 |-- stationID: string (nullable = true)
 |-- data: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



Note that we pass an explicit schema when reading the file. The CSV file has no header row, so Spark cannot take the column names from it; we supply the column names and their data types ourselves.

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Start (or reuse) the Spark session used by this cell.
spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()

# Column names and types are declared explicitly and handed to the CSV reader.
# NOTE(review): the second field is named "data"; possibly a typo for "date" - confirm.
schema = (
    StructType()
    .add("stationID", StringType(), True)
    .add("data", IntegerType(), True)
    .add("measure_type", StringType(), True)
    .add("temperature", FloatType(), True)
)

csv_path = r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv"
df = spark.read.csv(csv_path, schema=schema)
df.printSchema()

# Keep only the rows whose measure type is TMIN.
minTemps = df.where(func.col("measure_type") == "TMIN")

# Narrow the frame to the two columns the aggregation needs.
stationTemps = minTemps.select("stationID", "temperature")

# Lowest temperature per station; the result column is named "min(temperature)".
stations = stationTemps.groupBy("stationID")
minTempsByStation = stations.min("temperature")

# Convert the per-station minimum to Fahrenheit (x * 9/5 + 32) and round to 2 decimals.
# NOTE(review): the 0.1 factor presumably rescales tenths of a degree Celsius - confirm against the dataset.
min_temp = func.col("min(temperature)")
fahrenheit = func.round(min_temp * 0.1 * (9.0 / 5.0) + 32.0, 2)

# Replace the aggregate column with the converted one and order the rows by it, ascending.
minTempsByStationF = (
    minTempsByStation
    .withColumn("temperature", fahrenheit)
    .select("stationID", "temperature")
    .sort("temperature")
)
minTempsByStationF.show()

# Shut the Spark session down once the result has been displayed.
spark.stop()

root
 |-- stationID: string (nullable = true)
 |-- data: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|       5.36|
|EZE00100082|        7.7|
+-----------+-----------+



**withColumn()** takes two arguments: the name of the new column (*temperature*) and the expression that produces its values. Inside that expression, the second argument of **func.round()** (the `2`) rounds the converted value to 2 decimal places.