In [4]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration for this job.
conf = SparkConf().setMaster("local").setAppName("FindMinTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
try:
    lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")
    # take() is the action that triggers the read, so I/O errors surface here.
    print(lines.take(5))
finally:
    # Always release the context, even on failure, so the next cell can start one.
    sc.stop()

['ITE00100554,18000101,TMAX,-75,,,E,', 'ITE00100554,18000101,TMIN,-148,,,E,', 'GM000010962,18000101,PRCP,0,,,E,', 'EZE00100082,18000101,TMAX,-86,,,E,', 'EZE00100082,18000101,TMIN,-135,,,E,']


In [5]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration for this job.
conf = SparkConf().setMaster("local").setAppName("FindMinTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# textFile() only declares the RDD; the file is read when an action runs.
lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")

def parseLine(line):
    """Split one comma-separated record into (stationID, entryType, temperature).

    Column 0 is returned as the station ID, column 2 as the entry type, and
    column 3 is parsed as a float, scaled by 0.1 and mapped through
    ``c * 9/5 + 32`` (presumably tenths of a degree Celsius -> Fahrenheit).

    Raises IndexError if the line has fewer than four columns and ValueError
    if column 3 is not numeric.
    """
    columns = line.split(',')
    station, kind, raw_reading = columns[0], columns[2], columns[3]
    scaled = float(raw_reading) * 0.1
    fahrenheit = scaled * (9.0 / 5.0) + 32.0
    return (station, kind, fahrenheit)

# Spark evaluates lazily, so read/parse errors only surface inside take().
# try/finally guarantees the context is stopped even then; a leaked context
# would make the next SparkContext(...) call fail.
try:
    rdd = lines.map(parseLine)
    print(rdd.take(5))
finally:
    sc.stop()

[('ITE00100554', 'TMAX', 18.5), ('ITE00100554', 'TMIN', 5.359999999999999), ('GM000010962', 'PRCP', 32.0), ('EZE00100082', 'TMAX', 16.52), ('EZE00100082', 'TMIN', 7.699999999999999)]


Now we only want the 'TMIN' (minimum temperature) rows, so we can use the **filter()** function, which filters the rows using a given condition. The new RDD will only contain the rows for which the condition is True. **Note that the filter() function applies the condition to all elements of the RDD.**

In [1]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration for this job.
conf = SparkConf().setMaster("local").setAppName("FindMinTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# textFile() only declares the RDD; the file is read when an action runs.
lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")

def parseLine(line):
    """Turn one CSV line into a (stationID, entryType, temperature) tuple.

    Uses columns 0, 2 and 3 of the comma-split line; column 3 is converted
    to a float and rescaled to Fahrenheit.
    """
    parts = line.split(',')
    return (
        parts[0],
        parts[2],
        float(parts[3]) * 0.1 * (9.0 / 5.0) + 32.0,  # Convert temperature to F
    )

# Spark evaluates lazily, so read/parse errors only surface inside take().
# try/finally guarantees the context is stopped even then; a leaked context
# would make the next SparkContext(...) call fail.
try:
    rdd = lines.map(parseLine)
    # Keep only TMIN records and reduce each to a (stationID, temperature) pair.
    minTemps = rdd.filter(lambda x: x[1] == 'TMIN').map(lambda x: (x[0], x[2]))
    print(minTemps.take(5))
finally:
    sc.stop()

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999), ('ITE00100554', 9.5), ('EZE00100082', 8.599999999999998), ('ITE00100554', 23.72)]


Now we want to find the minimum temperature for each location across the whole year. The way to do this is with the **reduceByKey()** function. The value we return is the minimum of the previous and the current values. (Note that the function passed to **reduceByKey()** takes two arguments: the first is the accumulated value, i.e. what has been seen so far, and the second is the current value. It is applied across records that share the same key.)

In [3]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration for this job.
conf = SparkConf().setMaster("local").setAppName("FindMinTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# textFile() only declares the RDD; the file is read when an action runs.
lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")

def parseLine(line):
    """Parse a CSV record and return (stationID, entryType, temperature).

    The station ID and entry type are taken verbatim from columns 0 and 2.
    Column 3 is read as a float and converted to Fahrenheit.
    """
    tokens = line.split(',')
    station_id = tokens[0]
    record_kind = tokens[2]
    reading = float(tokens[3])
    # Convert temperature to F
    temp_f = reading * 0.1 * (9.0 / 5.0) + 32.0
    return station_id, record_kind, temp_f

# Spark evaluates lazily, so read/parse errors only surface inside take().
# try/finally guarantees the context is stopped even then; a leaked context
# would make the next SparkContext(...) call fail.
try:
    rdd = lines.map(parseLine)
    # Keep only TMIN records and reduce each to a (stationID, temperature) pair.
    minTemps = rdd.filter(lambda x: x[1] == 'TMIN').map(lambda x: (x[0], x[2]))
    # Fold all temperatures sharing a station ID down to their minimum.
    minTemp = minTemps.reduceByKey(lambda x, y: min(x, y))
    print(minTemp.take(5))
finally:
    sc.stop()

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]


Finally, we perform the action **collect()** to collect the result.

In [5]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration for this job.
conf = SparkConf().setMaster("local").setAppName("FindMinTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# textFile() only declares the RDD; the file is read when an action runs.
lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")

def parseLine(line):
    """Extract (stationID, entryType, temperature) from one CSV line.

    Columns 0 and 2 are passed through unchanged; column 3 is parsed as a
    float and converted to Fahrenheit.
    """
    cells = line.split(',')
    who, what = cells[0], cells[2]
    tenths = float(cells[3])
    return (who, what, tenths * 0.1 * (9.0 / 5.0) + 32.0)  # Convert temperature to F

# Spark evaluates lazily, so read/parse errors only surface inside collect().
# try/finally guarantees the context is stopped even then; a leaked context
# would make the next SparkContext(...) call fail.
try:
    rdd = lines.map(parseLine)
    # Keep only TMIN records and reduce each to a (stationID, temperature) pair.
    minTemps = rdd.filter(lambda x: x[1] == 'TMIN').map(lambda x: (x[0], x[2]))
    # Fold all temperatures sharing a station ID down to their minimum.
    minTemp = minTemps.reduceByKey(lambda x, y: min(x, y))
    results = minTemp.collect()

    # One line per station: ID, a tab, then the temperature to two decimals.
    for result in results:
        print(result[0] + "\t{:.2f}".format(result[1]))
finally:
    sc.stop()

ITE00100554	5.36
EZE00100082	7.70


Modify to show the maximum temperature found for each weather station:

In [7]:
import findspark

findspark.init()
from pyspark import SparkConf, SparkContext
# Local-mode configuration. This cell computes maxima, so the application
# name says so instead of the copy-pasted "FindMinTemperature".
conf = SparkConf().setMaster("local").setAppName("FindMaxTemperature")
# getOrCreate() reuses a context left running by an interrupted earlier cell
# instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# textFile() only declares the RDD; the file is read when an action runs.
lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\1800.csv")

def parseLine(line):
    """Build the (stationID, entryType, temperature) tuple for one CSV line.

    The first two elements are columns 0 and 2 of the comma-split line; the
    third is column 3 parsed as a float and converted to Fahrenheit.
    """
    row = line.split(',')
    record = [row[0], row[2]]
    # Convert temperature to F
    record.append(float(row[3]) * 0.1 * (9.0 / 5.0) + 32.0)
    return tuple(record)

# Spark evaluates lazily, so read/parse errors only surface inside collect().
# try/finally guarantees the context is stopped even then; a leaked context
# would make the next SparkContext(...) call fail.
try:
    rdd = lines.map(parseLine)
    # Keep only TMAX records and reduce each to a (stationID, temperature) pair.
    maxTemps = rdd.filter(lambda x: x[1] == 'TMAX').map(lambda x: (x[0], x[2]))
    # Fold all temperatures sharing a station ID down to their maximum.
    maxTemp = maxTemps.reduceByKey(lambda x, y: max(x, y))
    results = maxTemp.collect()

    # One line per station: ID, a tab, then the temperature to two decimals.
    for result in results:
        print(result[0] + "\t{:.2f}".format(result[1]))
finally:
    sc.stop()

ITE00100554	90.14
EZE00100082	90.14
