Install pyspark, and java for spurk to run on:

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=c99f54b306a5baa81484bade9dc6d2a8eb206009f4a1a48bccb30272769764a5
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

Initialize spark context:

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)
spark = SparkSession.builder.getOrCreate()

Load the dataset:

In [3]:
temperatures_data = spark.read.csv('/content/drive/MyDrive/spark_tutorials/spark_datasets/1800.csv', header=False)
temperatures_data.show()

+-----------+--------+----+----+----+----+---+----+
|        _c0|     _c1| _c2| _c3| _c4| _c5|_c6| _c7|
+-----------+--------+----+----+----+----+---+----+
|ITE00100554|18000101|TMAX| -75|NULL|NULL|  E|NULL|
|ITE00100554|18000101|TMIN|-148|NULL|NULL|  E|NULL|
|GM000010962|18000101|PRCP|   0|NULL|NULL|  E|NULL|
|EZE00100082|18000101|TMAX| -86|NULL|NULL|  E|NULL|
|EZE00100082|18000101|TMIN|-135|NULL|NULL|  E|NULL|
|ITE00100554|18000102|TMAX| -60|NULL|   I|  E|NULL|
|ITE00100554|18000102|TMIN|-125|NULL|NULL|  E|NULL|
|GM000010962|18000102|PRCP|   0|NULL|NULL|  E|NULL|
|EZE00100082|18000102|TMAX| -44|NULL|NULL|  E|NULL|
|EZE00100082|18000102|TMIN|-130|NULL|NULL|  E|NULL|
|ITE00100554|18000103|TMAX| -23|NULL|NULL|  E|NULL|
|ITE00100554|18000103|TMIN| -46|NULL|   I|  E|NULL|
|GM000010962|18000103|PRCP|   4|NULL|NULL|  E|NULL|
|EZE00100082|18000103|TMAX| -10|NULL|NULL|  E|NULL|
|EZE00100082|18000103|TMIN| -73|NULL|NULL|  E|NULL|
|ITE00100554|18000104|TMAX|   0|NULL|NULL|  E|NULL|
|ITE00100554

Manipulate the data:

In [4]:
d = temperatures_data.rdd.map( lambda x: (x[0], x[1], x[2], int(x[3])) )
d_min = d.filter( lambda x: x[2] == 'TMIN' )
d_max = d.filter( lambda x: x[2] == 'TMAX' )
d_min = d_min.map( lambda x: (x[0], (x[1], x[3])) )
d_max = d_max.map( lambda x: (x[0], (x[1], x[3])) )

min_temps = d_min.reduceByKey( lambda x, y: (x[0], min(x[1], y[1])) )
max_temps = d_max.reduceByKey( lambda x, y: (x[0], max(x[1], y[1])) )
min_temps = min_temps.collect()
max_temps = max_temps.collect()

In [5]:
print('Min temps by station:\n', min_temps)
print('Max temps by station:\n', max_temps)

Min temps by station:
 [('ITE00100554', ('18000101', -148)), ('EZE00100082', ('18000101', -135))]
Max temps by station:
 [('ITE00100554', ('18000101', 323)), ('EZE00100082', ('18000101', 323))]


Same thing but with Spark DataFrames:

In [18]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

schema = StructType([
    StructField('ID', StringType(), nullable=True),
    StructField('date', StringType(), nullable=True),
    StructField('measure_type', StringType(), nullable=True),
    StructField('temperature', FloatType(), nullable=True)
])

temperature_df = spark.read.schema(schema).csv('/content/drive/MyDrive/spark_tutorials/spark_datasets/1800.csv')
temperature_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- date: string (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)



In [23]:
temperature_df_min = temperature_df.filter((temperature_df.measure_type == 'TMIN'))
temperature_df_max = temperature_df.filter((temperature_df.measure_type == 'TMAX'))
min_temperatures = temperature_df_min.groupBy('ID').agg(F.min('temperature').alias('min_temperature'))
max_temperatures = temperature_df_max.groupBy('ID').agg(F.max('temperature').alias('max_temperature'))

min_temperatures = min_temperatures.withColumn('min_temperature', F.round(F.col('min_temperature')*0.1, 1))
max_temperatures = max_temperatures.withColumn('max_temperature', F.round(F.col('max_temperature')*0.1, 1))
min_temperatures.show()
max_temperatures.show()


+-----------+---------------+
|         ID|min_temperature|
+-----------+---------------+
|ITE00100554|          -14.8|
|EZE00100082|          -13.5|
+-----------+---------------+

+-----------+---------------+
|         ID|max_temperature|
+-----------+---------------+
|ITE00100554|           32.3|
|EZE00100082|           32.3|
+-----------+---------------+

