Pivot exercise

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("Dateformat and Pivot") \
    .master("local[*]") \
    .getOrCreate()

In [4]:
rdd = spark.sparkContext.parallelize([
    ("FATAL", "January", 94),
    ("WARN", "January", 8217),
    ("ERROR", "January", 4054),
    ("DEBUG", "January", 40856),
    ("INFO", "January", 18653),
    ("ERROR", "February", 4013),
    ("WARN", "February", 7543),
    ("DEBUG", "February", 36214),
    ("INFO", "February", 17563),
    ("FATAL", "February", 120),
    ("ERROR", "March", 4320),
    ("WARN", "March", 7890),
    ("DEBUG", "March", 39421),
    ("INFO", "March", 18347),
    ("FATAL", "March", 98),
    ("ERROR", "April", 3987),
    ("WARN", "April", 7201),
    ("DEBUG", "April", 38765),
    ("INFO", "April", 17258),
    ("FATAL", "April", 112),
    ("ERROR", "May", 4102),
    ("WARN", "May", 8005),
    ("DEBUG", "May", 40562),
    ("INFO", "May", 17893),
    ("FATAL", "May", 105),
    ("ERROR", "June", 3821),
    ("WARN", "June", 7124),
    ("DEBUG", "June", 39240),
    ("INFO", "June", 16942),
    ("FATAL", "June", 99),
    ("ERROR", "July", 4501),
    ("WARN", "July", 8450),
    ("DEBUG", "July", 41872),
    ("INFO", "July", 19027),
    ("FATAL", "July", 108),
    ("ERROR", "August", 4203),
    ("WARN", "August", 7893),
    ("DEBUG", "August", 39980),
    ("INFO", "August", 18594),
    ("FATAL", "August", 115),
    ("ERROR", "September", 3702),
    ("WARN", "September", 6992),
    ("DEBUG", "September", 38572),
    ("INFO", "September", 16780),
    ("FATAL", "September", 102),
    ("ERROR", "October", 4056),
    ("WARN", "October", 8024),
    ("DEBUG", "October", 40651),
    ("INFO", "October", 17678),
    ("FATAL", "October", 109),
    ("ERROR", "November", 3805),
    ("WARN", "November", 7121),
    ("DEBUG", "November", 39102),
    ("INFO", "November", 16832),
    ("FATAL", "November", 104),
    ("ERROR", "December", 4230),
    ("WARN", "December", 7850),
    ("DEBUG", "December", 40431),
    ("INFO", "December", 18001),
    ("FATAL", "December", 111)
])

In [5]:
# create a Schema
from pyspark.sql.types import *
from pyspark.sql.functions import *

schema = StructType([
    StructField("log_level",StringType(),True),
    StructField("month",StringType(),True),
    StructField("total_occurences",IntegerType(),True)
])

In [6]:
df = spark.createDataFrame(rdd, schema)

In [8]:
df.show(5)

+---------+-------+----------------+
|log_level|  month|total_occurences|
+---------+-------+----------------+
|    FATAL|January|              94|
|     WARN|January|            8217|
|    ERROR|January|            4054|
|    DEBUG|January|           40856|
|     INFO|January|           18653|
+---------+-------+----------------+
only showing top 5 rows



In [10]:
df.groupBy("log_level").count().show()

+---------+-----+
|log_level|count|
+---------+-----+
|     INFO|   12|
|    ERROR|   12|
|     WARN|   12|
|    FATAL|   12|
|    DEBUG|   12|
+---------+-----+



In [None]:
# Pivot from Jan -> Dec
df_pivot = df.groupBy("log_level").pivot("month").sum("total_occurences")
df_pivot.show()

+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|log_level|April|August|December|February|January| July| June|March|  May|November|October|September|
+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|     INFO|17258| 18594|   18001|   17563|  18653|19027|16942|18347|17893|   16832|  17678|    16780|
|    ERROR| 3987|  4203|    4230|    4013|   4054| 4501| 3821| 4320| 4102|    3805|   4056|     3702|
|     WARN| 7201|  7893|    7850|    7543|   8217| 8450| 7124| 7890| 8005|    7121|   8024|     6992|
|    FATAL|  112|   115|     111|     120|     94|  108|   99|   98|  105|     104|    109|      102|
|    DEBUG|38765| 39980|   40431|   36214|  40856|41872|39240|39421|40562|   39102|  40651|    38572|
+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



In [None]:
# Pivot with sum() and avg() function
df_pivot1= df.groupBy("log_level").pivot("month").agg(
    sum("total_occurences").alias("total"),
    avg("total_occurences").alias("average")
)
df_pivot1.show()

+---------+-----------+-------------+------------+--------------+--------------+----------------+--------------+----------------+-------------+---------------+----------+------------+----------+------------+-----------+-------------+---------+-----------+--------------+----------------+-------------+---------------+---------------+-----------------+
|log_level|April_total|April_average|August_total|August_average|December_total|December_average|February_total|February_average|January_total|January_average|July_total|July_average|June_total|June_average|March_total|March_average|May_total|May_average|November_total|November_average|October_total|October_average|September_total|September_average|
+---------+-----------+-------------+------------+--------------+--------------+----------------+--------------+----------------+-------------+---------------+----------+------------+----------+------------+-----------+-------------+---------+-----------+--------------+----------------+---------

In [None]:
spark.stop()