Date Format exercise

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .appName("Date Format exercise") \
    .master("local[*]") \
    .getOrCreate()

In [10]:
# Define schema
from pyspark.sql.types import *
from pyspark.sql.functions import *
schema = StructType([
    StructField("log_level", StringType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("total_occurences", StringType(), True)
])

In [4]:
import random
from datetime import datetime, timedelta
# Generate data sample
def random_timestamp():
    start_date = datetime(2015,1,1,0,0)
    random_days = random.randint(0,364)
    random_time = timedelta(hours=random.randint(0, 23), minutes=random.randint(0, 59), seconds=random.randint(0, 59))
    return start_date + timedelta(days=random_days) + random_time

log_levels = ["INFO","WARN", "ERROR", "DEBUG", "FATAL"]

data = [
    (random.choice(log_levels), random_timestamp(), random.randint(10, 50000))
    for _ in range(100)
]


In [9]:
print(data)

[('FATAL', datetime.datetime(2015, 9, 6, 19, 0, 48), 36235), ('ERROR', datetime.datetime(2015, 6, 29, 11, 52, 43), 31182), ('ERROR', datetime.datetime(2015, 3, 29, 23, 43, 6), 3662), ('INFO', datetime.datetime(2015, 10, 21, 13, 32, 37), 36606), ('DEBUG', datetime.datetime(2015, 8, 26, 7, 26, 54), 48107), ('FATAL', datetime.datetime(2015, 8, 10, 1, 32, 18), 20897), ('ERROR', datetime.datetime(2015, 3, 28, 22, 20, 19), 45155), ('DEBUG', datetime.datetime(2015, 7, 23, 21, 16, 12), 46721), ('ERROR', datetime.datetime(2015, 8, 5, 0, 47, 14), 24229), ('FATAL', datetime.datetime(2015, 4, 25, 21, 6, 30), 16353), ('INFO', datetime.datetime(2015, 7, 22, 17, 40, 3), 1110), ('WARN', datetime.datetime(2015, 8, 11, 9, 37, 12), 15852), ('DEBUG', datetime.datetime(2015, 9, 16, 11, 5, 12), 25951), ('ERROR', datetime.datetime(2015, 3, 5, 12, 3, 43), 19500), ('WARN', datetime.datetime(2015, 7, 2, 9, 59), 7524), ('FATAL', datetime.datetime(2015, 10, 23, 10, 24, 15), 41994), ('DEBUG', datetime.datetime(201

In [11]:
df = spark.createDataFrame(data,schema = schema)

In [None]:
# check the data sample which we already created
df.show(5)

+---------+-------------------+----------------+
|log_level|          timestamp|total_occurences|
+---------+-------------------+----------------+
|    FATAL|2015-09-06 19:00:48|           36235|
|    ERROR|2015-06-29 11:52:43|           31182|
|    ERROR|2015-03-29 23:43:06|            3662|
|     INFO|2015-10-21 13:32:37|           36606|
|    DEBUG|2015-08-26 07:26:54|           48107|
+---------+-------------------+----------------+
only showing top 5 rows



In [25]:
# Turning timestamp to Date_Format with "Month"
df1 = df.withColumn("month",
              date_format("timestamp", "MMMM"))
df1.show(5)

+---------+-------------------+----------------+---------+
|log_level|          timestamp|total_occurences|    month|
+---------+-------------------+----------------+---------+
|    FATAL|2015-09-06 19:00:48|           36235|September|
|    ERROR|2015-06-29 11:52:43|           31182|     June|
|    ERROR|2015-03-29 23:43:06|            3662|    March|
|     INFO|2015-10-21 13:32:37|           36606|  October|
|    DEBUG|2015-08-26 07:26:54|           48107|   August|
+---------+-------------------+----------------+---------+
only showing top 5 rows



In [27]:
df2 = df1.withColumn("date_of_occurence",to_date(df1['timestamp']))
df2.show(5)

+---------+-------------------+----------------+---------+-----------------+
|log_level|          timestamp|total_occurences|    month|date_of_occurence|
+---------+-------------------+----------------+---------+-----------------+
|    FATAL|2015-09-06 19:00:48|           36235|September|       2015-09-06|
|    ERROR|2015-06-29 11:52:43|           31182|     June|       2015-06-29|
|    ERROR|2015-03-29 23:43:06|            3662|    March|       2015-03-29|
|     INFO|2015-10-21 13:32:37|           36606|  October|       2015-10-21|
|    DEBUG|2015-08-26 07:26:54|           48107|   August|       2015-08-26|
+---------+-------------------+----------------+---------+-----------------+
only showing top 5 rows



In [31]:
df.createOrReplaceTempView("serverlogs")

In [None]:
# Check serverlogs dataset in SQL
spark.sql("""
    select * from serverlogs limit 10
          """).show()

+---------+-------------------+----------------+
|log_level|          timestamp|total_occurences|
+---------+-------------------+----------------+
|    FATAL|2015-09-06 19:00:48|           36235|
|    ERROR|2015-06-29 11:52:43|           31182|
|    ERROR|2015-03-29 23:43:06|            3662|
|     INFO|2015-10-21 13:32:37|           36606|
|    DEBUG|2015-08-26 07:26:54|           48107|
|    FATAL|2015-08-10 01:32:18|           20897|
|    ERROR|2015-03-28 22:20:19|           45155|
|    DEBUG|2015-07-23 21:16:12|           46721|
|    ERROR|2015-08-05 00:47:14|           24229|
|    FATAL|2015-04-25 21:06:30|           16353|
+---------+-------------------+----------------+



In [41]:
# Update data type by SQL query
spark.sql("""
    SELECT log_level,
          date_format(timestamp, 'MMMM') as month_name,
          count(*) as total_count
    FROM serverlogs
    GROUP BY log_level, month_name
    ORDER BY month_name
          """).show()

#Note: if the timestamp is 'StringType': 
# -> date_format(to_date(timestamp, 'yyyy-MM-dd  HH:mm:ssss'), 'MMMM') as month_name 

+---------+----------+-----------+
|log_level|month_name|total_count|
+---------+----------+-----------+
|    FATAL|     April|          2|
|     INFO|     April|          2|
|     WARN|     April|          1|
|    FATAL|    August|          1|
|    ERROR|    August|          3|
|    DEBUG|    August|          3|
|     WARN|    August|          5|
|     INFO|    August|          1|
|    DEBUG|  December|          3|
|     INFO|  December|          4|
|     WARN|  December|          1|
|     INFO|  February|          2|
|    FATAL|  February|          3|
|    ERROR|  February|          2|
|     WARN|   January|          1|
|    DEBUG|   January|          3|
|    FATAL|   January|          1|
|    ERROR|   January|          1|
|    DEBUG|      July|          1|
|     INFO|      July|          3|
+---------+----------+-----------+
only showing top 20 rows



In [44]:
# But the query above didn't return by month( it returns by alphabet)
# Let's make an update!
df_generate = spark.sql("""
    SELECT log_level,
          date_format(timestamp, 'MMMM') as month_name,
          int(date_format(timestamp, 'M')) as month_num,
          count(*) as total_count
    FROM serverlogs
    GROUP BY log_level, month_name, month_num
    ORDER BY month_num
          """)

In [45]:
df_final = df_generate.drop("month_num")

In [48]:
# Clean unnecessary column with month_num
# Because it already make the date column in ascending order
df_final.show(10)

+---------+----------+-----------+
|log_level|month_name|total_count|
+---------+----------+-----------+
|    DEBUG|   January|          3|
|     WARN|   January|          1|
|    ERROR|   January|          1|
|    FATAL|   January|          1|
|     INFO|  February|          2|
|    FATAL|  February|          3|
|    ERROR|  February|          2|
|    ERROR|     March|          5|
|     WARN|     March|          1|
|     INFO|     March|          2|
+---------+----------+-----------+
only showing top 10 rows



In [68]:
# Pivot
df = df.withColumn("total_occurences", col("total_occurences").cast("int"))

df.createOrReplaceTempView("serverlogs")

df_pi = spark.sql("""
    SELECT log_level,
          date_format(timestamp,"MM") as month_name,
            total_occurences
    FROM serverlogs
          """).groupBy("log_level").pivot("month_name").sum("total_occurences").show()

+---------+-----+-----+------+-----+-----+-----+-----+------+-----+------+-----+------+
|log_level|   01|   02|    03|   04|   05|   06|   07|    08|   09|    10|   11|    12|
+---------+-----+-----+------+-----+-----+-----+-----+------+-----+------+-----+------+
|     INFO| NULL|54519| 65186|30002|78348|70776|61512|  6117| NULL|110406| 5460|126554|
|    ERROR|30175|39794|124584| NULL|42648|96768| NULL| 71482|16267| 37207| 6545|  NULL|
|     WARN|33397| NULL|  1714|38548|41386|27087|56070| 89878|46162| 42994| NULL| 30106|
|    DEBUG|90563| NULL| 29532| NULL|23943|26255|46721|128415|54331| 11717|94095| 52883|
|    FATAL|45537|65493|  NULL|63988| NULL|90213|30799| 20897|84812| 55267|46475|  NULL|
+---------+-----+-----+------+-----+-----+-----+-----+------+-----+------+-----+------+



In [69]:
# update the list name
month_list = ["January","February","March","April","May","June","July","August","September","October","November","December"]

In [None]:
# Insert the month_list into pivot name appearance
df = df.withColumn("total_occurences", col("total_occurences").cast("int"))

df.createOrReplaceTempView("serverlogs")

# update the list name
month_list = ["January","February","March","April","May","June","July","August","September","October","November","December"]

df_pi = spark.sql("""
    SELECT log_level,
          date_format(timestamp,"MMMM") as month_name,
            total_occurences
    FROM serverlogs
          """).groupBy("log_level").pivot("month_name",month_list).sum("total_occurences").fillna(0).show()

# Note: using fillna(0) instead of isnull()
# Because ISNULL() only when applying conditional logic (e.g: filtering rows where NULL exists)
# withColumn("January", when(col("January").isNull(),0).otherwise(col("January")))

+---------+-------+--------+------+-----+-----+-----+-----+------+---------+-------+--------+--------+
|log_level|January|February| March|April|  May| June| July|August|September|October|November|December|
+---------+-------+--------+------+-----+-----+-----+-----+------+---------+-------+--------+--------+
|     INFO|      0|   54519| 65186|30002|78348|70776|61512|  6117|        0| 110406|    5460|  126554|
|    ERROR|  30175|   39794|124584|    0|42648|96768|    0| 71482|    16267|  37207|    6545|       0|
|     WARN|  33397|       0|  1714|38548|41386|27087|56070| 89878|    46162|  42994|       0|   30106|
|    DEBUG|  90563|       0| 29532|    0|23943|26255|46721|128415|    54331|  11717|   94095|   52883|
|    FATAL|  45537|   65493|     0|63988|    0|90213|30799| 20897|    84812|  55267|   46475|       0|
+---------+-------+--------+------+-----+-----+-----+-----+------+---------+-------+--------+--------+



In [72]:
spark.sql("""
SELECT * from serverlogs limit 10
          """).show()

+---------+-------------------+----------------+
|log_level|          timestamp|total_occurences|
+---------+-------------------+----------------+
|    FATAL|2015-09-06 19:00:48|           36235|
|    ERROR|2015-06-29 11:52:43|           31182|
|    ERROR|2015-03-29 23:43:06|            3662|
|     INFO|2015-10-21 13:32:37|           36606|
|    DEBUG|2015-08-26 07:26:54|           48107|
|    FATAL|2015-08-10 01:32:18|           20897|
|    ERROR|2015-03-28 22:20:19|           45155|
|    DEBUG|2015-07-23 21:16:12|           46721|
|    ERROR|2015-08-05 00:47:14|           24229|
|    FATAL|2015-04-25 21:06:30|           16353|
+---------+-------------------+----------------+



In [None]:
# Shutdown spark
spark.stop()