In [151]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [153]:
logs_data = [("DEBUG","2014-6-22 21:30:49"),
("WARN","2013-12-6 17:54:15"),
("DEBUG","2017-1-12 10:47:02"),
("DEBUG","2016-6-25 11:06:42"),
("ERROR","2015-6-28 19:25:05"),
("DEBUG","2012-6-24 01:06:37"),
("INFO","2014-12-9 09:53:54"),
("DEBUG","2015-11-8 19:20:08"),
("INFO","2017-12-21 18:34:18")]

In [154]:
log_df = spark.createDataFrame(logs_data).toDF("loglevel","logtime")

In [155]:
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG| 2014-6-22 21:30:49|
|    WARN| 2013-12-6 17:54:15|
|   DEBUG| 2017-1-12 10:47:02|
|   DEBUG| 2016-6-25 11:06:42|
|   ERROR| 2015-6-28 19:25:05|
|   DEBUG| 2012-6-24 01:06:37|
|    INFO| 2014-12-9 09:53:54|
|   DEBUG| 2015-11-8 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [156]:
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [157]:
from pyspark.sql.functions import *

In [158]:
new_logdf = log_df.withColumn("logtime",to_timestamp("logtime"))

In [159]:
new_logdf

loglevel,logtime
DEBUG,2014-06-22 21:30:49
WARN,2013-12-06 17:54:15
DEBUG,2017-01-12 10:47:02
DEBUG,2016-06-25 11:06:42
ERROR,2015-06-28 19:25:05
DEBUG,2012-06-24 01:06:37
INFO,2014-12-09 09:53:54
DEBUG,2015-11-08 19:20:08
INFO,2017-12-21 18:34:18


In [160]:
new_logdf.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG|2014-06-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [161]:
new_logdf.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [162]:
new_logdf.createOrReplaceTempView("serverlogs")


In [163]:
spark.sql("select * from serverlogs")

loglevel,logtime
DEBUG,2014-06-22 21:30:49
WARN,2013-12-06 17:54:15
DEBUG,2017-01-12 10:47:02
DEBUG,2016-06-25 11:06:42
ERROR,2015-06-28 19:25:05
DEBUG,2012-06-24 01:06:37
INFO,2014-12-09 09:53:54
DEBUG,2015-11-08 19:20:08
INFO,2017-12-21 18:34:18


In [164]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|   DEBUG|    June|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|December|
+--------+--------+



In [165]:
spark.sql("select loglevel, date_format(logtime, 'MM') as month from serverlogs").show()




+--------+-----+
|loglevel|month|
+--------+-----+
|   DEBUG|   06|
|    WARN|   12|
|   DEBUG|   01|
|   DEBUG|   06|
|   ERROR|   06|
|   DEBUG|   06|
|    INFO|   12|
|   DEBUG|   11|
|    INFO|   12|
+--------+-----+



In [166]:
spark.sql("select loglevel, date_format(logtime, 'dd') as day from serverlogs").show()
spark.sql("select loglevel, date_format(logtime, 'yyyy') as month from serverlogs").show()

+--------+---+
|loglevel|day|
+--------+---+
|   DEBUG| 22|
|    WARN| 06|
|   DEBUG| 12|
|   DEBUG| 25|
|   ERROR| 28|
|   DEBUG| 24|
|    INFO| 09|
|   DEBUG| 08|
|    INFO| 21|
+--------+---+

+--------+-----+
|loglevel|month|
+--------+-----+
|   DEBUG| 2014|
|    WARN| 2013|
|   DEBUG| 2017|
|   DEBUG| 2016|
|   ERROR| 2015|
|   DEBUG| 2012|
|    INFO| 2014|
|   DEBUG| 2015|
|    INFO| 2017|
+--------+-----+



In [167]:
logschema = "loglevel string, logtime timestamp"

In [168]:
log_df = spark.read \
.format("csv") \
.schema(logschema) \
.load("/public/trendytech/datasets/logdata1m.csv")

In [169]:
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [170]:
log_df.count()

1000000

In [171]:
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [172]:
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [173]:
log_df.createOrReplaceTempView("serverlogs")

In [174]:
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [175]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month, count(*) as total_occurence from serverlogs group by loglevel, month").show()

+--------+---------+---------------+
|loglevel|    month|total_occurence|
+--------+---------+---------------+
|    WARN|     June|           8191|
|    INFO|     June|          29143|
|   ERROR| November|           3389|
|   FATAL|  January|             94|
|    WARN| December|           8328|
|    WARN|    March|           8165|
|   DEBUG|     July|          42085|
|   ERROR|    April|           4107|
|   ERROR|  January|           4054|
|   FATAL|September|             81|
|   FATAL|    April|             83|
|    INFO|September|          29038|
|   FATAL| November|          16797|
|   FATAL|  October|             92|
|    INFO| February|          28983|
|    WARN|    April|           8277|
|   DEBUG| December|          41749|
|   FATAL| December|             94|
|    WARN|      May|           8403|
|   ERROR|     June|           4059|
+--------+---------+---------------+
only showing top 20 rows



In [176]:
spark.sql("""select loglevel, date_format(logtime, 'yyyy') as year, 
count(*) as total_occurence from serverlogs group by loglevel, year""").show()

+--------+----+---------------+
|loglevel|year|total_occurence|
+--------+----+---------------+
|    WARN|2012|          16374|
|   ERROR|2013|           7968|
|   DEBUG|2014|          82386|
|   FATAL|2017|           2924|
|   FATAL|2012|           2925|
|    INFO|2012|          56964|
|   FATAL|2013|           2991|
|   DEBUG|2012|          81914|
|   ERROR|2012|           7860|
|   ERROR|2014|           8095|
|    WARN|2014|          16267|
|   DEBUG|2016|          82581|
|   FATAL|2014|           2920|
|    INFO|2013|          57206|
|    INFO|2017|          56805|
|   ERROR|2015|           8095|
|    INFO|2016|          57254|
|    WARN|2013|          16098|
|    INFO|2015|          57494|
|   DEBUG|2017|          81858|
+--------+----+---------------+
only showing top 20 rows



In [177]:
spark.sql("select loglevel, date_format(logtime, 'dd') as day, count(*) as total_occurence from serverlogs group by loglevel, day").show()

+--------+---+---------------+
|loglevel|day|total_occurence|
+--------+---+---------------+
|   ERROR| 18|           1694|
|   DEBUG| 15|          17587|
|   FATAL| 24|            610|
|    INFO| 15|          12218|
|   FATAL| 19|            615|
|   ERROR| 06|           1747|
|   ERROR| 19|           1694|
|   DEBUG| 14|          17841|
|    INFO| 02|          12477|
|    WARN| 15|           3551|
|   ERROR| 13|           1699|
|   ERROR| 09|           1700|
|   DEBUG| 03|          17587|
|    WARN| 14|           3456|
|   ERROR| 21|           1770|
|   FATAL| 22|            636|
|    INFO| 12|          12372|
|   ERROR| 17|           1753|
|   ERROR| 24|           1685|
|   DEBUG| 01|          17483|
+--------+---+---------------+
only showing top 20 rows



In [178]:
spark.sql("""select loglevel, date_format(logtime, 'MMMM') as month, int(date_format(logtime, 'M')) as Month_number,
count(*) as total_occurence from serverlogs group by loglevel, month, Month_number order by Month_number""").show()

+--------+--------+------------+---------------+
|loglevel|   month|Month_number|total_occurence|
+--------+--------+------------+---------------+
|    WARN| January|           1|           8217|
|   DEBUG| January|           1|          41961|
|   FATAL| January|           1|             94|
|    INFO| January|           1|          29119|
|   ERROR| January|           1|           4054|
|    INFO|February|           2|          28983|
|   DEBUG|February|           2|          41734|
|    WARN|February|           2|           8266|
|   ERROR|February|           2|           4013|
|   FATAL|February|           2|             72|
|    INFO|   March|           3|          29095|
|   FATAL|   March|           3|             70|
|   DEBUG|   March|           3|          41652|
|    WARN|   March|           3|           8165|
|   ERROR|   March|           3|           4122|
|   DEBUG|   April|           4|          41869|
|   FATAL|   April|           4|             83|
|    INFO|   April| 

In [179]:
result_df = spark.sql("select loglevel, date_format(logtime, 'MMMM') as month, first(date_format(logtime, 'MM')) as month_num, count(*) as total_occurences from serverlogs group by loglevel,month order by month_num")

In [180]:
result_df.show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|   DEBUG| January|       01|           41961|
|   FATAL| January|       01|              94|
|    INFO| January|       01|           29119|
|   ERROR| January|       01|            4054|
|    WARN| January|       01|            8217|
|    WARN|February|       02|            8266|
|   ERROR|February|       02|            4013|
|   DEBUG|February|       02|           41734|
|   FATAL|February|       02|              72|
|    INFO|February|       02|           28983|
|   ERROR|   March|       03|            4122|
|    WARN|   March|       03|            8165|
|    INFO|   March|       03|           29095|
|   DEBUG|   March|       03|           41652|
|   FATAL|   March|       03|              70|
|   ERROR|   April|       04|            4107|
|    WARN|   April|       04|            8277|
|   FATAL|   April|       04|              83|
|    INFO|   

In [181]:
final_df = result_df.drop('month_num')

In [182]:
final_df.show()

+--------+--------+----------------+
|loglevel|   month|total_occurences|
+--------+--------+----------------+
|   FATAL| January|              94|
|    INFO| January|           29119|
|   ERROR| January|            4054|
|   DEBUG| January|           41961|
|    WARN| January|            8217|
|   FATAL|February|              72|
|    WARN|February|            8266|
|   ERROR|February|            4013|
|   DEBUG|February|           41734|
|    INFO|February|           28983|
|   FATAL|   March|              70|
|    WARN|   March|            8165|
|   DEBUG|   March|           41652|
|   ERROR|   March|            4122|
|    INFO|   March|           29095|
|   ERROR|   April|            4107|
|    WARN|   April|            8277|
|    INFO|   April|           29302|
|   DEBUG|   April|           41869|
|   FATAL|   April|              83|
+--------+--------+----------------+
only showing top 20 rows



In [183]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [184]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").show()


+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [185]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").groupBy('loglevel').pivot('Month').count().show()

+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|loglevel|April|August|December|February|January| July| June|March|  May|November|October|September|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|    INFO|29302| 28993|   28874|   28983|  29119|29300|29143|29095|28900|   23301|  29018|    29038|
|   ERROR| 4107|  3987|    4106|    4013|   4054| 3976| 4059| 4122| 4086|    3389|   4040|     4161|
|    WARN| 8277|  8381|    8328|    8266|   8217| 8222| 8191| 8165| 8403|    6616|   8226|     8352|
|   FATAL|   83|    80|      94|      72|     94|   98|   78|   70|   60|   16797|     92|       81|
|   DEBUG|41869| 42147|   41749|   41734|  41961|42085|41774|41652|41785|   33366|  41936|    41433|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



In [186]:
spark.sql("select loglevel, date_format(logtime, 'yyyy') as year from serverlogs").groupBy('loglevel').pivot('year').count().show()

+--------+-----+-----+-----+-----+-----+-----+
|loglevel| 2012| 2013| 2014| 2015| 2016| 2017|
+--------+-----+-----+-----+-----+-----+-----+
|    INFO|56964|57206|57343|57494|57254|56805|
|   ERROR| 7860| 7968| 8095| 8095| 8050| 8032|
|    WARN|16374|16098|16267|16155|16426|16324|
|   FATAL| 2925| 2991| 2920| 2974| 2965| 2924|
|   DEBUG|81914|82444|82386|82308|82581|81858|
+--------+-----+-----+-----+-----+-----+-----+



In [187]:
spark.sql("select loglevel, date_format(logtime, 'dd') as day from serverlogs").groupBy('loglevel').pivot('day').count().show()

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|   01|   02|   03|   04|   05|   06|   07|   08|   09|   10|   11|   12|   13|   14|   15|   16|   17|   18|   19|   20|   21|   22|   23|   24|   25|   26|   27|   28|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|12317|12477|12091|12436|12237|12221|12385|12172|12468|12080|12276|12372|12640|12252|12218|12276|12073|12295|12158|12015|12302|12003|12105|12431|12286|12148|12171|12161|
|   ERROR| 1695| 1737| 1771| 1771| 1763| 1747| 1735| 1648| 1700| 1668| 1660| 1712| 1699| 1783| 1750| 1777| 1753| 1694| 1694| 1636| 1770| 1718| 1701| 1685| 1777| 1689| 1664| 1703|
|    WARN| 3382| 3511| 3412| 3528| 3541| 3460| 3584| 3541| 3412| 3464| 3415| 3424| 3546| 3456| 3551| 3478

In [150]:
spark.stop()