In [1]:
# List the shared datasets directory on HDFS to locate the log file used below.
!hadoop fs -ls /public/trendytech/datasets

Found 18 items
-rw-r--r--   3 itv005857 supergroup 2362455918 2023-06-06 09:28 /public/trendytech/datasets/cust_transf.csv
drwxr-xr-x   - itv005857 supergroup          0 2023-05-18 17:40 /public/trendytech/datasets/customer_nested
-rw-r--r--   3 itv005857 supergroup       1319 2023-05-23 13:04 /public/trendytech/datasets/hospital.csv
-rw-r--r--   3 itv005857 supergroup       5697 2023-06-05 02:31 /public/trendytech/datasets/hotel_data.csv
-rw-r--r--   3 itv005857 supergroup        925 2023-05-23 13:05 /public/trendytech/datasets/library_data.json
-rw-r--r--   3 itv005857 supergroup   25487177 2023-06-11 04:35 /public/trendytech/datasets/logdata1m.csv
-rw-r--r--   3 itv005857 supergroup   46079587 2023-06-09 13:05 /public/trendytech/datasets/order_data.csv
drwxr-xr-x   - itv005857 supergroup          0 2023-06-02 02:47 /public/trendytech/datasets/orders
-rw-r--r--   3 itv005857 supergroup    7064041 2023-05-04 07:46 /public/trendytech/datasets/orders.json
-rw-r--r--   3 itv005857 superg

In [2]:
# Peek at the start of the log file: each row is "<level>,<timestamp>" and
# the first line is already data (there is no header row).
!hadoop fs -head /public/trendytech/datasets/logdata1m.csv

INFO,2015-8-8 20:49:22
WARN,2015-1-14 20:05:00
INFO,2017-6-14 00:08:35
INFO,2016-1-18 11:50:14
DEBUG,2017-7-1 12:55:02
INFO,2014-2-26 12:34:21
INFO,2015-7-12 11:13:47
INFO,2017-4-15 01:20:18
DEBUG,2016-11-2 20:19:23
INFO,2012-8-20 10:09:44
DEBUG,2014-4-22 21:30:49
WARN,2013-12-6 17:54:15
DEBUG,2017-1-12 10:47:02
DEBUG,2016-6-25 11:06:42
ERROR,2015-6-28 19:25:05
DEBUG,2012-6-24 01:06:37
INFO,2014-12-9 09:53:54
DEBUG,2015-11-8 19:20:08
INFO,2017-7-21 18:34:18
DEBUG,2014-12-26 06:38:42
DEBUG,2013-1-6 16:56:43
INFO,2015-10-8 11:33:25
INFO,2016-11-18 09:47:31
DEBUG,2015-2-6 16:24:07
WARN,2016-7-26 18:54:43
INFO,2012-10-18 14:35:19
DEBUG,2012-4-26 14:26:50
DEBUG,2013-9-28 20:27:13
INFO,2017-8-20 13:17:27
INFO,2015-4-13 09:28:17
DEBUG,2015-7-17 00:49:27
DEBUG,2014-7-26 02:33:09
INFO,2016-1-13 09:51:57
DEBUG,2015-1-14 08:55:30
DEBUG,2016-1-20 03:47:06
DEBUG,2013-7-8 21:00:50
DEBUG,2012-5-22 11:43:57
DEBUG,2013-3-20 06:14:50
DEBUG,2017-7-13 15:35:11
DEBUG,2013-1-21 20:20:25
DEBU

In [3]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [4]:
# Login name of the current user; used below to namespace the Spark
# application name and the warehouse directory.
user = gp.getuser()
# Bare expression so the notebook displays the resolved user name.
user

'itv005077'

In [5]:
# Create (or reuse) a Hive-enabled Spark session running on YARN. Both the
# application name and the warehouse directory are derived from `user`.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{user}-Week-8-Assignment-4')
    .config('spark.sql.catalogImplementation', 'hive')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Bare expression: display the session object as this cell's output.
spark

In [7]:
# Explicit schema for the two-column log file: the severity level as text and
# the event time as a timestamp.
schema = (
    T.StructType()
    .add('log_level', T.StringType())
    .add('date_time', T.TimestampType())
)

In [8]:
# Load the comma-separated log file from HDFS using the explicit schema
# defined above (no schema inference).
df_logs = spark.read.csv(
    '/public/trendytech/datasets/logdata1m.csv',
    schema=schema,
    sep=',',
)

In [9]:
# Preview the parsed rows to confirm the timestamps were read correctly.
df_logs.show()

+---------+-------------------+
|log_level|          date_time|
+---------+-------------------+
|     INFO|2015-08-08 20:49:22|
|     WARN|2015-01-14 20:05:00|
|     INFO|2017-06-14 00:08:35|
|     INFO|2016-01-18 11:50:14|
|    DEBUG|2017-07-01 12:55:02|
|     INFO|2014-02-26 12:34:21|
|     INFO|2015-07-12 11:13:47|
|     INFO|2017-04-15 01:20:18|
|    DEBUG|2016-11-02 20:19:23|
|     INFO|2012-08-20 10:09:44|
|    DEBUG|2014-04-22 21:30:49|
|     WARN|2013-12-06 17:54:15|
|    DEBUG|2017-01-12 10:47:02|
|    DEBUG|2016-06-25 11:06:42|
|    ERROR|2015-06-28 19:25:05|
|    DEBUG|2012-06-24 01:06:37|
|     INFO|2014-12-09 09:53:54|
|    DEBUG|2015-11-08 19:20:08|
|     INFO|2017-07-21 18:34:18|
|    DEBUG|2014-12-26 06:38:42|
+---------+-------------------+
only showing top 20 rows



In [10]:
# Add a 'month' column holding the full month name (e.g. 'August') taken
# from each row's timestamp.
df_logs = df_logs.withColumn(
    colName='month',
    col=F.date_format(F.col('date_time'), format='MMMM'),
)

In [11]:
# Preview again to check the newly added 'month' column.
df_logs.show()

+---------+-------------------+--------+
|log_level|          date_time|   month|
+---------+-------------------+--------+
|     INFO|2015-08-08 20:49:22|  August|
|     WARN|2015-01-14 20:05:00| January|
|     INFO|2017-06-14 00:08:35|    June|
|     INFO|2016-01-18 11:50:14| January|
|    DEBUG|2017-07-01 12:55:02|    July|
|     INFO|2014-02-26 12:34:21|February|
|     INFO|2015-07-12 11:13:47|    July|
|     INFO|2017-04-15 01:20:18|   April|
|    DEBUG|2016-11-02 20:19:23|November|
|     INFO|2012-08-20 10:09:44|  August|
|    DEBUG|2014-04-22 21:30:49|   April|
|     WARN|2013-12-06 17:54:15|December|
|    DEBUG|2017-01-12 10:47:02| January|
|    DEBUG|2016-06-25 11:06:42|    June|
|    ERROR|2015-06-28 19:25:05|    June|
|    DEBUG|2012-06-24 01:06:37|    June|
|     INFO|2014-12-09 09:53:54|December|
|    DEBUG|2015-11-08 19:20:08|November|
|     INFO|2017-07-21 18:34:18|    July|
|    DEBUG|2014-12-26 06:38:42|December|
+---------+-------------------+--------+
only showing top

## Programmatic Approach

In [14]:
# Count log entries for every (log_level, month) pair. The sort on the
# month *name* is lexical, so months appear alphabetically (April, August, ...)
# rather than in calendar order.
(
    df_logs
    .groupBy('log_level', 'month')
    .agg(F.count('*'))
    .sort('log_level', 'month')
    .show()
)

+---------+---------+--------+
|log_level|    month|count(1)|
+---------+---------+--------+
|    DEBUG|    April|   41869|
|    DEBUG|   August|   42147|
|    DEBUG| December|   41749|
|    DEBUG| February|   41734|
|    DEBUG|  January|   41961|
|    DEBUG|     July|   42085|
|    DEBUG|     June|   41774|
|    DEBUG|    March|   41652|
|    DEBUG|      May|   41785|
|    DEBUG| November|   33366|
|    DEBUG|  October|   41936|
|    DEBUG|September|   41433|
|    ERROR|    April|    4107|
|    ERROR|   August|    3987|
|    ERROR| December|    4106|
|    ERROR| February|    4013|
|    ERROR|  January|    4054|
|    ERROR|     July|    3976|
|    ERROR|     June|    4059|
|    ERROR|    March|    4122|
+---------+---------+--------+
only showing top 20 rows



## SPARK SQL Approach

In [15]:
# Register the DataFrame as the temp view 'loglevels' so the same aggregation
# can be written as a SQL query via spark.sql.
df_logs.createOrReplaceTempView('loglevels')

In [16]:
# SQL version of the per-level, per-month count, run against the
# 'loglevels' temp view.
monthly_counts_query = '''
    SELECT log_level, month, count(*)
    FROM loglevels
    GROUP BY log_level, month
    ORDER BY log_level, month
'''
spark.sql(monthly_counts_query).show()

+---------+---------+--------+
|log_level|    month|count(1)|
+---------+---------+--------+
|    DEBUG|    April|   41869|
|    DEBUG|   August|   42147|
|    DEBUG| December|   41749|
|    DEBUG| February|   41734|
|    DEBUG|  January|   41961|
|    DEBUG|     July|   42085|
|    DEBUG|     June|   41774|
|    DEBUG|    March|   41652|
|    DEBUG|      May|   41785|
|    DEBUG| November|   33366|
|    DEBUG|  October|   41936|
|    DEBUG|September|   41433|
|    ERROR|    April|    4107|
|    ERROR|   August|    3987|
|    ERROR| December|    4106|
|    ERROR| February|    4013|
|    ERROR|  January|    4054|
|    ERROR|     July|    3976|
|    ERROR|     June|    4059|
|    ERROR|    March|    4122|
+---------+---------+--------+
only showing top 20 rows



# PIVOT TABLE

## Programmatic Approach

In [17]:
# Pivot months into columns, one row per log level. No explicit value list is
# passed, so the month columns are taken from the data and come out in
# alphabetical order (April, August, ...), not calendar order.
(
    df_logs
    .groupBy('log_level')
    .pivot('month')
    .agg(F.count('*'))
    .show()
)

+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|log_level|April|August|December|February|January| July| June|March|  May|November|October|September|
+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|     INFO|29302| 28993|   28874|   28983|  29119|29300|29143|29095|28900|   23301|  29018|    29038|
|    ERROR| 4107|  3987|    4106|    4013|   4054| 3976| 4059| 4122| 4086|    3389|   4040|     4161|
|     WARN| 8277|  8381|    8328|    8266|   8217| 8222| 8191| 8165| 8403|    6616|   8226|     8352|
|    FATAL|   83|    80|      94|      72|     94|   98|   78|   70|   60|   16797|     92|       81|
|    DEBUG|41869| 42147|   41749|   41734|  41961|42085|41774|41652|41785|   33366|  41936|    41433|
+---------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



## Optimization Technique

In [18]:
# Month names in calendar order; passed to pivot() to fix the column order.
month_list = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

In [19]:
# Same pivot, but with the pivot values supplied explicitly from the local
# `month_list`, so the month columns appear in calendar order.
# NOTE(review): supplying the values should also spare Spark from first
# computing the distinct months itself — confirm against the pivot() docs.
(
    df_logs
    .groupBy('log_level')
    .pivot('month', values=month_list)
    .agg(F.count('*'))
    .show()
)

+---------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|log_level|January|February|March|April|  May| June| July|August|September|October|November|December|
+---------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|     INFO|  29119|   28983|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|    ERROR|   4054|    4013| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|     WARN|   8217|    8266| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|    FATAL|     94|      72|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|    DEBUG|  41961|   41734|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+---------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+



In [20]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()