In [1]:
import findspark
findspark.init()

In [2]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

In [3]:
# get the active SparkSession, or create one with default settings if none
# exists yet; `spark` is the entry point for the read and SQL calls below
spark = SparkSession.builder.getOrCreate()

In [6]:
# define data path: directory that holds the input file; it is used as the
# prefix of the file path built below (relative path, not absolute)
data_path = "data"

DataFrame API

In [8]:
# load the utilization data (JSON) into a DataFrame; no schema is passed to
# the reader here
# NOTE(review): the file name below is spelled "utlization.json" (missing an
# "i"); presumably this matches the file on disk -- confirm before "fixing" it
file_path = data_path + "/utlization.json"
df1 = spark.read.format("json").load(file_path)

In [9]:
# show df1: print the first rows as a text table for a quick visual check
# of the columns and values
df1.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [11]:
# get summary statistics about DataFrame: count, mean, stddev, min and max
# for every column; event_datetime has no mean/stddev (shown as null) and its
# min/max look like string comparisons rather than date order -- it is
# presumably loaded as a string column
df1.describe().show()

+-------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|        free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|             500000|             500000|             500000|            500000|            500000|
|   mean| 0.6205177399999957|               null|0.37912809999999864|             124.5|          69.59616|
| stddev|0.15875173872912945|               null|0.15830931278376276|14.430884120553191|14.850676696352851|
|    min|               0.22|03/05/2019 08:06:14|                0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|               0.78|               149|               105|
+-------+-------------------+-------------------+-------------------+------------------+------------------+



In [12]:
# get the correlation coefficient between CPU utilization and free memory
# (DataFrame.stat.corr computes the Pearson coefficient); returns one float
df1.stat.corr("cpu_utilization", "free_memory")

-0.4704771573080754

In [13]:
# get the correlation coefficient between session count and free memory
# (DataFrame.stat.corr computes the Pearson coefficient); returns one float
df1.stat.corr("session_count", "free_memory")

-0.5008320848876573

In [15]:
# get frequently occurring values of specific columns; the result is a single
# row with one array column per input column
# NOTE: freqItems is an approximate algorithm -- per the Spark documentation
# the returned items may include false positives
df1.stat.freqItems(("server_id", "session_count")).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[137, 146, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [17]:
# extract a random sample of the data, without replacement
# - fraction=0.05 is a per-row sampling probability, so the sample holds
#   roughly (not exactly) 5% of the rows
# - a fixed seed makes the sample repeatable across notebook re-runs (for the
#   same input data and partitioning); without it every run shows other rows
df1_sample = df1.sample(withReplacement=False, fraction=0.05, seed=42)
df1_sample.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.29|03/05/2019 09:16:14|        0.4|      100|           47|
|            0.4|03/05/2019 12:06:14|       0.36|      100|           53|
|           0.29|03/05/2019 13:36:14|       0.63|      100|           67|
|           0.46|03/05/2019 17:16:14|       0.69|      100|           41|
|           0.49|03/05/2019 17:41:14|        0.6|      100|           45|
|            0.4|03/05/2019 17:51:14|       0.51|      100|           53|
|           0.39|03/05/2019 19:26:14|       0.47|      100|           67|
|           0.47|03/05/2019 19:36:14|       0.72|      100|           42|
|           0.53|03/05/2019 20:26:14|       0.41|      100|           46|
|           0.56|03/05/2019 20:31:14|        0.7|      100|           61|
|           0.55|03/05/2019 22:11:14| 

Spark SQL API

In [18]:
# create temporary view "utilization" from df1 so the data can be queried by
# name through spark.sql(); an existing view with the same name is replaced,
# which keeps this cell safe to re-run
df1.createOrReplaceTempView("utilization")

In [20]:
# get summary statistics for a specific column: minimum, maximum and standard
# deviation of cpu_utilization over all rows of the "utilization" view
# (a single-row result)
spark.sql("SELECT MIN(cpu_utilization), MAX(cpu_utilization), STDDEV(cpu_utilization) FROM utilization").show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|    0.15875173872912945|
+--------------------+--------------------+-----------------------+



In [21]:
# get summary statistics for a specific column, aggregated per server:
# minimum, maximum and standard deviation of cpu_utilization for each
# server_id (one result row per server)
# The query is a triple-quoted string rather than backslash continuations
# inside the literal: a backslash followed by stray whitespace is a syntax
# error, and the SQL reads the same either way.
spark.sql("""
    SELECT server_id, MIN(cpu_utilization), MAX(cpu_utilization), STDDEV(cpu_utilization)
    FROM utilization
    GROUP BY server_id
""").show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      103|                0.56|                0.96|    0.11617507884178278|
|      104|                0.51|                0.91|    0.11521679513850511|
|      106|                0.22|                0.62|    0.11531539914568226|
|      100|                0.27|                0.67|     0.1152264191787964|
|      105|                0.29|                0.69|    0.11510721467869486|
|      101|                 0.6|                 1.0|    0.11651726263197697|
|      102|                0.56|                0.96|    0.11549678751286807|
|      112|                0.52|                0.92|    0.11528867845082576|
|      113|                0.58|                0.98|    0.11544345150353687|
|      110|                0.35|                0.75|    0.11533

In [23]:
# bucketing data from a specific column: count rows per CPU-utilization band
# FLOOR(cpu_utilization*100/10) turns the 0..1 utilization into an integer
# bucket k that covers [k/10, (k+1)/10); only a utilization of exactly 1.0
# lands in bucket 10. Rows are counted per bucket and listed in bucket order.
# The query is a triple-quoted string rather than backslash continuations
# inside the literal: a backslash followed by stray whitespace is a syntax
# error, and the SQL reads the same either way.
spark.sql("""
    SELECT FLOOR((cpu_utilization*100/10)) bucket, COUNT(*)
    FROM utilization
    GROUP BY bucket
    ORDER BY bucket
""").show()

+------+--------+
|bucket|count(1)|
+------+--------+
|     2|    8186|
|     3|   37029|
|     4|   68046|
|     5|  104910|
|     6|  116725|
|     7|   88242|
|     8|   56598|
|     9|   20207|
|    10|      57|
+------+--------+

