# Spark EDA

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .appName('Spark EDA') \
                    .getOrCreate()

In [2]:
data_path = '/home/lorenzo/spark-repo/1_spark_dataframes/data/california.csv'

df1 = spark.read.option('header', 'True') \
                .option('inferSchema', 'True') \
                .csv(data_path)

df1 = df1.withColumnRenamed('latitude', 'lat') \
         .withColumnRenamed('longitude', 'lng') \
         .withColumnRenamed('total_rooms', 'rooms') \
         .withColumnRenamed('total_bedrooms', 'bedrooms') \
         .withColumnRenamed('median_income', 'income') \
         .withColumnRenamed('median_house_value', 'value') \
         .withColumnRenamed('housing_median_age', 'age') 

df1.createOrReplaceTempView('california')

### Descriptive statistics

**Describe() method:**

In [3]:
df1.select('rooms', 'income', 'age', 'value').describe().show()

+-------+------------------+------------------+------------------+------------------+
|summary|             rooms|            income|               age|             value|
+-------+------------------+------------------+------------------+------------------+
|  count|             20640|             20640|             20640|             20640|
|   mean|2635.7630813953488|3.8706710029070246|28.639486434108527|206855.81690891474|
| stddev|2181.6152515827944| 1.899821717945263| 12.58555761211163|115395.61587441359|
|    min|               2.0|            0.4999|               1.0|           14999.0|
|    max|           39320.0|           15.0001|              52.0|          500001.0|
+-------+------------------+------------------+------------------+------------------+



**Correlations:**

In [4]:
df1.stat.corr('rooms', 'value')

0.13415311380656275

In [5]:
df1 = df1.withColumn('rooms_per_person', df1.rooms/df1.population)

In [6]:
df1.stat.corr('rooms_per_person', 'value')

0.209481969006692

**Frequent values:**

In [7]:
df1.stat.freqItems(['age']).show()

+--------------------+
|       age_freqItems|
+--------------------+
|[23.0, 32.0, 41.0...|
+--------------------+



In [8]:
df1.stat.freqItems(['age', 'value']).show()

+--------------------+--------------------+
|       age_freqItems|     value_freqItems|
+--------------------+--------------------+
|[23.0, 32.0, 41.0...|[55000.0, 68700.0...|
+--------------------+--------------------+



**Crosstable:**

In [9]:
df1.filter(df1.age <= 10).stat.crosstab('ocean_proximity', 'age').show()

+-------------------+---+----+---+---+---+---+---+---+---+---+
|ocean_proximity_age|1.0|10.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|
+-------------------+---+----+---+---+---+---+---+---+---+---+
|             INLAND|  4| 151| 33| 33| 77|122| 80|104|112|108|
|           NEAR BAY|  0|  19|  4|  4|  9| 17|  7|  4|  7|  4|
|         NEAR OCEAN|  0|  16|  2|  2| 26| 20| 11| 18| 18| 23|
|          <1H OCEAN|  0|  78| 19| 23| 79| 85| 62| 49| 69| 70|
+-------------------+---+----+---+---+---+---+---+---+---+---+



**Bucketing:**

In [10]:
spark.sql("SELECT count(*), FLOOR(age*10) as age_bucket \
           FROM california GROUP BY age_bucket ORDER BY age_bucket").show()

+--------+----------+
|count(1)|age_bucket|
+--------+----------+
|       4|        10|
|      58|        20|
|      62|        30|
|     191|        40|
|     244|        50|
|     160|        60|
|     175|        70|
|     206|        80|
|     205|        90|
|     264|       100|
|     254|       110|
|     238|       120|
|     302|       130|
|     412|       140|
|     512|       150|
|     771|       160|
|     698|       170|
|     570|       180|
|     502|       190|
|     465|       200|
+--------+----------+
only showing top 20 rows



In [11]:
spark.sql("SELECT count(*), round(mean(value), 2), FLOOR(age*10) as age_bucket \
           FROM california GROUP BY age_bucket ORDER BY age_bucket").show()

+--------+--------------------+----------+
|count(1)|round(avg(value), 2)|age_bucket|
+--------+--------------------+----------+
|       4|            144300.0|        10|
|      58|           224475.91|        20|
|      62|           235643.58|        30|
|     191|           229235.14|        40|
|     244|           208417.66|        50|
|     160|           203794.39|        60|
|     175|           193296.03|        70|
|     206|           194414.58|        80|
|     205|            186672.7|        90|
|     264|            176580.7|       100|
|     254|            179907.9|       110|
|     238|           182046.24|       120|
|     302|           190181.81|       130|
|     412|            189597.1|       140|
|     512|           181808.42|       150|
|     771|           200180.56|       160|
|     698|           190494.29|       170|
|     570|           193354.77|       180|
|     502|           193197.84|       190|
|     465|           195659.82|       200|
+--------+-

### Time-series data

In [12]:
data_path = '/home/lorenzo/Desktop/utilization.csv'

df2 = spark.read.option('header', 'False') \
                .option('inferSchema', 'True') \
                .csv(data_path)

In [13]:
df2 = df2.withColumnRenamed("_c0", "event_datetime") \
            .withColumnRenamed ("_c1", "server_id") \
            .withColumnRenamed("_c2", "cpu_utilization") \
            .withColumnRenamed("_c3", "free_memory") \
            .withColumnRenamed("_c4", "session_count")

df2.createOrReplaceTempView('utilization')

In [14]:
df2.show(5)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
+-------------------+---------+---------------+-----------+-------------+
only showing top 5 rows



In [15]:
spark.sql("SELECT server_id, min(cpu_utilization), max(cpu_utilization) \
           FROM utilization \
           GROUP BY server_id").show(5)

+---------+--------------------+--------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|
+---------+--------------------+--------------------+
|      148|                0.54|                0.94|
|      137|                0.54|                0.94|
|      133|                0.55|                0.95|
|      108|                0.55|                0.95|
|      101|                 0.6|                 1.0|
+---------+--------------------+--------------------+
only showing top 5 rows



**Windowing:**

In [16]:
spark.sql("SELECT event_datetime, server_id, cpu_utilization, \
                  avg(cpu_utilization) OVER (PARTITION BY server_id) as avg_cpu_util \
           FROM utilization").show(10)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|      avg_cpu_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:07:41|      148|           0.85|0.7393840000000045|
|03/05/2019 08:12:41|      148|           0.94|0.7393840000000045|
|03/05/2019 08:17:41|      148|           0.89|0.7393840000000045|
|03/05/2019 08:22:41|      148|           0.74|0.7393840000000045|
|03/05/2019 08:27:41|      148|           0.63|0.7393840000000045|
|03/05/2019 08:32:41|      148|           0.89|0.7393840000000045|
|03/05/2019 08:37:41|      148|           0.77|0.7393840000000045|
|03/05/2019 08:42:41|      148|           0.59|0.7393840000000045|
|03/05/2019 08:47:41|      148|           0.77|0.7393840000000045|
|03/05/2019 08:52:41|      148|           0.71|0.7393840000000045|
+-------------------+---------+---------------+------------------+
only showing top 10 rows



In [17]:
spark.sql("SELECT event_datetime, server_id, cpu_utilization, \
                  avg(cpu_utilization) OVER (PARTITION BY server_id) as avg_cpu_util, \
                  cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id) as delta_cpu_util\
           FROM utilization").show(10)

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|      avg_cpu_util|      delta_cpu_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:07:41|      148|           0.85|0.7393840000000045|  0.1106159999999955|
|03/05/2019 08:12:41|      148|           0.94|0.7393840000000045| 0.20061599999999546|
|03/05/2019 08:17:41|      148|           0.89|0.7393840000000045| 0.15061599999999553|
|03/05/2019 08:22:41|      148|           0.74|0.7393840000000045| 6.15999999995509E-4|
|03/05/2019 08:27:41|      148|           0.63|0.7393840000000045|-0.10938400000000448|
|03/05/2019 08:32:41|      148|           0.89|0.7393840000000045| 0.15061599999999553|
|03/05/2019 08:37:41|      148|           0.77|0.7393840000000045|0.030615999999995536|
|03/05/2019 08:42:41|      148|           0.59|0.7393840000000045| -0.1493840000000045|
|03/05/2019 08:47:41|      148| 

**Moving window:**

In [18]:
spark.sql("SELECT event_datetime, server_id, cpu_utilization, \
                  avg(cpu_utilization) OVER (PARTITION BY server_id ORDER BY event_datetime  \
                        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as avg_cpu_util \
           FROM utilization").show(10)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|      avg_cpu_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:07:41|      148|           0.85|             0.895|
|03/05/2019 08:12:41|      148|           0.94|0.8933333333333334|
|03/05/2019 08:17:41|      148|           0.89|0.8566666666666668|
|03/05/2019 08:22:41|      148|           0.74|0.7533333333333333|
|03/05/2019 08:27:41|      148|           0.63|0.7533333333333334|
|03/05/2019 08:32:41|      148|           0.89|0.7633333333333333|
|03/05/2019 08:37:41|      148|           0.77|              0.75|
|03/05/2019 08:42:41|      148|           0.59|              0.71|
|03/05/2019 08:47:41|      148|           0.77|              0.69|
|03/05/2019 08:52:41|      148|           0.71|0.7766666666666667|
+-------------------+---------+---------------+------------------+
only showing top 10 rows

