# Spark ML

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import LinearRegression

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
# with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Location of the utilization CSV. Set the UTILIZATION_CSV environment variable
# to point at your own copy; the fallback is the original author's local path,
# which will not exist on other machines.
data_path = os.environ.get('UTILIZATION_CSV', '/home/lorenzo/Desktop/utilization.csv')

# The file has no header row, so the columns come back with Spark's default
# names (_c0, _c1, ...). inferSchema lets Spark derive column types from the
# data instead of reading every column as a string.
df = spark.read.csv(data_path, header=False, inferSchema=True)

In [4]:
# The CSV was read without a header, so give Spark's positional column names
# (_c0 .. _c4) meaningful ones.
column_names = {
    "_c0": "event_datetime",
    "_c1": "server_id",
    "_c2": "cpu_utilization",
    "_c3": "free_memory",
    "_c4": "session_count",
}
for old_name, new_name in column_names.items():
    df = df.withColumnRenamed(old_name, new_name)

# Register the frame as a temporary view named 'utilization' for Spark SQL.
df.createOrReplaceTempView('utilization')

### Vectorize data

In [5]:
# Pack the three numeric metrics into a single vector column called 'features'.
cluster_feature_cols = ['cpu_utilization', 'free_memory', 'session_count']
va = VectorAssembler(inputCols=cluster_feature_cols, outputCol='features')

In [6]:
# Apply the assembler: returns df with the extra 'features' vector column.
vcluster_df = va.transform(df)

In [7]:
# Preview the first five rows to confirm the 'features' column was added.
vcluster_df.show(5)

+-------------------+---------+---------------+-----------+-------------+----------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|        features|
+-------------------+---------+---------------+-----------+-------------+----------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|[0.57,0.51,47.0]|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|[0.47,0.62,43.0]|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|[0.56,0.57,62.0]|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|[0.57,0.56,50.0]|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|[0.35,0.46,43.0]|
+-------------------+---------+---------------+-----------+-------------+----------------+
only showing top 5 rows



### K-Means clustering
The Spark ML implementation of K-Means expects the dataset passed to `fit` to contain a **features** column. This column should be a vector column, such as the output of a `VectorAssembler` transformation.

In [8]:
# K-Means with three clusters; the fixed seed keeps centroid initialisation
# repeatable between runs.
# NOTE(review): the features are not scaled, and session_count is on a much
# larger scale than the two ratio columns, so it probably dominates the
# distance metric - confirm whether that is intended.
km = KMeans(k=3, seed=1)

In [9]:
# Fit K-Means on the vectorized data; km_output is the trained model.
km_output = km.fit(vcluster_df)

In [12]:
# One center per cluster. Components follow the assembler's inputCols order:
# cpu_utilization, free_memory, session_count.
km_output.clusterCenters()

[array([ 0.52047775,  0.47836303, 51.79927162]),
 array([ 0.71931575,  0.28104316, 88.23965784]),
 array([ 0.62881549,  0.37094643, 70.43030159])]

### Linear Regression

In [13]:
# Use a dedicated assembler for the regression rather than rebinding `va`.
# `va` keeps pointing at the three-column clustering assembler, so re-running
# the clustering cells later cannot silently pick up this two-column one.
# session_count is left out of the features because it is the regression label.
reg_va = VectorAssembler(inputCols=['cpu_utilization', 'free_memory'],
                         outputCol='features')
reg_df = reg_va.transform(df)
reg_df.show(5)

+-------------------+---------+---------------+-----------+-------------+-----------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|   features|
+-------------------+---------+---------------+-----------+-------------+-----------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|[0.57,0.51]|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|[0.47,0.62]|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|[0.56,0.57]|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|[0.57,0.56]|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|[0.35,0.46]|
+-------------------+---------+---------------+-----------+-------------+-----------+
only showing top 5 rows



In [14]:
# Linear regression with default hyperparameters: predict session_count from
# the assembled 'features' vector (cpu_utilization, free_memory).
lr = (
    LinearRegression()
    .setFeaturesCol('features')
    .setLabelCol('session_count')
)
lr_output = lr.fit(reg_df)

In [15]:
# Fitted weights, in the order of the assembled features:
# cpu_utilization, free_memory.
lr_output.coefficients

DenseVector([32.0832, -31.8455])

In [16]:
# Fitted intercept (bias) term of the regression.
lr_output.intercept

61.761499518865996

In [17]:
# R^2 from the model's training summary, i.e. measured on the same data the
# model was fit on (reg_df), not on a held-out set.
lr_output.summary.r2

0.3424214623841483

In [18]:
# Root mean squared error from the training summary (same data as the fit),
# expressed in session_count units.
lr_output.summary.rootMeanSquaredError

12.04258233312084