In [None]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

In [None]:
/* specify schema for clickstream data */
import org.apache.spark.sql.types._

// Clickstream record layout: eight columns, every one declared non-nullable.
val clickdataSchema = StructType(
  Seq[(String, DataType)](
    "eventId"   -> LongType,
    "eventType" -> StringType,
    "timestamp" -> StringType,
    "ipaddress" -> StringType,
    "sessionId" -> StringType,
    "userId"    -> StringType,
    "pageUrl"   -> StringType,
    "browser"   -> StringType
  ).map { case (fieldName, fieldType) => StructField(fieldName, fieldType, nullable = false) }
)

In [None]:
/** IBM Event Store imports and connection information */
import sys.process._
import scala.concurrent.{Await, Future}
import scala.concurrent.duration.Duration
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import com.ibm.bluspark.catalog.TableSchema
import com.ibm.bluspark.oltp.OLTPContext
import com.ibm.bluspark.example.DataGenerator
import com.ibm.bluspark.common.ConfigurationReader
import com.ibm.bluspark.oltp.InsertResult
import com.ibm.bluspark.example.BluSparkUtil
// Tell the Event Store client which endpoint to connect to.
// NOTE(review): "XX.XX.XX.XX:5555" looks like a masked placeholder — presumably it must be
// replaced with a real host:port before this cell is run; confirm the expected format.
ConfigurationReader.setConnectionEndpoints("XX.XX.XX.XX:5555")

In [None]:
/** Connect to the IBM Event Store */
import java.io.File
import com.ibm.bluspark.oltp.OLTPContext
import org.apache.log4j.{Level, LogManager, Logger}
import org.apache.spark._
import org.apache.spark.sql.bluspark.BluSparkSQLContext

// Build a BluSpark SQL context on top of the notebook's Spark context, naming "CLICKDB".
// NOTE(review): "CLICKDB" is presumably the database name and the trailing 1 is an
// API-specific argument whose meaning is not visible here — confirm against BluSparkSQLContext.
val sqlContext = new BluSparkSQLContext(spark.sparkContext, "CLICKDB", 1)
// Load the table called "ClickStreamTable" through that context.
val table = sqlContext.loadBluTable("ClickStreamTable")

// Register the loaded table under the SQL name "ClickData", then read every column of it
// into clickStreamDF, which the analysis cells below start from.
table.registerTempTable("ClickData")
val clickStreamDF = sqlContext.sql("select * from ClickData")
//clickStreamDF.show(5)

### Analyze Clickstream data

In [None]:
/* Calculate time_on_page */
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Time on page = gap between an event and the NEXT event of the same session.
val timestamp = clickStreamDF("timestamp")
// The schema cell declares timestamp as a string, so cast it once and use the numeric value
// for ordering as well as for the subtraction (raw strings would be ordered lexicographically).
val timestampNumeric = timestamp.cast(LongType)
// Partition by sessionId so lead() never pairs the last click of one session with the first
// click of an unrelated session; an unpartitioned window would also force every row into a
// single Spark partition. The last event of each session gets a null "time".
val sessionWindow = Window.partitionBy(clickStreamDF("sessionId")).orderBy(timestampNumeric)
val next_timestamp = lead(timestamp, 1).over(sessionWindow)
val clickStreamWithTimeDF = clickStreamDF.withColumn("time", next_timestamp.cast(LongType) - timestampNumeric)
clickStreamWithTimeDF.show(5)

In [None]:
/* Derive the page-view date, then aggregate hits and time per page */
// Rebinds sqlContext to a plain Spark SQLContext (shadowing the one created in the connect
// cell) and imports its implicits, which provide the $"column" syntax used further down.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

clickStreamWithTimeDF.registerTempTable("tempData")
// Name the derived column inside the SQL itself. Renaming it afterwards by matching Spark's
// auto-generated expression text is fragile: that text differs between Spark versions, and
// withColumnRenamed silently does nothing when the name does not match, which would leave
// no "date" column for the later year(date)/month(date) queries.
val clickStreamWithDateTimeDF = sqlContext.sql("select eventId, eventType, cast(from_unixtime(timestamp) as date) as `date`, ipaddress,sessionId,userId,pageUrl,browser,time from tempData")

//clickStreamWithDateTimeDF.show(5)

/* build aggregated web metrics from clickstream data*/
// Re-registers the name "ClickData" so it now refers to the frame that carries the extra
// date and time columns; later cells query this name.
clickStreamWithDateTimeDF.registerTempTable("ClickData")
// One row per page URL: number of pageView events and the summed time spent on them.
val clicksDF = sqlContext.sql("select pageURL, count(*) as page_hits, sum(time) as total_time from ClickData where eventType='pageView' group by pageURL")
clicksDF.show(5,false)

In [None]:
/* Split each page URL's query string into product_line / action / product / feature columns */
clicksDF.registerTempTable("WebMetricsData")
val metricsQuery = """select parse_URL(pageURL,'QUERY','product_line') as product_line, 
                        Coalesce(parse_URL(pageURL,'QUERY','action'),'') as action,
                        Coalesce(parse_URL(pageURL,'QUERY','product'),'') as product, 
                        Coalesce(parse_URL(pageURL,'QUERY','feature'),'') as feature, page_hits, total_time from WebMetricsData"""

val webMetricsDF = {
  val parsedMetrics = sqlContext.sql(metricsQuery)
  // URLs without a product_line parameter yield null and are discarded; the rest are
  // ordered by product line, descending.
  parsedMetrics.where(col("product_line").isNotNull).orderBy(desc("product_line"))
}
//webMetricsDF.show(5)

### Aggregated Web Metrics for All Product Lines

In [None]:
/* Roll the per-page metrics up to a single row per product line */
val productlineMetrics = webMetricsDF.
    groupBy("product_line").
    agg(sum("page_hits").as("page_hits"), sum("total_time").as("total_time"))

// Busiest product lines first.
productlineMetrics.orderBy(desc("page_hits")).show(10)


In [None]:
%%brunel data('productlineMetrics') 
bar x(product_line) y(page_hits) tooltip(#all)color(product_line)legends(none) axes(x:'product lines',y:'page views')sort(page_hits)|
stack polar bar  y(total_time) color(product_line)label(product_line) legends(none) tooltip("time on page (sec): ",total_time)sort(page_hits) 
 :: width=1000, height=300

### Aggregated Web Metrics for Smart Phones

In [None]:
/* Aggregate page hits and browse time per smartphone product (detail pages only) */
// Apply the filters while the "action" column is still present. Projecting it away first and
// filtering on it afterwards only works when the analyzer manages to re-add the dropped
// column behind the scenes, which is easy to break and hard to read.
val productMetrics = webMetricsDF.
    filter($"action" === "details" && $"product_line" === "smartphones").
    groupBy("product_line","product").
    agg(sum("page_hits").as("page_hits"), sum("total_time").as("total_time"))
productMetrics.show()

In [None]:
%%brunel data('productMetrics') 
bar x(product) y(page_hits) tooltip(page_hits,product)color(product) legends(none) axes(x:'smart phones',y:'page views')sort(page_hits)|
stack polar bar  y(total_time) color(product)label(product)tooltip("time on page (sec): ",total_time) legends(none)sort(page_hits)
:: width=1000, height=300

### Aggregated Web Metrics on Smart Phone Features 

In [None]:
 /*Visualize aggregated page hits for Features */
// Page hits and browse time per A-phone feature, counting detail pages that name a feature.
// The filters run before any column is dropped, because "action" is needed by the first
// filter and is not part of the grouped output.
val featureMetrics = webMetricsDF.
    filter($"action" === "details" && $"product" === "A-phone").
    filter("feature != ''").
    groupBy("product","feature").
    agg(sum("page_hits").as("page_hits"), sum("total_time").as("total_time"))

featureMetrics.show()

In [None]:
%%brunel data('featureMetrics') 
bar x(feature) y(page_hits) tooltip(feature,page_hits)color(feature) legends(none) axes(x:'A-phone features',y:'page views')sort(page_hits)interaction(select)|
stack polar bar  y(total_time) color(feature)label(feature) tooltip("time on page (sec): ",total_time) legends(none)sort(page_hits)opacity(#selection)
:: width=1000, height=300

### Web Metrics for user 'David'

In [None]:
/* get user web metrics from clickstream data */
// For the single user whose userId is 'datkins': one row per (pageURL, date) with the number
// of pageView events and their summed time, plus the date broken out into year, month,
// week-of-year and day columns. Reads the "ClickData" temp table, so the cell that adds the
// date and time columns must have been run first.
val userClicksQuery ="""select pageURL,year(date) as year,month(date) as month,weekofyear(date) as week,day(date) as day,
                        count(*) as page_hits, sum(time) as total_time from ClickData where eventType='pageView' and
                        userId='datkins' group by pageURL, date"""
val userClicksDF = sqlContext.sql(userClicksQuery)        
userClicksDF.show(5)

In [None]:
/* build user web metrics by product_line, products and feature browses */
userClicksDF.registerTempTable("UserWebMetricsData")
// Rebinds metricsQuery (a val of the same name is defined in an earlier cell) to the per-user
// variant: same URL query-string parsing, but it keeps month/week/day and reads
// UserWebMetricsData. Missing action/product/feature parameters become empty strings.
// The year is hard-coded to 2017 and written as a string literal in the comparison.
val metricsQuery = """select month,week,day, parse_URL(pageURL,'QUERY','product_line') as product_line, 
                        Coalesce(parse_URL(pageURL,'QUERY','action'),'') as action,
                        Coalesce(parse_URL(pageURL,'QUERY','product'),'') as product, 
                        Coalesce(parse_URL(pageURL,'QUERY','feature'),'') as feature, page_hits, total_time from UserWebMetricsData
                        where year = '2017'"""

// Drop rows whose URL has no product_line parameter.
val userWebMetricsDF = sqlContext.sql(metricsQuery).filter($"product_line".isNotNull)
userWebMetricsDF.show(5)

In [None]:
/* visualize metrics for most recent week */
// Grouping on every displayed column and taking max("week") does not by itself discard older
// weeks — it only collapses duplicate rows. To actually restrict the data to the most recent
// week, first keep the rows whose week equals the highest week present, then apply the same
// grouping so the resulting columns (including "max(week)") stay exactly as the chart expects.
val weekMetricsDF = {
  // Single-row frame holding the highest week number; the inner join on "week" acts as the
  // filter and simply yields no rows when the input is empty.
  val latestWeek = userWebMetricsDF.agg(max("week").as("week"))
  userWebMetricsDF.
    join(latestWeek, Seq("week")).
    groupBy("day","product_line","action","product","feature","page_hits","total_time").
    max("week")
}
weekMetricsDF.show(5)

In [None]:
%%brunel data('weekMetricsDF') 
x(day)y(page_hits) stack bar sum (page_hits) color(product_line) tooltip(#all)axes(x:'Day of Month' ,y:'page views') axes(x:7) interaction(select)  |
stack polar bar  y(total_time) color(product_line)label(product) legends(none) tooltip(#all)opacity(#selection)
:: width=1000, height=300


##### Insight summary from Clickstream Analysis

1. Aggregated web metrics from recent months highlight significant interest in smart phones, with A-phones leading the pack. 
2. User 'David' is a repeat visitor who has explored smart phones multiple times in recent days, along with computers and headphones. 


###### sanne0611v4 