Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1 lines (1 sloc) 14.3 KB
{"paragraphs":[{"title":"Load CSV Library","text":"%dep\nz.load(\"com.databricks:spark-csv_2.10:1.3.0\")\n","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772530_-1001461007","id":"20160504-073612_1315230595","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:34:04 AM","dateFinished":"May 10, 2016 9:34:09 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:54","errorMessage":"","focus":true},{"title":"Load training dataset","text":"val df = sqlContext.read\n .format(\"com.databricks.spark.csv\")\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"false\")\n .load(\"hdfs://10.8.1.116/data/avazu_ctr/train\")\n\ndf.cache()\n","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_1663282863","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:34:06 AM","dateFinished":"May 10, 2016 9:34:17 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:55","errorMessage":"","focus":true},{"title":"View sample data","text":"df.show(10)\n","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_808733579","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:34:10 AM","dateFinished":"May 10, 2016 
9:34:36 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:56","errorMessage":"","focus":true},{"title":"Dataset size","text":"val totalCount = df.count()","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_1302517922","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:34:17 AM","dateFinished":"May 10, 2016 9:35:11 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:57","errorMessage":"","focus":true},{"title":"Calculate CTR","text":"val clicks = df.filter(\"click = 1\").count()\nval ctr = clicks.toFloat / totalCount \n","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","lineNumbers":false,"title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_47433414","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:34:37 AM","dateFinished":"May 10, 2016 9:35:13 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:58","errorMessage":"","focus":true},{"title":"Register SQL table","text":"df.registerTempTable(\"training\")","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_2096842469","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:12 AM","dateFinished":"May 10, 2016 9:35:13 
AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:59","errorMessage":"","focus":true},{"title":"Explore device_conn_type ","text":"%sql\nSELECT device_conn_type, SUM(click) as clicks_num, COUNT(click) as impression, SUM(click)/COUNT(click) as ctr\nFROM training\nGROUP BY device_conn_type \n","dateUpdated":"May 10, 2016 9:34:04 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[{"name":"device_conn_type","index":0,"aggr":"sum"}],"values":[{"name":"impression","index":2,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"device_conn_type","index":0,"aggr":"sum"}}},"enabled":true,"editorMode":"ace/mode/sql","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_1853960930","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:13 AM","dateFinished":"May 10, 2016 9:35:15 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:60","errorMessage":"","focus":true},{"title":"Features cardinality ","text":"df.columns.map(c => (c, df.select(c).distinct().count()))\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[{"name":"click_uniq","index":0,"aggr":"sum"}],"values":[],"groups":[],"scatter":{"xAxis":{"name":"click_uniq","index":0,"aggr":"sum"}}},"enabled":true,"editorMode":"ace/mode/sql","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_901615704","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:13 AM","dateFinished":"May 10, 2016 9:35:39 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:61","errorMessage":"","focus":true},{"title":"Exploring C15 and C16 (ad size)","text":"%sql\nSELECT C15, C16, COUNT(click) as impression, SUM(click)/COUNT(click) as ctr\nFROM training\nGROUP BY C15, C16 \nORDER BY ctr 
DESC\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[{"name":"C15","index":0,"aggr":"sum"}],"values":[{"name":"C16","index":1,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"C15","index":0,"aggr":"sum"},"yAxis":{"name":"C16","index":1,"aggr":"sum"}}},"enabled":true,"editorMode":"ace/mode/sql","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772532_-1003769500","id":"20160504-073612_700745183","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:15 AM","dateFinished":"May 10, 2016 9:35:40 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:62","errorMessage":"","focus":true},{"title":"Date parse function","text":" import java.text.SimpleDateFormat\r\n import java.util.{Calendar, Date}\r\n import org.apache.spark.sql.DataFrame\r\n \r\n object DateUtils {\r\n val dateFormat = new ThreadLocal[SimpleDateFormat]() {\r\n override def initialValue(): SimpleDateFormat = new SimpleDateFormat(\"yyMMddHH\")\r\n }\r\n\r\n def parse(s: String, field: Int): Int = {\r\n val date = dateFormat.get().parse(s)\r\n val cal = Calendar.getInstance()\r\n cal.setTime(date)\r\n cal.get(field)\r\n }\r\n }\r\n \r\n \r\n def transformHour(df: DataFrame): DataFrame = {\r\n val toYear = udf[Int, String](s => DateUtils.parse(s, Calendar.YEAR))\r\n val toMonth = udf[Int, String](s => DateUtils.parse(s, Calendar.MONTH))\r\n val toDay = udf[Int, String](s => DateUtils.parse(s, Calendar.DAY_OF_MONTH))\r\n val toHour = udf[Int, String](s => DateUtils.parse(s, Calendar.HOUR_OF_DAY))\r\n\r\n df.withColumn(\"time_year\", toYear(df(\"hour\")))\r\n .withColumn(\"time_month\", toMonth(df(\"hour\")))\r\n .withColumn(\"time_day\", toDay(df(\"hour\")))\r\n .withColumn(\"time_hour\", toHour(df(\"hour\")))\r\n .drop(\"hour\")\r\n }\r\n \r\n \r\n ","dateUpdated":"May 10, 2016 9:34:05 
AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","lineNumbers":false,"title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772533_-1004154249","id":"20160504-073612_1382592175","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:39 AM","dateFinished":"May 10, 2016 9:35:41 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:63","errorMessage":"","focus":true},{"title":"Transform date","text":"val hourDecoded = transformHour(df)\nhourDecoded.cache()\nhourDecoded.show(10)\n\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772533_-1004154249","id":"20160504-073612_671191709","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:41 AM","dateFinished":"May 10, 2016 9:36:00 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:64","errorMessage":"","focus":true},{"title":"Month and Year unique values","text":"hourDecoded.select(\"time_month\").distinct.count()\nhourDecoded.select(\"time_year\").distinct.count()\n\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772533_-1004154249","id":"20160504-073612_1525194571","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:35:41 AM","dateFinished":"May 10, 2016 9:36:34 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:65","errorMessage":"","focus":true},{"title":"Remove Month 
and Year columns","text":"val hourDecoded2 = hourDecoded.drop(\"time_month\").drop(\"time_year\")\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772533_-1004154249","id":"20160504-073612_2128182257","dateCreated":"May 4, 2016 7:36:12 AM","dateStarted":"May 10, 2016 9:36:00 AM","dateFinished":"May 10, 2016 9:36:34 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:66","errorMessage":"","focus":true},{"title":"Cast Click column type","text":"import org.apache.spark.sql.types.DoubleType\r\n\r\nval prepared = hourDecoded2\r\n .withColumn(\"clickTmp\", hourDecoded2(\"click\").cast(DoubleType))\r\n .drop(\"click\")\r\n .withColumnRenamed(\"clickTmp\", \"click\") \r\n ","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462530981184_1841774640","id":"20160506-063621_105850541","dateCreated":"May 6, 2016 6:36:21 AM","dateStarted":"May 10, 2016 9:36:34 AM","dateFinished":"May 10, 2016 9:36:35 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:67","errorMessage":"","focus":true},{"title":"Count rows for Day 21","text":"prepared.filter(\"time_day = 21\").count()\n\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462361772533_-1004154249","id":"20160504-073612_1364486404","dateCreated":"May 4, 2016 7:36:12 
AM","dateStarted":"May 10, 2016 9:36:35 AM","dateFinished":"May 10, 2016 9:36:35 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:68","errorMessage":"","focus":true},{"title":"Save Day 21 to Data Grid","text":"import org.apache.spark.sql.insightedge._\r\nimport org.apache.spark.sql.SaveMode\r\n\r\nprepared.filter(\"time_day = 21\").write.mode(SaveMode.Overwrite).grid.save(\"day_21\")\r\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462374304307_-571137034","id":"20160504-110504_40474412","dateCreated":"May 4, 2016 11:05:04 AM","dateStarted":"May 10, 2016 9:36:35 AM","dateFinished":"May 10, 2016 9:37:01 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:69","errorMessage":"","focus":true},{"title":"Load, transform and Save test dataset","text":"val testDf = sqlContext.read\n .format(\"com.databricks.spark.csv\")\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"false\")\n .load(\"hdfs://10.8.1.116/data/avazu_ctr/test\")\n\n\ntransformHour(testDf)\n .drop(\"time_month\")\n .drop(\"time_year\")\n .write.mode(SaveMode.Overwrite).grid.save(\"test\")\n","dateUpdated":"May 10, 2016 9:34:05 AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala","title":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462376172992_-2004460792","id":"20160504-113612_2045638973","dateCreated":"May 4, 2016 11:36:12 AM","dateStarted":"May 10, 2016 9:36:36 AM","dateFinished":"May 10, 2016 9:37:42 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:70","errorMessage":"","focus":true},{"dateUpdated":"May 10, 2016 9:34:05 
AM","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1462885741412_394305679","id":"20160510-090901_495144424","result":{"code":"SUCCESS","type":"TEXT"},"dateCreated":"May 10, 2016 9:09:01 AM","dateStarted":"May 10, 2016 9:37:02 AM","dateFinished":"May 10, 2016 9:37:42 AM","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:71"}],"name":"CTR demo","id":"2BH16UXNM","angularObjects":{"2BH6DZEHD":[],"2BKT5CBBS":[]},"config":{"looknfeel":"default"},"info":{}}