- analyze a Google Merchandise Store (also known as GStore, where Google swag is sold) customer dataset to predict revenue per customer. 
- we're predicting the natural log of the total revenue per unique user

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
import pandas as pd
import numpy as np
import json

In [4]:
# File location and type
file_location = "/FileStore/tables/train.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
train = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option('quote', '"') \
  .option('escape', '"') \
  .load(file_location)

display(train)

channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western Asia"", ""country"": ""Turkey"", ""region"": ""Izmir"", ""metro"": ""(not set)"", ""city"": ""Izmir"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""ttnet.com.tr"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",1131660440785968503_1472830385,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472830385,1,1472830385
Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Australasia"", ""country"": ""Australia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""dodo.net.au"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",377306020877927890_1472880147,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472880147,1,1472880147
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",3895546263509774583,"{""continent"": ""Europe"", ""subContinent"": ""Southern Europe"", ""country"": ""Spain"", ""region"": ""Community of Madrid"", ""metro"": ""(not set)"", ""city"": ""Madrid"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",3895546263509774583_1472865386,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472865386,1,1472865386
Organic Search,20160902,"{""browser"": ""UC Browser"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Linux"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",4763447161404445595,"{""continent"": ""Asia"", ""subContinent"": ""Southeast Asia"", ""country"": ""Indonesia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",4763447161404445595_1472881213,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""google + online"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472881213,1,1472881213
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Android"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": true, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""mobile""}",27294437909732085,"{""continent"": ""Europe"", ""subContinent"": ""Northern Europe"", ""country"": ""United Kingdom"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",27294437909732085_1472822600,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}, ""isTrueDirect"": true}",1472822600,2,1472822600
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",2938943183656635653,"{""continent"": ""Europe"", ""subContinent"": ""Southern Europe"", ""country"": ""Italy"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""fastwebnet.it"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",2938943183656635653_1472807194,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472807194,1,1472807194
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",1905672039242460897,"{""continent"": ""Asia"", ""subContinent"": ""Southern Asia"", ""country"": ""Pakistan"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",1905672039242460897_1472817241,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472817241,1,1472817241
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",537222803633850821,"{""continent"": ""Oceania"", ""subContinent"": ""Australasia"", ""country"": ""Australia"", ""region"": ""Queensland"", ""metro"": ""(not set)"", ""city"": ""Brisbane"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""bigpond.net.au"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",537222803633850821_1472812602,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472812602,1,1472812602
Organic Search,20160902,"{""browser"": ""Internet Explorer"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",4445454811831400414,"{""continent"": ""Europe"", ""subContinent"": ""Western Europe"", ""country"": ""Austria"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""spar.at"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",4445454811831400414_1472805784,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472805784,1,1472805784
Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",9499785259412240342,"{""continent"": ""Europe"", ""subContinent"": ""Western Europe"", ""country"": ""Netherlands"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""chello.nl"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",9499785259412240342_1472812272,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472812272,1,1472812272


In [5]:
# get the length of all json columns 
def udf_calculate_json_len(col):
  j_col = json.loads(col)
  return len(j_col)
udf_calculate_json_len = udf(udf_calculate_json_len, IntegerType())



display(
  train.select('device', udf_calculate_json_len('device').alias('len')).sort(desc('len'))
)

device,len
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""UC Browser"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Linux"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Android"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": true, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""mobile""}",16
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Internet Explorer"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16
"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",16


In [6]:

geoNetworkSchema = StructType(
  [
    StructField("continent", StringType()),
    StructField("subContinent", StringType()),
    StructField("country", StringType()),
    StructField("region", StringType()),
    StructField("metro", StringType()),
    StructField("city", StringType()),
    StructField("cityId", StringType()),
    StructField("networkDomain", StringType()),
    StructField("latitude", StringType()),
    StructField("longitude", StringType()),
    StructField("networkLocation", StringType()),
  ]
)


deviceSchema = StructType(
  [
    StructField("browser", StringType()),
    StructField("browserVersion", StringType()),
    StructField("browserSize", StringType()),
    StructField("operatingSystem", StringType()),
    StructField("operatingSystemVersion", StringType()),
    StructField("isMobile", StringType()),
    StructField("mobileDeviceBranding", StringType()),
    StructField("mobileDeviceModel", StringType()),
    StructField("mobileInputSelector", StringType()),
    StructField("mobileDeviceInfo", StringType()),
    StructField("mobileDeviceMarketingName", StringType()),
    StructField("flashVersion", StringType()),
    StructField("language", StringType()),
    StructField("screenColors", StringType()),
    StructField("screenResolution", StringType()),
    StructField("deviceCategory", StringType()),
  ]
)

In [7]:
totals_schema = StructType([ StructField("visits", StringType(), True),
                     StructField("hits", StringType(), True),
                     StructField("pageviews", StringType(), True),
                     StructField("bounces", StringType(), True),
                     StructField("transactionRevenue", StringType(), True),
                     StructField("newVisits", StringType(), True)
                        ])


trafficSource_schema = StructType(
  [
    StructField("campaign", StringType(), True),
    StructField("source", StringType(), True),
    StructField("medium", StringType(), True),
    StructField("keyword", StringType(), True),
    StructField("adContent", StringType(), True),
    StructField("adwordsClickInfo", StructType(
      [
        StructField('page', StringType() , True),
        StructField('slot', StringType() , True),
        StructField('criteriaParameters', StringType() , True),
        StructField('gclId', StringType() , True),
        StructField('adNetworkType', StringType() , True),
        StructField('isVideoAd', StringType() , True),
      ]
    ), True),
    StructField('isTrueDirect', StringType() , True)
  ]
)

In [8]:
# train2 = train.withColumn("_totals", from_json(train["totals"], totals_schema))
train = train.withColumn("_trafficSource", from_json(train["trafficSource"], trafficSource_schema)
).withColumn("_totals", from_json(train["totals"], totals_schema
)).withColumn("_device", from_json(train["device"], deviceSchema
)).withColumn("_geoNetwork", from_json(train["geoNetwork"], geoNetworkSchema
))
                        
display(train)                     

channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime,_trafficSource,_totals,_device,_geoNetwork
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western Asia"", ""country"": ""Turkey"", ""region"": ""Izmir"", ""metro"": ""(not set)"", ""city"": ""Izmir"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""ttnet.com.tr"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",1131660440785968503_1472830385,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472830385,1,1472830385,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Chrome, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Asia, Western Asia, Turkey, Izmir, (not set), Izmir, not available in demo dataset, ttnet.com.tr, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Australasia"", ""country"": ""Australia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""dodo.net.au"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",377306020877927890_1472880147,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472880147,1,1472880147,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Firefox, not available in demo dataset, not available in demo dataset, Macintosh, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Oceania, Australasia, Australia, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, dodo.net.au, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",3895546263509774583,"{""continent"": ""Europe"", ""subContinent"": ""Southern Europe"", ""country"": ""Spain"", ""region"": ""Community of Madrid"", ""metro"": ""(not set)"", ""city"": ""Madrid"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",3895546263509774583_1472865386,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472865386,1,1472865386,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Chrome, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Europe, Southern Europe, Spain, Community of Madrid, (not set), Madrid, not available in demo dataset, unknown.unknown, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""UC Browser"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Linux"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",4763447161404445595,"{""continent"": ""Asia"", ""subContinent"": ""Southeast Asia"", ""country"": ""Indonesia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",4763447161404445595_1472881213,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""google + online"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472881213,1,1472881213,"List((not set), google, organic, google + online, null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(UC Browser, not available in demo dataset, not available in demo dataset, Linux, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Asia, Southeast Asia, Indonesia, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, unknown.unknown, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Android"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": true, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""mobile""}",27294437909732085,"{""continent"": ""Europe"", ""subContinent"": ""Northern Europe"", ""country"": ""United Kingdom"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",27294437909732085_1472822600,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}, ""isTrueDirect"": true}",1472822600,2,1472822600,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), true)","List(1, 1, 1, 1, null, null)","List(Chrome, not available in demo dataset, not available in demo dataset, Android, not available in demo dataset, true, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, mobile)","List(Europe, Northern Europe, United Kingdom, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, unknown.unknown, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",2938943183656635653,"{""continent"": ""Europe"", ""subContinent"": ""Southern Europe"", ""country"": ""Italy"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""fastwebnet.it"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",2938943183656635653_1472807194,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472807194,1,1472807194,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Chrome, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Europe, Southern Europe, Italy, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, fastwebnet.it, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",1905672039242460897,"{""continent"": ""Asia"", ""subContinent"": ""Southern Asia"", ""country"": ""Pakistan"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",1905672039242460897_1472817241,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472817241,1,1472817241,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Chrome, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Asia, Southern Asia, Pakistan, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, unknown.unknown, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",537222803633850821,"{""continent"": ""Oceania"", ""subContinent"": ""Australasia"", ""country"": ""Australia"", ""region"": ""Queensland"", ""metro"": ""(not set)"", ""city"": ""Brisbane"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""bigpond.net.au"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",537222803633850821_1472812602,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472812602,1,1472812602,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Chrome, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Oceania, Australasia, Australia, Queensland, (not set), Brisbane, not available in demo dataset, bigpond.net.au, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Internet Explorer"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",4445454811831400414,"{""continent"": ""Europe"", ""subContinent"": ""Western Europe"", ""country"": ""Austria"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""spar.at"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",4445454811831400414_1472805784,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472805784,1,1472805784,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Internet Explorer, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Europe, Western Europe, Austria, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, spar.at, not available in demo dataset, not available in demo dataset, not available in demo dataset)"
Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",9499785259412240342,"{""continent"": ""Europe"", ""subContinent"": ""Western Europe"", ""country"": ""Netherlands"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""chello.nl"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",9499785259412240342_1472812272,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472812272,1,1472812272,"List((not set), google, organic, (not provided), null, List(null, null, not available in demo dataset, null, null, null), null)","List(1, 1, 1, 1, null, 1)","List(Firefox, not available in demo dataset, not available in demo dataset, Windows, not available in demo dataset, false, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, desktop)","List(Europe, Western Europe, Netherlands, not available in demo dataset, not available in demo dataset, not available in demo dataset, not available in demo dataset, chello.nl, not available in demo dataset, not available in demo dataset, not available in demo dataset)"


In [9]:
train_exploded = train.select('fullVisitorId','channelGrouping', 
                         'date', 
                         col('_device.*'),
                         'fullVisitorId',
                         col('_geoNetwork.*'),
                         'sessionId',
                         'socialEngagementType',
                         col('_totals.*'),
                         col('_trafficSource.*'),
                         'visitId',
                         'visitNumber',
                         'visitStartTime'
                             ) 

display(train_exploded)

fullVisitorId,channelGrouping,date,browser,browserVersion,browserSize,operatingSystem,operatingSystemVersion,isMobile,mobileDeviceBranding,mobileDeviceModel,mobileInputSelector,mobileDeviceInfo,mobileDeviceMarketingName,flashVersion,language,screenColors,screenResolution,deviceCategory,fullVisitorId.1,continent,subContinent,country,region,metro,city,cityId,networkDomain,latitude,longitude,networkLocation,sessionId,socialEngagementType,visits,hits,pageviews,bounces,transactionRevenue,newVisits,campaign,source,medium,keyword,adContent,adwordsClickInfo,isTrueDirect,visitId,visitNumber,visitStartTime
1131660440785968503,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,1131660440785968503,Asia,Western Asia,Turkey,Izmir,(not set),Izmir,not available in demo dataset,ttnet.com.tr,not available in demo dataset,not available in demo dataset,not available in demo dataset,1131660440785968503_1472830385,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472830385,1,1472830385
377306020877927890,Organic Search,20160902,Firefox,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,377306020877927890,Oceania,Australasia,Australia,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,dodo.net.au,not available in demo dataset,not available in demo dataset,not available in demo dataset,377306020877927890_1472880147,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472880147,1,1472880147
3895546263509774583,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,3895546263509774583,Europe,Southern Europe,Spain,Community of Madrid,(not set),Madrid,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset,3895546263509774583_1472865386,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472865386,1,1472865386
4763447161404445595,Organic Search,20160902,UC Browser,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,4763447161404445595,Asia,Southeast Asia,Indonesia,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset,4763447161404445595_1472881213,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,google + online,,"List(null, null, not available in demo dataset, null, null, null)",,1472881213,1,1472881213
27294437909732085,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile,27294437909732085,Europe,Northern Europe,United Kingdom,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset,27294437909732085_1472822600,Not Socially Engaged,1,1,1,1.0,,,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",True,1472822600,2,1472822600
2938943183656635653,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,2938943183656635653,Europe,Southern Europe,Italy,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,fastwebnet.it,not available in demo dataset,not available in demo dataset,not available in demo dataset,2938943183656635653_1472807194,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472807194,1,1472807194
1905672039242460897,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,1905672039242460897,Asia,Southern Asia,Pakistan,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset,1905672039242460897_1472817241,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472817241,1,1472817241
537222803633850821,Organic Search,20160902,Chrome,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,537222803633850821,Oceania,Australasia,Australia,Queensland,(not set),Brisbane,not available in demo dataset,bigpond.net.au,not available in demo dataset,not available in demo dataset,not available in demo dataset,537222803633850821_1472812602,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472812602,1,1472812602
4445454811831400414,Organic Search,20160902,Internet Explorer,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,4445454811831400414,Europe,Western Europe,Austria,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,spar.at,not available in demo dataset,not available in demo dataset,not available in demo dataset,4445454811831400414_1472805784,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472805784,1,1472805784
9499785259412240342,Organic Search,20160902,Firefox,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,9499785259412240342,Europe,Western Europe,Netherlands,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,chello.nl,not available in demo dataset,not available in demo dataset,not available in demo dataset,9499785259412240342_1472812272,Not Socially Engaged,1,1,1,1.0,,1.0,(not set),google,organic,(not provided),,"List(null, null, not available in demo dataset, null, null, null)",,1472812272,1,1472812272


In [10]:
print("# of row" , train_exploded.count())
print("# of cols" , len(train_exploded.columns))

In [11]:
selected_features = ['fullVisitorId','bounces', 'visits', 'newVisits', 'hits', 'pageviews' , 'visitNumber', 'transactionRevenue']

train_selected = train_exploded.select(selected_features)
  
#  'channelGrouping',
#  'date',
#  'fullVisitorId',
#  'sessionId',
#  'visitId',
#  'visitNumber',
#  'visitStartTime',
#  'browser',
#  'deviceCategory',
#  'isMobile',
#  'operatingSystem',
#  'city',
#  'continent',
#  'country',
#  'metro',
#  'networkDomain',
#  'region',
#  'subContinent',
#  'bounces',
#  'hits',
#  'visits',
#  'newVisits',
#  'pageviews',
#  'transactionRevenue',
#  'adContent',
#  'campaign',
#  'isTrueDirect',
#  'keyword',
#  'medium',
#  'source')

print("# of row" , train_selected.count())
print("# of cols" , len(train_selected.columns))

In [12]:
# train_fillna_revenue = train_selected.select("transactionRevenue").fillna(0.0)
# df.fillna( { 'a':0, 'b':0 } )

train_fillna = train_selected.fillna( { "transactionRevenue":0 , 
                             "visitNumber" : 0 , 
                             "bounces" : 0 , 
                             "visits" : 0 , 
                             "newVisits" : 0 , 
                             "hits" : 0 , 
                             "pageviews" : 0
                            } )

train_fillna.filter(col("transactionRevenue").isNull()).count()

In [13]:
train_convert_to_double = train_fillna.withColumn("visitNumber_flt", col("visitNumber").cast("double")
).withColumn("bounces_flt", col("bounces").cast("double")
).withColumn("visits_flt", col("visits").cast("double")
).withColumn("newVisits_flt", col("newVisits").cast("double")
).withColumn("hits_flt", col("hits").cast("double")
).withColumn("pageviews_flt", col("pageviews").cast("double")
).withColumn("transactionRevenue_flt", col("transactionRevenue").cast("double"))

train_convert_to_double.printSchema()

In [14]:
train_groupby = train_convert_to_double.groupby('fullVisitorId').sum()

train_groupby_renamed = train_groupby.withColumnRenamed("sum(visitNumber_flt)", "visitNumber_flt"
).withColumnRenamed("sum(bounces_flt)", "bounces_flt"
).withColumnRenamed("sum(bounces_flt)", "bounces_flt"
).withColumnRenamed("sum(visits_flt)", "visits_flt"
).withColumnRenamed("sum(newVisits_flt)", "newVisits_flt"                   
).withColumnRenamed("sum(hits_flt)", "hits_flt"                      
).withColumnRenamed("sum(pageviews_flt)", "pageviews_flt"                     
).withColumnRenamed("sum(transactionRevenue_flt)", "transactionRevenue_flt")                       
                    
display(train_groupby_renamed)

fullVisitorId,visitNumber_flt,bounces_flt,visits_flt,newVisits_flt,hits_flt,pageviews_flt,transactionRevenue_flt
6940815913920553634,1.0,1.0,1.0,1.0,1.0,1.0,0.0
9169995120881264768,6.0,2.0,3.0,1.0,4.0,4.0,0.0
7019324769188325239,6.0,1.0,3.0,1.0,9.0,9.0,0.0
4182863769903710608,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5194629126653011632,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3307158041295976924,55.0,9.0,10.0,1.0,18.0,17.0,0.0
3216697237390563528,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3805485490023701730,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3611676034400073896,3.0,0.0,2.0,1.0,9.0,8.0,0.0
1718327138234260099,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [15]:
# categorical_feature_encoded = ['channelGroupingIndex', 'browserIndex', 'deviceCategoryIndex', 'isMobileIndex' , 'countryIndex' , 'mediumIndex', 'sourceIndex']
# categorical_feature_encoded = ['channelGrouping', 'browser', 'deviceCategory', 'isMobile' , 'country' , 'medium', 'source']

numerical_features = ['bounces_flt', 'visits_flt', 'newVisits_flt', 'hits_flt', 'pageviews_flt' , 'visitNumber_flt']

# feature_columns = categorical_feature_encoded + numerical_features 
feature_columns = numerical_features

label_column = ['transactionRevenue_flt']


In [16]:
train_ = train_groupby_renamed.select(feature_columns + label_column)
display(train_)

bounces_flt,visits_flt,newVisits_flt,hits_flt,pageviews_flt,visitNumber_flt,transactionRevenue_flt
1.0,1.0,1.0,1.0,1.0,1.0,0.0
2.0,3.0,1.0,4.0,4.0,6.0,0.0
1.0,3.0,1.0,9.0,9.0,6.0,0.0
1.0,1.0,1.0,1.0,1.0,1.0,0.0
1.0,1.0,1.0,1.0,1.0,1.0,0.0
9.0,10.0,1.0,18.0,17.0,55.0,0.0
1.0,1.0,1.0,1.0,1.0,1.0,0.0
1.0,1.0,1.0,1.0,1.0,1.0,0.0
0.0,2.0,1.0,9.0,8.0,3.0,0.0
1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [17]:
describe_train_df = train_.describe()

display(describe_train_df)

summary,bounces_flt,visits_flt,newVisits_flt,hits_flt,pageviews_flt,visitNumber_flt,transactionRevenue_flt
count,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0
mean,0.630986870017797,1.26532449693139,0.9844476151936452,5.816111637754195,4.870661904008446,2.8658297008962887,2156458.139342759
stddev,0.9297667589287714,1.430618059751293,0.1391503098576567,18.41700775080243,14.128870279787064,127.25578806674643,108388329.27307674
min,0.0,1.0,0.0,1.0,0.0,1.0,0.0
max,186.0,278.0,3.0,4995.0,4160.0,63315.0,77113430000.0


# Normalizing features 
  - x = value 
  - dl = min of attribute 
  - dh = max of attribute 
  - nl = min of expected range 
  - nh = max of expected range

In [19]:
# call function
#normalize columns
def normalizing_column(c , dL, dH):
  nL = 0
  nH = 1
  numi = (float(c) - dL) * (nH-nL)
  denom = dH - dL
  div = float(numi) / float(denom)
  normalized = float(div + nL)
  return normalized

normalizing_column_udf = udf(normalizing_column, DoubleType())


# names = train_.schema.names
names = ['hits_flt',
 'pageviews_flt',
 'visitNumber_flt',
 'bounces_flt',
 'visits_flt',
 'newVisits_flt',
        ]
for colname in names:
  dL = float(describe_train_df.collect()[3][colname])
  dH = float(describe_train_df.collect()[4][colname])
  train_ = train_.withColumn('normalized_' + str(colname), 
                           normalizing_column_udf(colname, lit(dL) , lit(dH))
                          )    

In [20]:
display(train_.describe())

summary,bounces_flt,visits_flt,newVisits_flt,hits_flt,pageviews_flt,visitNumber_flt,transactionRevenue_flt,normalized_hits_flt,normalized_pageviews_flt,normalized_visitNumber_flt,normalized_bounces_flt,normalized_visits_flt,normalized_newVisits_flt
count,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0,714167.0
mean,0.630986870017797,1.26532449693139,0.9844476151936452,5.816111637754195,4.870661904008446,2.8658297008962887,2156458.139342759,0.0009643795830505168,0.0011708321884636,2.946946490343825e-05,0.0033924025269775,0.000957850169427408,0.3281492050645521
stddev,0.9297667589287706,1.430618059751293,0.1391503098576567,18.41700775080244,14.12887027978706,127.25578806674658,108388329.27307698,0.0036878269424914,0.0033963630480257,0.0020099154699868,0.004998746015746,0.0051646861362862,0.0463834366192189
min,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,186.0,278.0,3.0,4995.0,4160.0,63315.0,77113430000.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
selected_train_ = train_.select('transactionRevenue_flt', 'normalized_hits_flt', 'normalized_pageviews_flt', 'normalized_visitNumber_flt', 'normalized_bounces_flt', 'normalized_visits_flt', 'normalized_newVisits_flt')
display(selected_train_)

transactionRevenue_flt,normalized_hits_flt,normalized_pageviews_flt,normalized_visitNumber_flt,normalized_bounces_flt,normalized_visits_flt,normalized_newVisits_flt
0.0,0.0004004805766920304,0.0007211538461538462,0.0,0.0,0.0,0.3333333333333333
0.0,0.0004004805766920304,0.0004807692307692308,0.0,0.0,0.0,0.3333333333333333
0.0,0.0126151381657989,0.0125,0.00014214865590548694,0.0053763440860215,0.0108303249097472,0.3333333333333333
0.0,0.0026031237484981,0.0033653846153846,0.0,0.0,0.0,0.3333333333333333
0.0,0.0102122547056467,0.009375,0.0,0.0,0.0,0.3333333333333333
0.0,0.0,0.0002403846153846154,0.0,0.0053763440860215,0.0,0.3333333333333333
0.0,0.00100120144173,0.001201923076923,0.0,0.0,0.0,0.3333333333333333
0.0,0.001201441730076,0.0016826923076923,0.0,0.0,0.0,0.3333333333333333
0.0,0.0028033640368442,0.003125,3.158859020121932e-05,0.0,0.003610108303249,0.3333333333333333
0.0,0.0004004805766920304,0.0007211538461538462,0.0,0.0,0.0,0.3333333333333333


In [22]:
train_pd = selected_train_.toPandas()

In [23]:
train_pd.dtypes

In [24]:
train_x = train_pd.drop(['transactionRevenue_flt'], axis = 1)
n_cols = train_x.shape[1]
train_x.head()

In [25]:
train_y2 = train_pd.transactionRevenue_flt
train_y2.head()

In [26]:
train_y2_log = np.log(train_y2)
train_y2_log.head()

In [27]:
train_y2 = train_y2_log.replace('-inf', 0.0)
train_y2.describe()
# train_y2.describe()

In [29]:
# define
model = Sequential()

#add model layers
model.add(Dense(7, activation='relu', input_shape=(n_cols,)))
model.add(Dense(7, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

#7 input model - mse ---- loss = 5.9617 , after new log --> loss =  3.1654

In [30]:
# from keras.callbacks import EarlyStopping
#set early stopping monitor so the model stops training when it won't improve anymore
# early_stopping_monitor = EarlyStopping(patience=3)
#train model
model.fit(train_x, train_y2, epochs=10)

In [31]:
# define
model2 = Sequential()

#add model layers
model2.add(Dense(200, activation='relu', input_shape=(n_cols,)))
model2.add(Dense(200, activation='relu'))
model2.add(Dense(200, activation='relu'))
model2.add(Dense(1))

model2.compile(loss='mean_squared_error', optimizer='adam')
model2.fit(train_x, train_y2, epochs=10)

#200 input model - mse ---- loss = 6.1056 , after new log --> loss = 3.1392

In [32]:
# define
model3 = Sequential()

#add model layers
model3.add(Dense(7, activation='relu', input_shape=(n_cols,)))
model3.add(Dense(7, activation='relu'))
model3.add(Dense(1))



#create a custome loss and compile model using mse as a measure of model performance
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 

model3.compile(optimizer = "adam", loss = root_mean_squared_error)


from keras.callbacks import EarlyStopping
#set early stopping monitor so the model stops training when it won't improve anymore
# early_stopping_monitor = EarlyStopping(patience=3)
#train model
model3.fit(train_x, train_y2, epochs=5)

#loss: 0.2492