- analyze a Google Merchandise Store (also known as GStore, where Google swag is sold) customer dataset to predict revenue per customer. 
- we're predicting the natural log of the total revenue per unique user

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
import pandas as pd
import numpy as np
import json

In [4]:
# File location and type
file_location = "/FileStore/tables/train.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
train = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option('quote', '"') \
  .option('escape', '"') \
  .load(file_location)

display(train)

In [5]:
# get the length of all json columns 
def udf_calculate_json_len(col):
  j_col = json.loads(col)
  return len(j_col)
udf_calculate_json_len = udf(udf_calculate_json_len, IntegerType())



display(
  train.select('device', udf_calculate_json_len('device').alias('len')).sort(desc('len'))
)

In [6]:

geoNetworkSchema = StructType(
  [
    StructField("continent", StringType()),
    StructField("subContinent", StringType()),
    StructField("country", StringType()),
    StructField("region", StringType()),
    StructField("metro", StringType()),
    StructField("city", StringType()),
    StructField("cityId", StringType()),
    StructField("networkDomain", StringType()),
    StructField("latitude", StringType()),
    StructField("longitude", StringType()),
    StructField("networkLocation", StringType()),
  ]
)


deviceSchema = StructType(
  [
    StructField("browser", StringType()),
    StructField("browserVersion", StringType()),
    StructField("browserSize", StringType()),
    StructField("operatingSystem", StringType()),
    StructField("operatingSystemVersion", StringType()),
    StructField("isMobile", StringType()),
    StructField("mobileDeviceBranding", StringType()),
    StructField("mobileDeviceModel", StringType()),
    StructField("mobileInputSelector", StringType()),
    StructField("mobileDeviceInfo", StringType()),
    StructField("mobileDeviceMarketingName", StringType()),
    StructField("flashVersion", StringType()),
    StructField("language", StringType()),
    StructField("screenColors", StringType()),
    StructField("screenResolution", StringType()),
    StructField("deviceCategory", StringType()),
  ]
)

In [7]:
totals_schema = StructType([ StructField("visits", StringType(), True),
                     StructField("hits", StringType(), True),
                     StructField("pageviews", StringType(), True),
                     StructField("bounces", StringType(), True),
                     StructField("transactionRevenue", StringType(), True),
                     StructField("newVisits", StringType(), True)
                        ])


trafficSource_schema = StructType(
  [
    StructField("campaign", StringType(), True),
    StructField("source", StringType(), True),
    StructField("medium", StringType(), True),
    StructField("keyword", StringType(), True),
    StructField("adContent", StringType(), True),
    StructField("adwordsClickInfo", StructType(
      [
        StructField('page', StringType() , True),
        StructField('slot', StringType() , True),
        StructField('criteriaParameters', StringType() , True),
        StructField('gclId', StringType() , True),
        StructField('adNetworkType', StringType() , True),
        StructField('isVideoAd', StringType() , True),
      ]
    ), True),
    StructField('isTrueDirect', StringType() , True)
  ]
)

In [8]:
# train2 = train.withColumn("_totals", from_json(train["totals"], totals_schema))
train = train.withColumn("_trafficSource", from_json(train["trafficSource"], trafficSource_schema)
).withColumn("_totals", from_json(train["totals"], totals_schema
)).withColumn("_device", from_json(train["device"], deviceSchema
)).withColumn("_geoNetwork", from_json(train["geoNetwork"], geoNetworkSchema
))
                        
display(train)                     

In [9]:
train_exploded = train.select('fullVisitorId','channelGrouping', 
                         'date', 
                         col('_device.*'),
                         'fullVisitorId',
                         col('_geoNetwork.*'),
                         'sessionId',
                         'socialEngagementType',
                         col('_totals.*'),
                         col('_trafficSource.*'),
                         'visitId',
                         'visitNumber',
                         'visitStartTime'
                             ) 

display(train_exploded)

In [10]:
print("# of row" , train_exploded.count())
print("# of cols" , len(train_exploded.columns))

In [11]:
selected_features = ['fullVisitorId','bounces', 'visits', 'newVisits', 'hits', 'pageviews' , 'visitNumber', 'transactionRevenue']

train_selected = train_exploded.select(selected_features)
  
#  'channelGrouping',
#  'date',
#  'fullVisitorId',
#  'sessionId',
#  'visitId',
#  'visitNumber',
#  'visitStartTime',
#  'browser',
#  'deviceCategory',
#  'isMobile',
#  'operatingSystem',
#  'city',
#  'continent',
#  'country',
#  'metro',
#  'networkDomain',
#  'region',
#  'subContinent',
#  'bounces',
#  'hits',
#  'visits',
#  'newVisits',
#  'pageviews',
#  'transactionRevenue',
#  'adContent',
#  'campaign',
#  'isTrueDirect',
#  'keyword',
#  'medium',
#  'source')

print("# of row" , train_selected.count())
print("# of cols" , len(train_selected.columns))

In [12]:
# train_fillna_revenue = train_selected.select("transactionRevenue").fillna(0.0)
# df.fillna( { 'a':0, 'b':0 } )

train_fillna = train_selected.fillna( { "transactionRevenue":0 , 
                             "visitNumber" : 0 , 
                             "bounces" : 0 , 
                             "visits" : 0 , 
                             "newVisits" : 0 , 
                             "hits" : 0 , 
                             "pageviews" : 0
                            } )

train_fillna.filter(col("transactionRevenue").isNull()).count()

In [13]:
train_convert_to_double = train_fillna.withColumn("visitNumber_flt", col("visitNumber").cast("double")
).withColumn("bounces_flt", col("bounces").cast("double")
).withColumn("visits_flt", col("visits").cast("double")
).withColumn("newVisits_flt", col("newVisits").cast("double")
).withColumn("hits_flt", col("hits").cast("double")
).withColumn("pageviews_flt", col("pageviews").cast("double")
).withColumn("transactionRevenue_flt", col("transactionRevenue").cast("double"))

train_convert_to_double.printSchema()

In [14]:
train_groupby = train_convert_to_double.groupby('fullVisitorId').sum()

train_groupby_renamed = train_groupby.withColumnRenamed("sum(visitNumber_flt)", "visitNumber_flt"
).withColumnRenamed("sum(bounces_flt)", "bounces_flt"
).withColumnRenamed("sum(bounces_flt)", "bounces_flt"
).withColumnRenamed("sum(visits_flt)", "visits_flt"
).withColumnRenamed("sum(newVisits_flt)", "newVisits_flt"                   
).withColumnRenamed("sum(hits_flt)", "hits_flt"                      
).withColumnRenamed("sum(pageviews_flt)", "pageviews_flt"                     
).withColumnRenamed("sum(transactionRevenue_flt)", "transactionRevenue_flt")                       
                    
display(train_groupby_renamed)

In [15]:
# categorical_feature_encoded = ['channelGroupingIndex', 'browserIndex', 'deviceCategoryIndex', 'isMobileIndex' , 'countryIndex' , 'mediumIndex', 'sourceIndex']
# categorical_feature_encoded = ['channelGrouping', 'browser', 'deviceCategory', 'isMobile' , 'country' , 'medium', 'source']

numerical_features = ['bounces_flt', 'visits_flt', 'newVisits_flt', 'hits_flt', 'pageviews_flt' , 'visitNumber_flt']

# feature_columns = categorical_feature_encoded + numerical_features 
feature_columns = numerical_features

label_column = ['transactionRevenue_flt']


In [16]:
train_ = train_groupby_renamed.select(feature_columns + label_column)
display(train_)

In [17]:
describe_train_df = train_.describe()

display(describe_train_df)

# Normalizing features 
  - x = value 
  - dl = min of attribute 
  - dh = max of attribute 
  - nl = min of expected range 
  - nh = max of expected range

In [19]:
# call function
#normalize columns
def normalizing_column(c , dL, dH):
  nL = 0
  nH = 1
  numi = (float(c) - dL) * (nH-nL)
  denom = dH - dL
  div = float(numi) / float(denom)
  normalized = float(div + nL)
  return normalized

normalizing_column_udf = udf(normalizing_column, DoubleType())


# names = train_.schema.names
names = ['hits_flt',
 'pageviews_flt',
 'visitNumber_flt',
 'bounces_flt',
 'visits_flt',
 'newVisits_flt',
        ]
for colname in names:
  dL = float(describe_train_df.collect()[3][colname])
  dH = float(describe_train_df.collect()[4][colname])
  train_ = train_.withColumn('normalized_' + str(colname), 
                           normalizing_column_udf(colname, lit(dL) , lit(dH))
                          )    

In [20]:
display(train_.describe())

In [21]:
selected_train_ = train_.select('transactionRevenue_flt', 'normalized_hits_flt', 'normalized_pageviews_flt', 'normalized_visitNumber_flt', 'normalized_bounces_flt', 'normalized_visits_flt', 'normalized_newVisits_flt')
display(selected_train_)

In [22]:
train_pd = selected_train_.toPandas()

In [23]:
train_pd.dtypes

In [24]:
train_x = train_pd.drop(['transactionRevenue_flt'], axis = 1)
n_cols = train_x.shape[1]
train_x.head()

In [25]:
train_y2 = train_pd.transactionRevenue_flt
train_y2.head()

In [26]:
train_y2_log = np.log(train_y2)
train_y2_log.head()

In [27]:
train_y2 = train_y2_log.replace('-inf', 0.0)
train_y2.describe()
# train_y2.describe()

In [28]:
# define
model = Sequential()

#add model layers
model.add(Dense(7, activation='relu', input_shape=(n_cols,)))
model.add(Dense(7, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

#7 input model - mse ---- loss = 5.9617 , after new log --> loss =  3.1654

In [29]:
# from keras.callbacks import EarlyStopping
#set early stopping monitor so the model stops training when it won't improve anymore
# early_stopping_monitor = EarlyStopping(patience=3)
#train model
model.fit(train_x, train_y2, epochs=10)

In [30]:
# define
model2 = Sequential()

#add model layers
model2.add(Dense(200, activation='relu', input_shape=(n_cols,)))
model2.add(Dense(200, activation='relu'))
model2.add(Dense(200, activation='relu'))
model2.add(Dense(1))

model2.compile(loss='mean_squared_error', optimizer='adam')
model2.fit(train_x, train_y2, epochs=10)

#200 input model - mse ---- loss = 6.1056 , after new log --> loss = 3.1392

In [31]:
# define
model3 = Sequential()

#add model layers
model3.add(Dense(7, activation='relu', input_shape=(n_cols,)))
model3.add(Dense(7, activation='relu'))
model3.add(Dense(1))



#create a custome loss and compile model using mse as a measure of model performance
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 

model3.compile(optimizer = "adam", loss = root_mean_squared_error)


from keras.callbacks import EarlyStopping
#set early stopping monitor so the model stops training when it won't improve anymore
# early_stopping_monitor = EarlyStopping(patience=3)
#train model
model3.fit(train_x, train_y2, epochs=5)

#loss: 0.2492