In [6]:
import findspark

# Initialise findspark against the cluster's Spark 2 client install.
findspark.init(spark_home='/usr/hdp/current/spark2-client')
# Echo the Spark home that findspark resolves (displayed as the cell output).
findspark.find()

'/usr/hdp/current/spark2-client'

In [7]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a session that runs on YARN under the app name
# "ch03SparkToolSet".
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("ch03SparkToolSet")
    .getOrCreate()
)

In [8]:
# Keep a handle on the session's SparkContext; later cells use `sc`.
sc = spark.sparkContext
# Display the Spark UI address through the public `uiWebUrl` property instead
# of reaching through the private `_jsc` Py4J handle.
sc.uiWebUrl

'http://gw02.itversity.com:4053'

In [9]:
# Print the value of every Spark setting whose key contains "PROXY".
# `sc.getConf()` is the public accessor; avoid the private `sc._conf` attribute.
for key, value in sc.getConf().getAll():
    if 'PROXY' in key:
        print(value)
# Print two specific settings by key: the YARN AM-filter proxy URI bases and
# the driver's application UI address.
print(spark.conf.get('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES'))
print(spark.conf.get('spark.driver.appUIAddress'))

rm01.itversity.com
http://rm01.itversity.com:19288/proxy/application_1533622723243_6152
http://rm01.itversity.com:19288/proxy/application_1533622723243_6152
http://gw02.itversity.com:4053


In [10]:
# Batch-read every per-day retail CSV, treating the first row as a header and
# letting Spark infer the column types.
# NOTE(review): the recorded run of this cell raised AnalysisException
# ("Path does not exist") -- confirm the dataset has been uploaded to this
# HDFS location before running the rest of the notebook.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/kranthidr/spark-guide/data/retail-data/by-day/*.csv")
)

AnalysisException: 'Path does not exist: hdfs://nn01.itversity.com:8020/user/kranthidr/spark-guide/data/retail-data/by-day/*.csv;'

In [None]:
# Keep the schema of the static frame in its own variable, and register the
# frame as the temp view `retail_data` so it can be queried with SQL.
staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

In [None]:
# Print the schema of the static retail frame.
staticDataFrame.printSchema()

In [None]:
# Set the session's "spark.sql.shuffle.partitions" setting to 5.
# NOTE(review): presumably lowered because this is a small local-style
# example -- confirm before reusing on larger data.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [None]:
# Preview rows of the static frame (show() with its default arguments).
staticDataFrame.show()

In [None]:
# Display the version string reported by the SparkContext.
sc.version

In [None]:
# COMMAND ----------
from pyspark.sql.functions import window, column, desc, col

# Compute total_cost = UnitPrice * Quantity per row, sum it per customer and
# per one-day window of InvoiceDate, and preview five result rows.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)

In [None]:
# COMMAND ----------
# Streaming read of the same CSV files, reusing the schema captured from the
# static frame and limiting each trigger to a single file.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("/user/kranthidr/spark-guide/data/retail-data/by-day/*.csv")
)

In [None]:
# COMMAND ----------
# Streaming version of the per-customer aggregation: sum of
# UnitPrice * Quantity per customer per window of InvoiceDate.
# NOTE: despite the "PerHour" name, the window used here is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [None]:
# COMMAND ----------
# Start the streaming aggregation, writing the complete result to the
# "memory" sink under the query name `customer_purchases`.
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

In [None]:
# COMMAND ----------

# Query `customer_purchases`, largest summed total_cost first, and show
# up to 20 rows.
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """).show(20)

In [None]:
# Same table, ordered by the `window` column descending; show up to 20 rows.
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `window` DESC
  """).show(20)

In [None]:
# Start a second streaming query over the same aggregation, this time with
# the "console" sink, under the query name `customer_purchases_2`.
(
    purchaseByCustomerPerHour.writeStream
    .format("console")
    .queryName("customer_purchases_2")
    .outputMode("complete")
    .start()
)

In [None]:
# COMMAND ----------

from pyspark.sql.functions import date_format, col

# Prepare the static frame for ML: fill nulls with 0, add a `day_of_week`
# column formatted from InvoiceDate with the "EEEE" pattern, and coalesce
# to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Preview rows of the prepared frame, including the added day_of_week column.
preppedDataFrame.show()

In [None]:
# COMMAND ----------

# Split on InvoiceDate at 2011-07-01: rows before that date form the
# training set, rows on or after it form the test set.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [None]:
# COMMAND ----------
from pyspark.ml.feature import StringIndexer

# Indexer stage: reads `day_of_week`, writes `day_of_week_index`.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [None]:
# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder

# Encoder stage: reads `day_of_week_index`, writes `day_of_week_encoded`.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Assembler stage: combines UnitPrice, Quantity and the encoded day of week
# into a single `features` column.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [None]:
# COMMAND ----------

from pyspark.ml import Pipeline

# Chain the three feature stages in order: index -> encode -> assemble.
transformationPipeline = Pipeline(stages=[indexer, encoder, vectorAssembler])

In [None]:
# COMMAND ----------

# Fit the transformation pipeline on the training split only.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [None]:
# COMMAND ----------

# Apply the fitted pipeline to the training split.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [None]:
# Mark the transformed training frame for caching.
# NOTE(review): presumably so the k-means fit below does not recompute the
# pipeline output -- confirm.
transformedTraining.cache()

In [None]:
# COMMAND ----------

from pyspark.ml.clustering import KMeans

# K-means estimator with 20 clusters and a fixed seed of 1.
kmeans = KMeans(k=20, seed=1)

In [None]:
# COMMAND ----------

# Fit the k-means estimator on the transformed training frame.
kmModel = kmeans.fit(transformedTraining)

In [None]:
# COMMAND ----------

# Apply the pipeline that was fitted on the training split to the test split.
transformedTest = fittedPipeline.transform(testDataFrame)

In [None]:
# Print the schema of the transformed test frame.
transformedTest.printSchema()

In [None]:
# Display the fitted model's cost evaluated on the transformed test frame.
# NOTE(review): computeCost is reportedly deprecated in later Spark 2.x
# releases and absent from Spark 3 -- confirm against the cluster's version.
kmModel.computeCost(transformedTest)

In [None]:
# Display the fitted model's cost evaluated on the transformed training frame.
# NOTE(review): computeCost is reportedly deprecated in later Spark 2.x
# releases and absent from Spark 3 -- confirm against the cluster's version.
kmModel.computeCost(transformedTraining)