In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Connection and resource settings for this notebook's Spark application.
# The setters are chained into a single expression that builds the conf.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Basic_DF_EX2")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Read the CSV with its header line as column names. No schema is supplied or
# inferred, so every column is loaded as a string (see the printed schema).
retail_csv_path = "/home/jovyan/data/online-retail-dataset.csv"
df = spark.read.format("csv").option("header", "true").load(retail_csv_path)
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



### What is the average quantity bought by customer 14769?

The **collect** function returns all rows of a DataFrame to the driver as a list of `Row` objects; see https://sparkbyexamples.com/pyspark/pyspark-collect/

In [2]:
from pyspark.sql.functions import col, avg
# Rows belonging to customer 14769. CustomerID was loaded as a string column,
# hence the comparison against a string literal. Built once and shared by both
# variants below instead of repeating the filter three times.
customer_rows = df.where(col("CustomerID") == "14769")

# 1 - column-expression API.
# collect() brings the result back to the driver as a list of Row objects.
# The aggregation is executed once and its result reused, rather than running
# the same Spark job a second time just to index into it.
avg_rows = customer_rows.agg(avg(col("Quantity"))).collect()
print(avg_rows)
print(avg_rows[0][0])   # first row, first column

# 2 - dictionary API: same aggregation expressed as {column: function}.
print(customer_rows.agg({"Quantity": "avg"}).collect()[0][0])

[Row(avg(Quantity)=6.769652650822669)]
6.769652650822669
6.769652650822669


### What is the most frequently occurring word in the descriptions of the items bought by customers from France?

In [3]:
from pyspark.sql.functions import explode, split, concat, col, lit, desc

# One row per token: split each French order's Description on single spaces.
# Consecutive or trailing spaces in a description yield empty-string tokens,
# which is why '' appears (and ranks first) in the counts below.
words = df.where(df.Country == "France").select(
    explode(split(col("Description"), " ")).alias("word")
)
words.show(10)

# Occurrences per token, most frequent first.
ordered_word_count = words.groupby(words.word).count().orderBy(col("count").desc())
ordered_word_count.show(3)

# https://sparkbyexamples.com/pyspark/pyspark-collect/
# Collect once and reuse the list instead of running the whole job twice.
# NOTE(review): collect() pulls every row to the driver; acceptable here only
# on the assumption that the distinct-word list is small - confirm.
word_counts = ordered_word_count.collect()
print(word_counts)
print("\n")

# The answer is the first *non-blank* token in descending-count order.
# Picking it by content rather than by a hard-coded index (previously [1])
# stays correct even when the empty token is absent or not ranked first.
# Yields None if there is no non-blank token at all.
most_common_word = next(
    (row["word"] for row in word_counts if row["word"] and row["word"].strip()),
    None,
)
print(most_common_word)

+--------+
|    word|
+--------+
|   ALARM|
|   CLOCK|
|BAKELIKE|
|    PINK|
|   ALARM|
|   CLOCK|
|BAKELIKE|
|     RED|
|        |
|   ALARM|
+--------+
only showing top 10 rows

+----+-----+
|word|count|
+----+-----+
|    | 2465|
| RED| 1189|
| SET| 1128|
+----+-----+
only showing top 3 rows

[Row(word='', count=2465), Row(word='RED', count=1189), Row(word='SET', count=1128), Row(word='RETROSPOT', count=1013), Row(word='BAG', count=1006), Row(word='OF', count=958), Row(word='LUNCH', count=645), Row(word='DESIGN', count=600), Row(word='SPACEBOY', count=495), Row(word='BOX', count=483), Row(word='VINTAGE', count=481), Row(word='PINK', count=415), Row(word='PAPER', count=388), Row(word='WOODLAND', count=365), Row(word='TIN', count=356), Row(word='IN', count=352), Row(word='DOLLY', count=349), Row(word='CHILDRENS', count=337), Row(word='PACK', count=337), Row(word='GIRL', count=328), Row(word='MINI', count=317), Row(word='POSTAGE', count=311), Row(word='POLKADOT', count=310), Row(word='C

In [4]:
# Stop the Spark session (`spark`, created in the first cell). Per the PySpark
# API, SparkSession.stop() also stops the underlying SparkContext, so no
# further Spark operations can be run through `spark` after this cell.
spark.stop()