Exercise 4: DataFrame exercise (Simple and Grouping)

In [40]:
import pyspark
from pyspark.sql import SparkSession

In [41]:
spark = SparkSession.builder \
    .appName("Spark SQL exercise 4") \
    .master("local[*]") \
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x000001E9324C7190>


In [42]:
rdd = spark.sparkContext.parallelize([
    (536378, "NULL", "PACK OF 60 DINOSAUR CAKE CASES", 24, "01-12-2010", 0.55, 17850, "United Kingdom"),
    (536378, "NULL", "PACK OF 72 SKULL CAKE CASES", 24, "01-12-2010", 0.55, 17850, "United States"),
    (536378, "84991A", "72 SWEETHEART FAIRY CAKE CASES", 120, "01-12-2010", 0.55, 17850, "United Kingdom"),
    (536378, "84992B", "72 RETROSPOT TEA SET CERAMIC HEART", 120, "01-12-2010", 0.55, 17850, "United Kingdom"),
    (536378, "84993C", "60 TEATIME FAIRY CAKE CASES", 120, "01-12-2010", 0.55, 17850, "United Kingdom"),
    (536378, "84994", "60 CAKE CASES VINTAGE CHRISTMAS", 120, "01-12-2010", 0.55, 17850, "United States"),
    (536381, "22727", "ALARM CLOCK BAKELIKE PINK", 24, "01-12-2010", 3.75, 15311, "United Kingdom"),
    (536381, "22726", "ALARM CLOCK BAKELIKE RED", 24, "01-12-2010", 3.75, 15311, "Germany"),
    (536381, "22730", "ALARM CLOCK BAKELIKE IVORY", 24, "01-12-2010", 3.75, 15311, "Germany"),
    (536381, "22367", "CHILDRENS APRON SPACEBOY DESIGN", 8, "01-12-2010", 1.95, 15311, "United Kingdom"),
    (536381, "22629", "SPACEBOY LUNCH BOX", 12, "01-12-2010", 1.95, 15311, "Austria"),
    (536381, "22659", "LUNCH BOX I LOVE LONDON", 12, "01-12-2010", 1.95, 15311, "United Kingdom"),
    (536381, "22631", "CIRCUS PARADE LUNCH BOX", 12, "01-12-2010", 1.95, 15311, "Switzerland")
])

In [43]:
from pyspark.sql import Row
df = rdd.map(lambda x: Row(InvoiceNo=x[0], StockCode=x[1], Description=x[2], Quantity=x[3], InvoiceDate=x[4], UnitPrice=x[5], CustomerID=x[6], Country=x[7]))

In [44]:
# Create its Schema
from pyspark.sql.types import *
schema = StructType([
    StructField("InvoiceNo", IntegerType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", StringType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True)
])

In [45]:
# Create DataFrame
df = spark.createDataFrame(df, schema)

In [46]:
df.show(5)

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536378|     NULL|PACK OF 60 DINOSA...|      24| 01-12-2010|     0.55|     17850|United Kingdom|
|   536378|     NULL|PACK OF 72 SKULL ...|      24| 01-12-2010|     0.55|     17850| United States|
|   536378|   84991A|72 SWEETHEART FAI...|     120| 01-12-2010|     0.55|     17850|United Kingdom|
|   536378|   84992B|72 RETROSPOT TEA ...|     120| 01-12-2010|     0.55|     17850|United Kingdom|
|   536378|   84993C|60 TEATIME FAIRY ...|     120| 01-12-2010|     0.55|     17850|United Kingdom|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
only showing top 5 rows



In [47]:
# Update InvoiceDate column to DateType
from pyspark.sql.functions import to_date
df1 = df.withColumn("InvoiceDate", to_date(df["InvoiceDate"], "dd-MM-yyyy")).show(5)

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536378|     NULL|PACK OF 60 DINOSA...|      24| 2010-12-01|     0.55|     17850|United Kingdom|
|   536378|     NULL|PACK OF 72 SKULL ...|      24| 2010-12-01|     0.55|     17850| United States|
|   536378|   84991A|72 SWEETHEART FAI...|     120| 2010-12-01|     0.55|     17850|United Kingdom|
|   536378|   84992B|72 RETROSPOT TEA ...|     120| 2010-12-01|     0.55|     17850|United Kingdom|
|   536378|   84993C|60 TEATIME FAIRY ...|     120| 2010-12-01|     0.55|     17850|United Kingdom|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
only showing top 5 rows



In [48]:
#1: count total rows
df.count()

13

In [49]:
#2: total number of invoices
df.select("InvoiceNo").distinct().count()

2

In [None]:
#1,2a
from pyspark.sql.functions import *
df.select(count("*").alias("TotalRows"), countDistinct("InvoiceNo").alias("TotalInvoices"),
          sum("Quantity").alias("Total_Quantity"),
          round(avg("UnitPrice"),1).alias("Avg_Price")).show()

+---------+-------------+--------------+---------+
|TotalRows|TotalInvoices|Total_Quantity|Avg_Price|
+---------+-------------+--------------+---------+
|       13|            2|           644|      1.7|
+---------+-------------+--------------+---------+



In [63]:
#1,2b
df.selectExpr("count(*) as count_rows",
              "count(distinct (InvoiceNo)) as count_distinct").show()

+----------+--------------+
|count_rows|count_distinct|
+----------+--------------+
|        13|             2|
+----------+--------------+



In [50]:
#3: total number of quantity sold
from pyspark.sql.functions import *
df.select(sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|          644|
+-------------+



In [51]:
#4: average unit price
df.select(avg("UnitPrice")).show()

+-----------------+
|   avg(UnitPrice)|
+-----------------+
|1.719230789404649|
+-----------------+



In [52]:
#4: second way
df.createOrReplaceTempView("sales")
spark.sql("""
SELECT sum(Quantity) as TotalQuantity, avg(UnitPrice) as AverageUnitPrice
FROM sales
""").show()

+-------------+-----------------+
|TotalQuantity| AverageUnitPrice|
+-------------+-----------------+
|          644|1.719230789404649|
+-------------+-----------------+



In [53]:
#4: third way
df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice")).select(sum("TotalPrice")).show()

+-----------------+
|  sum(TotalPrice)|
+-----------------+
|646.2000064849854|
+-----------------+



Grouping Exercise

In [54]:
#1: Total quantity sold by each country
df1 = df.groupBy("Country").sum("Quantity").show()

+--------------+-------------+
|       Country|sum(Quantity)|
+--------------+-------------+
|United Kingdom|          428|
| United States|          144|
|       Germany|           48|
|       Austria|           12|
|   Switzerland|           12|
+--------------+-------------+



In [55]:
#1: Second way
df1 = df.groupBy("Country").agg(sum("Quantity").alias("TotalQuantity")).show()

+--------------+-------------+
|       Country|TotalQuantity|
+--------------+-------------+
|United Kingdom|          428|
| United States|          144|
|       Germany|           48|
|       Austria|           12|
|   Switzerland|           12|
+--------------+-------------+



In [56]:
#1: Third way
spark.sql("""
SELECT Country, sum(Quantity) as TotalQuantity
FROM sales
GROUP BY Country
""").show()

+--------------+-------------+
|       Country|TotalQuantity|
+--------------+-------------+
|United Kingdom|          428|
| United States|          144|
|       Germany|           48|
|       Austria|           12|
|   Switzerland|           12|
+--------------+-------------+



In [None]:
#1: Fourth way (like Second way but using ALIAS() also)
df.groupBy("Country") \
.agg(sum("Quantity").alias("total_quant"),
     round(sum(df.Quantity * df.UnitPrice),1).alias("total_amount")) \
     .show()

+--------------+-----------+------------+
|       Country|total_quant|total_amount|
+--------------+-----------+------------+
|United Kingdom|        428|       340.2|
| United States|        144|        79.2|
|       Germany|         48|       180.0|
|       Austria|         12|        23.4|
|   Switzerland|         12|        23.4|
+--------------+-----------+------------+



In [68]:
from pyspark.sql.functions import expr

#1: Fifth way

df.groupBy("Country") \
    .agg(expr("sum(Quantity) as total_quantity"),
         expr("round(sum(Quantity * UnitPrice),2) as total_amount1")) \
    .show()

+--------------+--------------+-------------+
|       Country|total_quantity|total_amount1|
+--------------+--------------+-------------+
|United Kingdom|           428|        340.2|
| United States|           144|         79.2|
|       Germany|            48|        180.0|
|       Austria|            12|         23.4|
|   Switzerland|            12|         23.4|
+--------------+--------------+-------------+



In [57]:
#2: Total price sold by each country
df1=df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice")).groupBy("Country").sum("TotalPrice").show()

+--------------+------------------+
|       Country|   sum(TotalPrice)|
+--------------+------------------+
|United Kingdom| 340.2000026702881|
| United States| 79.20000076293945|
|       Germany|             180.0|
|       Austria|23.400001525878906|
|   Switzerland|23.400001525878906|
+--------------+------------------+



In [58]:
#2: Second way
df.select(col("Country"), (col("Quantity") * col("UnitPrice")).alias("TotalPrice")).groupBy("Country").sum("TotalPrice").show()

+--------------+------------------+
|       Country|   sum(TotalPrice)|
+--------------+------------------+
|United Kingdom| 340.2000026702881|
| United States| 79.20000076293945|
|       Germany|             180.0|
|       Austria|23.400001525878906|
|   Switzerland|23.400001525878906|
+--------------+------------------+



In [59]:
#3: Third way
spark.sql("""
SELECT Country, sum(Quantity * UnitPrice) as TotalPrice
FROM sales
GROUP BY Country
""").show()

+--------------+------------------+
|       Country|        TotalPrice|
+--------------+------------------+
|United Kingdom| 340.2000026702881|
| United States| 79.20000076293945|
|       Germany|             180.0|
|       Austria|23.400001525878906|
|   Switzerland|23.400001525878906|
+--------------+------------------+

