The purpose of this notebook is to practice the most common data exploration techniques on the `bakehouse` sample data.

In [0]:
import plotly.express as px
import pyspark.sql.functions as F 
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import to_timestamp,col,lit
from pyspark.sql import Window

1: sales_customers

In [0]:
# Inspect the table's storage details before loading it.
# A `format` value of 'delta' in the result means it is a Delta table.
(
    spark.sql("DESCRIBE DETAIL samples.bakehouse.sales_customers")
    .select("format", "numFiles", "location")
    .collect()
)

In [0]:
# The source is a Delta table, so it is loaded by name via read.table.
df_sales_customers = spark.read.table(
    "samples.bakehouse.sales_customers"
)

In [0]:
# Print the column names/types, then preview the first 3 rows as a pandas DataFrame.
df_sales_customers.printSchema()
customers_preview = df_sales_customers.limit(3)
customers_preview.toPandas()

In [0]:
# Basic per-column statistics: count, mean, stddev, min, max.
customers_stats = df_sales_customers.describe()
customers_stats.toPandas()

In [0]:
# Number of distinct values in every column of the DataFrame.
customers_distinct_exprs = [
    F.countDistinct(F.col(name)).alias(name)
    for name in df_sales_customers.columns
]
df_sales_customers.select(customers_distinct_exprs).show()

# Number of NULLs per column: `when` only yields a value for NULL rows,
# so `count` counts exactly those rows.
customers_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_sales_customers.columns
]
df_sales_customers.select(customers_null_exprs).show()

In [0]:
# Row counts per gender/continent/country/state/city combination (top 3 by count).
(
    df_sales_customers
    .groupBy("gender", "continent", "country", "state", "city")
    .count()
    .orderBy("count", ascending=False)
    .show(3)
)

# Only one customer per city, which is unexpected; look at a single city as an example.
df_sales_customers.filter(F.col("city") == "Jamesville").toPandas()

In [0]:
# Same breakdown one level up: row counts per state, leaving the city out (top 3 by count).
customers_per_state = (
    df_sales_customers
    .groupBy("gender", "continent", "country", "state")
    .count()
)
customers_per_state.orderBy("count", ascending=False).show(3)

In [0]:
# Customer counts per gender and country, shown as a table (largest groups first).
df_sales_customers_grouped = (
    df_sales_customers
    .groupBy("gender", "country")
    .count()
)
df_sales_customers_grouped_toPandas = (
    df_sales_customers_grouped
    .orderBy("count", ascending=False)
    .toPandas()
)
df_sales_customers_grouped_toPandas

In [0]:
# Plot the grouped counts from the previous cell as a stacked bar chart:
# one bar per country, split by gender.
counties_gender_barchart = px.bar(
    df_sales_customers_grouped_toPandas,
    x="country",
    y="count",
    color="gender",
    barmode="stack",
    title="Countries by Customer Count (Stacked by Gender)",
)
counties_gender_barchart.show()

In [0]:
# Explore the most popular first names for male and female customers.
def _top_first_names(gender_filter, limit=3):
    """Return the `limit` most frequent first names among rows matching `gender_filter`."""
    name_counts = (
        df_sales_customers
        .filter(gender_filter)
        .groupBy("gender", "first_name")
        .count()
    )
    return name_counts.orderBy("count", ascending=False).limit(limit)


top_male_names = _top_first_names("gender = 'male'")
top_female_names = _top_first_names("gender = 'female'")

display(top_male_names.union(top_female_names))
# Mathew shows up as female and Jennifer as male, so the data
# looks randomly generated rather than realistic.

2: sales_suppliers

In [0]:
# Load the suppliers table, print its schema and preview the first 3 rows.
df_sales_suppliers = spark.read.table(
    "samples.bakehouse.sales_suppliers"
)
df_sales_suppliers.printSchema()
suppliers_preview = df_sales_suppliers.limit(3)
suppliers_preview.toPandas()


In [0]:
# Basic per-column statistics: count, mean, stddev, min, max.
suppliers_stats = df_sales_suppliers.describe()
suppliers_stats.toPandas()

In [0]:
supplier_columns = df_sales_suppliers.columns

# Number of distinct values in every column of the DataFrame.
df_sales_suppliers.select(
    [F.countDistinct(F.col(name)).alias(name) for name in supplier_columns]
).show()

# Number of NULLs per column (`count` only counts the rows where `when` matched).
df_sales_suppliers.select(
    [F.count(F.when(F.col(name).isNull(), name)).alias(name) for name in supplier_columns]
).show()

In [0]:
# Supplier rows per ingredient, 5 largest groups.
(
    df_sales_suppliers
    .groupBy("ingredient")
    .count()
    .orderBy("count", ascending=False)
    .limit(5)
    .show()
)
# Finding: there is only one supplier per ingredient.

In [0]:
# Supplier rows per continent, largest first.
suppliers_per_continent = df_sales_suppliers.groupBy("continent").count()
suppliers_per_continent.orderBy("count", ascending=False).show()
# Finding: most ingredients come from Asia.

3: sales_franchises

In [0]:
# Load the franchises table, print its schema and preview the first 3 rows.
df_sales_franchises = spark.read.table(
    "samples.bakehouse.sales_franchises"
)
df_sales_franchises.printSchema()
franchises_preview = df_sales_franchises.limit(3)
franchises_preview.toPandas()

In [0]:
# Basic per-column statistics: count, mean, stddev, min, max.
franchises_stats = df_sales_franchises.describe()
franchises_stats.show()

In [0]:
# Number of distinct values in every column of the DataFrame.
franchises_distinct_exprs = [
    F.countDistinct(F.col(name)).alias(name)
    for name in df_sales_franchises.columns
]
df_sales_franchises.select(franchises_distinct_exprs).show()

# Number of NULLs per column (`count` only counts the rows where `when` matched).
franchises_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_sales_franchises.columns
]
df_sales_franchises.select(franchises_null_exprs).show()

In [0]:
# Explore the L, XL and XXL franchises: counts per country and size,
# sorted by country and size in descending order.
large_franchises = df_sales_franchises.where("size NOT IN ('S', 'M')")  # drop S and M
large_franchise_counts = large_franchises.groupBy("country", "size").count()
large_franchise_counts.orderBy("country", "size", ascending=False).show()

In [0]:
# Same figures as the table above, drawn as a stacked bar chart with the
# countries ordered by their total franchise count.
df_sales_franchises_grouped_toPandas = (
    df_sales_franchises
    .where("size NOT IN ('S', 'M')")  # excluding S and M
    .groupBy("country", "size")
    .count()
    # Window sum attaches the per-country total to every (country, size) row.
    .withColumn("countPerCountry", F.sum("count").over(Window.partitionBy("country")))
    .orderBy("countPerCountry", ascending=False)
    .toPandas()
)
counties_size_barchart = px.bar(
    df_sales_franchises_grouped_toPandas,
    x="country",
    y="count",
    color="size",
    title="Countries by Franchise Count (Stacked by Size)",
    barmode="stack",
)
# With `color`, Plotly draws one trace per size and orders the x categories by
# first appearance across traces, so the row order alone does not guarantee
# that bars are sorted by total. Ask the axis to sort by stacked total instead.
counties_size_barchart.update_xaxes(categoryorder="total descending")
counties_size_barchart.show()

In [0]:
# Cities that host more than one franchise, together with their countries.
franchises_per_city = df_sales_franchises.groupBy("country", "city").count()
multi_franchise_cities = franchises_per_city.where("count > 1")
multi_franchise_cities.orderBy("country", "count", ascending=False).show()

# Cross-check one city: there are indeed two franchise points located in Seattle.
seattle_franchises = df_sales_franchises.where("city = 'Seattle'")
seattle_franchises.show()

4: sales_transactions

In [0]:
# Load the transactions table, print its schema and preview the first 3 rows.
sales_transactions = spark.read.table(
    "samples.bakehouse.sales_transactions"
)
sales_transactions.printSchema()
transactions_preview = sales_transactions.limit(3)
transactions_preview.toPandas()

In [0]:
transaction_columns = sales_transactions.columns

# Number of distinct values in every column of the DataFrame.
sales_transactions.select(
    [F.countDistinct(F.col(name)).alias(name) for name in transaction_columns]
).show()

# Number of NULLs per column (`count` only counts the rows where `when` matched).
sales_transactions.select(
    [F.count(F.when(F.col(name).isNull(), name)).alias(name) for name in transaction_columns]
).show()

In [0]:
# Uses F.sum (imported at the top of the notebook) instead of
# `from pyspark.sql.functions import sum`, which would shadow Python's
# builtin `sum` for every cell that runs afterwards.

# Sales by product: summed totalPrice and quantity per product/unitPrice,
# highest summed totalPrice first.
(
    sales_transactions
    .groupBy("product", "unitPrice")
    .agg(
        F.sum("totalPrice").alias("sum_totalPrice"),
        F.sum("quantity").alias("sum_quantity"),
    )
    .orderBy("sum_totalPrice", ascending=False)
    .show(truncate=False)
)

# Sales by payment method: same aggregates per paymentMethod.
(
    sales_transactions
    .groupBy("paymentMethod")
    .agg(
        F.sum("totalPrice").alias("sum_totalPrice"),
        F.sum("quantity").alias("sum_quantity"),
    )
    .orderBy("sum_totalPrice", ascending=False)
    .show(truncate=False)
)