The purpose of this notebook is to practice the most common data exploration techniques on the `bakehouse` sample data

1: sales_customers

In [0]:
import plotly.express as px

In [0]:
# Loading data: first look at the table's storage details
(
    spark.sql("DESCRIBE DETAIL samples.bakehouse.sales_customers")
    .select("format", "numFiles", "location")
    .collect()
)
# A result with format='delta' means this is a Delta table

In [0]:
# It's a Delta table, so it is read by name with read.table
df_sales_customers = (
    spark.read
    .table("samples.bakehouse.sales_customers")
)

In [0]:
# Print the schema, then preview the first 3 rows as a pandas-style DataFrame
df_sales_customers.printSchema()
(
    df_sales_customers
    .limit(3)
    .toPandas()
)

In [0]:
# Row counts at gender/continent/country/state/city granularity (top 3 groups)
(
    df_sales_customers
    .groupBy("gender", "continent", "country", "state", "city")
    .count()
    .orderBy("count", ascending=False)
    .show(3)
)

# Wow, just one customer per city, that's strange - inspect one of those cities
df_sales_customers.filter(df_sales_customers.city == "Jamesville").toPandas()

In [0]:
# Same count without the city column: rows per gender/continent/country/state (top 3 groups)
(
    df_sales_customers
    .groupBy("gender", "continent", "country", "state")
    .count()
    .orderBy("count", ascending=False)
    .show(3)
)

In [0]:
# Male/female customer count per country, shown as a table
df_sales_customers_grouped = (
    df_sales_customers
    .groupBy("gender", "country")
    .count()
)
# Sort by count (largest first) and convert to pandas for display and plotting
df_sales_customers_grouped_toPandas = (
    df_sales_customers_grouped
    .orderBy("count", ascending=False)
    .toPandas()
)
df_sales_customers_grouped_toPandas

In [0]:
# Visualise the same counts as a stacked bar chart: one bar per country, coloured by gender
counties_gender_barchart = px.bar(
    df_sales_customers_grouped_toPandas,
    x="country",
    y="count",
    color="gender",
    title="Countries by Customer Count (Stacked by Gender)",
    barmode="stack",
)
counties_gender_barchart.show()

In [0]:
# Let's explore the most popular male and female first names


def top_first_names(customers, gender, n=3):
    """Return the ``n`` most frequent first names for one gender.

    Args:
        customers: Spark DataFrame with ``gender`` and ``first_name`` columns.
        gender: Value of the ``gender`` column to keep (e.g. ``"male"``).
        n: Number of names to return (default 3).

    Returns:
        Spark DataFrame with columns ``gender``, ``first_name`` and ``count``,
        limited to ``n`` rows, most frequent first.
    """
    return (
        customers
        .filter(customers["gender"] == gender)
        .groupBy("gender", "first_name")
        .count()
        # Tie-break on first_name so names with equal counts give the same
        # top-n on every run instead of an arbitrary pick.
        .orderBy(["count", "first_name"], ascending=[False, True])
        .limit(n)
    )


top_male_names = top_first_names(df_sales_customers, "male")
top_female_names = top_first_names(df_sales_customers, "female")

display(top_male_names.union(top_female_names))
# Mathew is female, Jennifer is male - looks like the data has been randomly generated w/o proper sense

2: sales_suppliers

In [0]:
# Load the suppliers table, print its schema and preview the first 3 rows
df_sales_suppliers = (
    spark.read
    .table("samples.bakehouse.sales_suppliers")
)
df_sales_suppliers.printSchema()
(
    df_sales_suppliers
    .limit(3)
    .toPandas()
)