In [0]:
import pyspark.sql.functions as F

In [0]:
# Source table for this notebook: bakehouse customers from the samples catalog.
SALES_CUSTOMERS_TABLE = "samples.bakehouse.sales_customers"
df_sales_customers = spark.read.table(SALES_CUSTOMERS_TABLE)

# Print the column names/types, then preview three rows as a pandas frame
# (the last expression is what the cell renders).
df_sales_customers.printSchema()
df_sales_customers.limit(3).toPandas()

In [0]:
# Every profile below builds one aggregate expression per column.
column_names = df_sales_customers.columns

# Number of distinct values in each column. Observed: 300 rows.
distinct_count_exprs = [
    F.countDistinct(F.col(name)).alias(name) for name in column_names
]
df_sales_customers.select(distinct_count_exprs).show()

# Number of NULLs in each column: when() produces a non-null marker only for
# NULL cells, and count() tallies just those markers. Observed: no NULLs, this is nice.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name) for name in column_names
]
df_sales_customers.select(null_count_exprs).show()

In `01_bakehouse_exloratory_analysis` I noticed 'Matthew' can be female.

Let's create a view and quickly check whether other first names also appear with both genders.


In [0]:
# Register the DataFrame as a temporary view (replacing any existing view of the
# same name) so it can be queried by name, `vw_sales_customers`, from SQL cells.
df_sales_customers.createOrReplaceTempView("vw_sales_customers")


In [0]:
%%sql
WITH name_gender_pairs AS (
    -- one row per distinct (first_name, gender) combination
    SELECT DISTINCT
        first_name,
        gender
    FROM vw_sales_customers
),

ambiguous_names AS (
    -- first names that occur with more than one distinct gender
    SELECT
        first_name,
        COUNT(*) AS count_first_name
    FROM name_gender_pairs
    GROUP BY first_name
    HAVING COUNT(*) > 1
)

-- The pairs are already distinct, so joining them to the ambiguous names lists
-- each affected name once per gender; the INNER JOIN acts purely as a filter.
SELECT
    pairs.first_name,
    pairs.gender,
    ambiguous.count_first_name
FROM name_gender_pairs AS pairs
INNER JOIN ambiguous_names AS ambiguous
    ON pairs.first_name = ambiguous.first_name
ORDER BY
    ambiguous.count_first_name DESC,
    pairs.first_name,
    pairs.gender


There ☝️ are `74/2 == 37` names with both genders.
This means we need to do some extra work to get the gender right.