
# 1. Bibliotecas

In [0]:
from pyspark.ml.fpm import FPGrowth
from datetime import datetime
from pyspark.sql.types import DateType, StringType
from pyspark.sql.functions import size, col, collect_list, collect_set, concat_ws, lit

---
# 2. Importando Dados

In [0]:
df_order_items = spark.table('datum.silver.olist_order_items')
df_products = spark.table('datum.silver.olist_products')
df_products = df_products.dropna(subset=['product_category_name'])

---
# 3. Aplicando modelo de ML

Foi usado o seguinte artigo como base para aplicação de um modelo já existente dentro do PySpark chamado de FPGrowth

<a href="https://www.databricks.com/blog/2018/09/18/simplify-market-basket-analysis-using-fp-growth-on-databricks.html">Marker Basket</a>

In [0]:
df_join = df_order_items.join(df_products, df_order_items.product_id == df_products.product_id, 'left').drop(df_products.product_id)

In [0]:
df_grouped = df_join.groupBy('order_id').agg(collect_set('product_category_name').alias('products'))

In [0]:
fpGrowth = FPGrowth(itemsCol='products', minSupport=0.0001, minConfidence=0.001)
model = fpGrowth.fit(df_grouped)

In [0]:
frequent_itemsets = model.freqItemsets
df_most_common_product_pair = frequent_itemsets.filter(size(frequent_itemsets['items']) == 2)

In [0]:
association_rules = model.associationRules

In [0]:
association_rules.display()

antecedent,consequent,confidence,lift,support
List(moveis_decoracao),List(cama_mesa_banho),0.0108543960303923,0.1137262226542091,0.0007094642531368455
List(moveis_decoracao),List(construcao_ferramentas_iluminacao),0.0017056908047759,0.6897282333771407,0.00011148723977864716
List(moveis_decoracao),List(utilidades_domesticas),0.0037215072104202,0.0624041859998846,0.0002432448867897756
List(moveis_decoracao),List(bebes),0.0018607536052101,0.0636371283229326,0.0001216224433948878
List(moveis_decoracao),List(casa_construcao),0.0020158164056442,0.4059031458761207,0.00013175764701112846
List(moveis_decoracao),List(ferramentas_jardim),0.0026360676073809,0.0739312809976841,0.00017229846147609105
List(cool_stuff),List(bebes),0.0055066079295154,0.1883240824865054,0.000202704072324813
List(cool_stuff),List(cama_mesa_banho),0.0027533039647577,0.0288475617486231,0.0001013520361624065
List(esporte_lazer),List(utilidades_domesticas),0.0014248704663212,0.0238929757698932,0.00011148723977864716
List(esporte_lazer),List(beleza_saude),0.0018134715025906,0.020249884481056,0.0001418928506273691



Conforme vemos no artigo o modelo procura prever qual a próxima categoria de produto que será comprada

In [0]:
df_most_common_product_pair = df_most_common_product_pair.withColumn(
    'categoria_produtos', concat_ws(', ', 'items')).select('categoria_produtos', 'freq')

In [0]:
df_most_common_product_pair = df_most_common_product_pair.withColumn('date_ref_carga', lit(datetime.now()).cast(DateType())).withColumn('freq', col('freq').cast(StringType()))

In [0]:
df_most_common_product_pair.orderBy('freq', ascending=False).display()

categoria_produtos,freq,date_ref_carga
"moveis_decoracao, cama_mesa_banho",70,2024-04-14
"casa_conforto, cama_mesa_banho",43,2024-04-14
"utilidades_domesticas, moveis_decoracao",24,2024-04-14
"utilidades_domesticas, cama_mesa_banho",20,2024-04-14
"bebes, cool_stuff",20,2024-04-14
"bebes, brinquedos",19,2024-04-14
"ferramentas_jardim, moveis_decoracao",17,2024-04-14
"bebes, cama_mesa_banho",17,2024-04-14
"esporte_lazer, beleza_saude",14,2024-04-14
"casa_construcao, moveis_decoracao",13,2024-04-14


---
# 4. Agrupando dados para termos a mesma visualização do modelo mas com os dados disponíveis

In [0]:
%sql

WITH
orders AS (
  SELECT DISTINCT order_id, product_id
  FROM datum.silver.olist_order_items
),

agrupado AS (
  SELECT order_id, COLLECT_LIST(product_category_name) AS product_category_name
  FROM orders AS o
    LEFT JOIN datum.silver.olist_products AS p
      ON o.product_id = p.product_id
  GROUP BY order_id
  HAVING SIZE(COLLECT_SET(p.product_category_name)) == 2)

SELECT REPLACE(REPLACE(CAST(product_category_name AS STRING), '[', ''), ']', '') AS categoria_produtos, 
       CAST(COUNT(*) AS STRING) AS total,
       CAST(NOW() AS DATE) AS date_ref_carga
FROM agrupado
GROUP BY product_category_name
ORDER BY COUNT(*) DESC

categoria_produtos,total,date_ref_carga
"moveis_decoracao, cama_mesa_banho",30,2024-04-14
"cama_mesa_banho, moveis_decoracao",24,2024-04-14
"casa_conforto, cama_mesa_banho",20,2024-04-14
"cama_mesa_banho, casa_conforto",17,2024-04-14
"bebes, brinquedos",14,2024-04-14
"cama_mesa_banho, utilidades_domesticas",12,2024-04-14
"utilidades_domesticas, moveis_decoracao",12,2024-04-14
"bebes, cool_stuff",9,2024-04-14
"moveis_decoracao, utilidades_domesticas",9,2024-04-14
"ferramentas_jardim, moveis_decoracao",8,2024-04-14


---
# 5. Delta Lake

In [0]:
%sql

USE CATALOG datum

In [0]:
%sql

USE DATABASE gold

In [0]:
%sql

CREATE TABLE IF NOT EXISTS olist_ml_pares_produtos
(
  categoria_produtos STRING,
  freq               STRING,
  date_ref_carga     DATE
)
USING DELTA
LOCATION 'abfss://unity-datum@datumunity.dfs.core.windows.net/gold/olist_ml_pares_produtos'
PARTITIONED BY (date_ref_carga)

In [0]:
if df_most_common_product_pair.count() != 0 and df_most_common_product_pair is not None:
    df_most_common_product_pair.write.format('delta').mode('overwrite').save('abfss://unity-datum@datumunity.dfs.core.windows.net/gold/olist_ml_pares_produtos')

In [0]:
%sql

CREATE TABLE IF NOT EXISTS olist_pares_produtos
(
  categoria_produtos STRING,
  total              STRING,
  date_ref_carga     DATE
)
USING DELTA
LOCATION 'abfss://unity-datum@datumunity.dfs.core.windows.net/gold/olist_pares_produtos'
PARTITIONED BY (date_ref_carga)

In [0]:
if _sqldf.count() != 0 and _sqldf is not None:
    _sqldf.write.format('delta').mode('overwrite').save('abfss://unity-datum@datumunity.dfs.core.windows.net/gold/olist_pares_produtos')

In [0]:
del df_most_common_product_pair, _sqldf