In [0]:
%sql
-- Describe Bronze table
DESCRIBE TABLE ecommerce.bronze.events;



col_name,data_type,comment
event_time,timestamp,
event_type,string,
product_id,int,
category_id,bigint,
category_code,string,
brand,string,
price,double,
user_id,int,
user_session,string,
ingestion_time,timestamp,


In [0]:
%sql
-- Describe Silver table
DESCRIBE TABLE ecommerce.silver.daily_sales;



col_name,data_type,comment
event_date,date,
event_type,string,
total_events,bigint,
total_revenue,double,


In [0]:
%sql
-- Describe Gold tables
DESCRIBE TABLE ecommerce.gold.products;

DESCRIBE TABLE ecommerce.gold.top_products;

col_name,data_type,comment
event_date,date,
event_type,string,
total_events,bigint,
total_revenue,double,


In [0]:

## Load data
base_df = spark.table("ecommerce.gold.top_products").dropna()
base_df.printSchema()
display(base_df)



root
 |-- event_date: date (nullable = true)
 |-- event_type: string (nullable = true)
 |-- total_events: long (nullable = true)
 |-- total_revenue: double (nullable = true)



event_date,event_type,total_events,total_revenue
2019-11-17,purchase,185195,57774481.92000609
2019-11-17,cart,426941,125024125.30996367
2019-11-17,view,5783241,1693681884.268523
2019-11-19,view,1631029,464031090.5101103
2019-11-20,cart,72404,17775239.260001414
2019-11-19,cart,72545,18588066.890001483
2019-11-19,purchase,24967,7291407.759999855
2019-11-21,purchase,25266,6970110.370000055
2019-11-20,purchase,24947,7089210.269999973
2019-11-16,view,6027932,1847877416.6506195


In [0]:
%pip install transformers torch

Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting torch
  Downloading torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl.metadata (31 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2026.1.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (7.3 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3

In [0]:
## Simple NLP Task

from transformers import pipeline

classifier = pipeline("sentiment-analysis")

reviews = [
    "This product is amazing!",
    "Terrible quality, waste of money",
    "Very satisfied with the purchase",
    "Not worth the price"
]

results = classifier(reviews)
results

  from torch.utils._pytree import _broadcast_to_and_flatten, tree_flatten, tree_unflatten
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998860359191895},
 {'label': 'NEGATIVE', 'score': 0.9998160004615784},
 {'label': 'POSITIVE', 'score': 0.9997738003730774},
 {'label': 'NEGATIVE', 'score': 0.9998005032539368}]

In [0]:

## Log NLP Model with MLflow

import mlflow

with mlflow.start_run(run_name="sentiment_analysis_nlp"):
    mlflow.log_param("model", "distilbert-base-uncased-finetuned-sst-2-english")
    mlflow.log_param("task", "sentiment-analysis")
    
    # Example metric (for demo)
    mlflow.log_metric("sample_accuracy", 0.95)