In [1]:
from pathlib import Path
from typing import Dict, Any
import mlflow

# Paths and constants
DATA_PATH = Path(
    "../data/gold/sentiment/amazon_reviews_furniture/sentiment.parquet/")
EXPERIMENT_NAME = "sentiment_baselines"
MLFLOW_URI = "file:../mlruns"
RANDOM_SEED = 42
TARGET_COL = "sentiment_label"
TEXT_COL = "clean_text"

# Model save locations
OUTPUT_DIR = Path("../models")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location=('file:///Users/leandrohermann/Library/CloudStorage/OneDrive-Personal/ITBA/Big '
 'Data/tp/itba-bigdata/notebooks/../mlruns/206939664416179806'), creation_time=1763089599312, experiment_id='206939664416179806', last_update_time=1763089599312, lifecycle_stage='active', name='sentiment_baselines', tags={'mlflow.experimentKind': 'custom_model_development', 'step': 'model_training'}>

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

spark = SparkSession.builder \
    .appName("SentimentAnalysisPrototype") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")
logger.info(f"Spark version: {spark.version}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 21:14:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load data
logger.info(f"Loading parquet from {DATA_PATH}")
df = spark.read.parquet(str(DATA_PATH))
logger.info(f"Rows: {df.count()}, Columns: {len(df.columns)}")
df.printSchema()

                                                                                

root
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- clean_text: string (nullable = true)
 |-- sentiment_label: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- review_date: date (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- verified_purchase: boolean (nullable = true)



In [4]:
# Quick preview and basic quality checks
display_cols = ["review_id", "product_id", "clean_text",
                "sentiment_label", "star_rating", "review_date"]
df.select(*display_cols).limit(10).toPandas()

Unnamed: 0,review_id,product_id,clean_text,sentiment_label,star_rating,review_date
0,R1000ZEHB3O259,B0044WWL34,using on a front hall throw rug and the grippi...,positive,5,2015-03-20
1,R1002K4NKXO8VI,B00166GCJ0,it's a cheap chair. it doesn't cost much and i...,neutral,3,2009-08-27
2,R1002MW05H9LI,B00EQG7KEI,"wonderful , sturdy , easy to set up. no box sp...",positive,5,2015-07-23
3,R10034VWRCN7PM,B006JG0HMK,i had 3 issues with the product upon receipt. ...,negative,1,2015-06-30
4,R1003EZF56E6JB,B0042D8VRU,the ups guy said this was the heaviest item he...,positive,5,2012-12-23
5,R1003L1ICGH3Y8,B00E44V6II,nice. i have not had a wake up call from someo...,positive,5,2015-01-14
6,R10046A1NGXKTX,B00JJZ18PS,been 1 week and my sleep has been better. the ...,positive,4,2014-08-13
7,R1004LAAZLOLYN,B00WEYR8KU,this is just ok for the price. you can take th...,neutral,3,2015-07-28
8,R1004ZAILG13HW,B00E8ONKB0,making a fleece tie blanket with the material....,positive,5,2014-10-19
9,R1005692J177UC,B008KP6D2G,"great looking rug, a little thin but overall r...",positive,4,2015-05-15


In [5]:
# ----------------------------------------------------
# Baseline Experiments: LR vs SVM vs Naive Bayes
# ----------------------------------------------------
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LinearSVC, NaiveBayes, OneVsRest
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import mlflow
import mlflow.spark

# Common preprocessing stages
regex_tokenizer = RegexTokenizer(
    inputCol="clean_text", outputCol="tokens", pattern="\\W+"
)
stop_remover = StopWordsRemover(
    inputCol="tokens", outputCol="tokens_nostop"
)
hashing_tf = HashingTF(
    inputCol="tokens_nostop", outputCol="rawFeatures", numFeatures=2**16
)
idf = IDF(inputCol="rawFeatures", outputCol="features")
label_indexer = StringIndexer(
    inputCol="sentiment_label", outputCol="label_idx", handleInvalid="keep"
)

# Train/test split
train_df, test_df = df.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

# Evaluation function
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_idx", predictionCol="prediction", metricName="f1"
)


def run_experiment(model_name, classifier, params: dict):
    """Run one ML experiment with MLflow tracking."""

    with mlflow.start_run(run_name=model_name):

        # Log parameters
        for k, v in params.items():
            mlflow.log_param(k, v)

        # Build pipeline
        pipeline = Pipeline(stages=[
            regex_tokenizer,
            stop_remover,
            hashing_tf,
            idf,
            label_indexer,
            classifier
        ])

        # Train
        model = pipeline.fit(train_df)

        # Predict
        preds = model.transform(test_df)

        # Evaluate
        f1 = evaluator.evaluate(preds)
        mlflow.log_metric("f1_score", f1)

        # Log model artifact
        mlflow.spark.log_model(model, artifact_path=f"{model_name}_model")

        print(f"{model_name} | F1 = {f1:.4f}")

        return model, f1

25/12/07 21:14:59 WARN StopWordsRemover: Default locale set was [en_AR]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [6]:
lr_variants = [
    ("LR_reg_0.01", LogisticRegression(labelCol="label_idx", featuresCol="features",
                                       maxIter=20, regParam=0.01),
     {"regParam": 0.01, "maxIter": 20}),

    ("LR_reg_0.1", LogisticRegression(labelCol="label_idx", featuresCol="features",
                                      maxIter=20, regParam=0.1),
     {"regParam": 0.1, "maxIter": 20})
]
svm_variants = [
    ("SVM_OVR_reg_0.1",
     OneVsRest(
         classifier=LinearSVC(
             labelCol="label_idx",
             featuresCol="features",
             maxIter=20,
             regParam=0.1
         ),
         labelCol="label_idx",
         featuresCol="features"
     ),
     {"model": "LinearSVC OVR", "regParam": 0.1, "maxIter": 20}
     ),

    ("SVM_OVR_reg_0.01",
     OneVsRest(
         classifier=LinearSVC(
             labelCol="label_idx",
             featuresCol="features",
             maxIter=30,
             regParam=0.01
         ),
         labelCol="label_idx",
         featuresCol="features"
     ),
     {"model": "LinearSVC OVR", "regParam": 0.01, "maxIter": 30}
     )
]
nb_variants = [
    ("NB_smoothing_1", NaiveBayes(labelCol="label_idx", featuresCol="features",
                                  modelType="multinomial", smoothing=1.0),
     {"smoothing": 1.0}),

    ("NB_smoothing_0.5", NaiveBayes(labelCol="label_idx", featuresCol="features",
                                    modelType="multinomial", smoothing=0.5),
     {"smoothing": 0.5})
]

In [None]:
all_variants = lr_variants + nb_variants + svm_variants


results = {}
mlflow.set_experiment("sentiment_baselines")

for model_name, clf, params in all_variants:
    model, f1 = run_experiment(model_name, clf, params)
    results[model_name] = f1

print("All results:", results)

25/12/07 21:15:06 WARN DAGScheduler: Broadcasting large task binary with size 1129.9 KiB
25/12/07 21:15:10 WARN DAGScheduler: Broadcasting large task binary with size 1131.0 KiB
25/12/07 21:15:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/12/07 21:15:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/12/07 21:15:11 WARN DAGScheduler: Broadcasting large task binary with size 1130.5 KiB
25/12/07 21:15:13 WARN MemoryStore: Not enough space to cache rdd_50_5 in memory! (computed 17.0 MiB so far)
25/12/07 21:15:13 WARN BlockManager: Persisting block rdd_50_5 to disk instead.
25/12/07 21:15:13 WARN MemoryStore: Not enough space to cache rdd_50_1 in memory! (computed 17.0 MiB so far)
25/12/07 21:15:13 WARN BlockManager: Persisting block rdd_50_1 to disk instead.
25/12/07 21:15:14 WARN MemoryStore: Not enough space to cache rdd_50_7 in memory! (computed 33.0 MiB so far)
25/12/07 21:15:14 WARN Memory

LR_reg_0.01 | F1 = 0.8287


25/12/07 21:15:37 WARN DAGScheduler: Broadcasting large task binary with size 1129.9 KiB
25/12/07 21:15:41 WARN DAGScheduler: Broadcasting large task binary with size 1131.0 KiB
25/12/07 21:15:41 WARN DAGScheduler: Broadcasting large task binary with size 1130.5 KiB
25/12/07 21:15:43 WARN MemoryStore: Not enough space to cache rdd_217_1 in memory! (computed 17.0 MiB so far)
25/12/07 21:15:43 WARN BlockManager: Persisting block rdd_217_1 to disk instead.
25/12/07 21:15:43 WARN MemoryStore: Not enough space to cache rdd_217_5 in memory! (computed 17.0 MiB so far)
25/12/07 21:15:43 WARN BlockManager: Persisting block rdd_217_5 to disk instead.
25/12/07 21:15:44 WARN MemoryStore: Not enough space to cache rdd_217_7 in memory! (computed 33.0 MiB so far)
25/12/07 21:15:44 WARN BlockManager: Persisting block rdd_217_7 to disk instead.
25/12/07 21:15:44 WARN MemoryStore: Not enough space to cache rdd_217_3 in memory! (computed 33.0 MiB so far)
25/12/07 21:15:44 WARN BlockManager: Persisting bl

LR_reg_0.1 | F1 = 0.8043


25/12/07 21:16:04 WARN DAGScheduler: Broadcasting large task binary with size 1126.8 KiB
25/12/07 21:16:09 WARN DAGScheduler: Broadcasting large task binary with size 1105.5 KiB
25/12/07 21:16:09 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/12/07 21:16:11 WARN TaskSetManager: Stage 157 contains a task of very large size (1054 KiB). The maximum recommended task size is 1000 KiB.
25/12/07 21:16:11 WARN TaskSetManager: Stage 165 contains a task of very large size (1576 KiB). The maximum recommended task size is 1000 KiB.


NB_smoothing_1 | F1 = 0.8012


25/12/07 21:16:24 WARN DAGScheduler: Broadcasting large task binary with size 1126.8 KiB
25/12/07 21:16:29 WARN DAGScheduler: Broadcasting large task binary with size 1105.5 KiB
25/12/07 21:16:29 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/12/07 21:16:31 WARN TaskSetManager: Stage 183 contains a task of very large size (1054 KiB). The maximum recommended task size is 1000 KiB.
25/12/07 21:16:31 WARN TaskSetManager: Stage 191 contains a task of very large size (1576 KiB). The maximum recommended task size is 1000 KiB.


NB_smoothing_0.5 | F1 = 0.8003


25/12/07 21:16:44 WARN DAGScheduler: Broadcasting large task binary with size 1103.1 KiB
25/12/07 21:16:49 WARN BlockManager: Block rdd_525_7 could not be removed as it was not found on disk or in memory
25/12/07 21:16:49 ERROR Executor: Exception in task 7.0 in stage 202.0 (TID 1030)
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:64)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:363)
	at org.apache.spark.sql.execution.columnar.ColumnBuilder$.ensureFreeSpace(ColumnBuilder.scala:167)
	at org.apache.spark.sql.execution.columnar.BasicColumnBuilder.appendFrom(ColumnBuilder.scala:73)
	at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.org$apache$spark$sql$execution$columnar$NullableColumnBuilder$$super$appendFrom(ColumnBuilder.scala:93)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.appendFrom(NullableColumnBuilder.scala:61)
	at org.apache.spark.sql.execution.columnar.NullableColumnBui

ConnectionRefusedError: [Errno 61] Connection refused