In [0]:
from pyspark.sql import functions as F
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [0]:
# First component of the three-part table name passed to spark.table() below
# (presumably the catalog holding the movie datasets — confirm for your workspace).
catalog_name = "movies"

In [0]:
# Load the enriched IMDB reviews table and preview a handful of rows.
reviews_table = f'{catalog_name}.silver.silver_imdb_reviews_enriched'
df_reviews = spark.table(reviews_table)
display(df_reviews.limit(5))

In [0]:
# Class-balance check: number of rows per sentiment label.
label_counts = df_reviews.groupBy("sentiment_label").count()
label_counts.show()


In [0]:
# Restrict the data to the two classes the binary classifier is trained on;
# rows with any other (or null) sentiment label are discarded.
binary_labels = ["positive", "negative"]
df_reviews = df_reviews.where(F.col("sentiment_label").isin(binary_labels))

In [0]:
# Collect the text and label columns to the driver as a pandas DataFrame for
# scikit-learn. Rows whose review text is null are dropped in Spark first:
# TfidfVectorizer cannot process a missing document and would fail later on.
df = (
    df_reviews
    .select("review", "sentiment_label")
    .dropna(subset=["review"])
    .toPandas()
)

In [0]:
# Encode the string sentiment as a binary target (1 = positive, 0 = negative).
label_to_int = {"positive": 1, "negative": 0}
df["label"] = df["sentiment_label"].map(label_to_int)

In [0]:
# Preview the first four rows to sanity-check the review text and the new label column.
df.head(4)

In [0]:
# Features are the raw review text; the target is the binary label.
X, y = df["review"], df["label"]

# 80/20 hold-out split, stratified on the label so both splits keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [0]:
# Hyperparameter grid: TF-IDF vocabulary sizes and inverse regularisation
# strengths (C) for logistic regression.
tfidf_features = [3000, 5000]
C_values = [0.1, 1.0, 10.0]

# All runs started after this call are recorded under this experiment.
mlflow.set_experiment("/Sentiment_Classification")

In [0]:
# Grid search over TF-IDF vocabulary size and logistic-regression C.
# Every (max_features, C) combination is tracked as its own MLflow run.
for max_feat in tfidf_features:
    # The TF-IDF features depend only on max_feat, so the vectorizer is fitted
    # once per vocabulary size instead of being re-fitted for every value of C.
    tfidf = TfidfVectorizer(
        max_features=max_feat,
        stop_words="english"
    )

    # Fit on the training split only; the test split is transformed with the
    # vocabulary and IDF weights learned from training data (no leakage).
    X_train_vec = tfidf.fit_transform(X_train)
    X_test_vec = tfidf.transform(X_test)

    for C in C_values:
        with mlflow.start_run(run_name=f"logreg_tfidf_{max_feat}_C{C}"):

            # Model
            model = LogisticRegression(C=C, max_iter=1000)
            model.fit(X_train_vec, y_train)

            # Evaluation on the held-out split
            y_pred = model.predict(X_test_vec)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            # Log to MLflow (batched: one call for params, one for metrics)
            mlflow.log_params({
                "model": "LogisticRegression",
                "max_features": max_feat,
                "C": C,
            })
            mlflow.log_metrics({
                "accuracy": acc,
                "f1_score": f1,
            })

            # The classifier only accepts TF-IDF vectors, so the fitted
            # vectorizer is logged next to it; without it the run's model
            # cannot be applied to raw review text.
            mlflow.sklearn.log_model(model, "model")
            mlflow.sklearn.log_model(tfidf, "tfidf_vectorizer")

In [0]:
# Look up the experiment the training runs were logged to.
experiment = mlflow.get_experiment_by_name("/Sentiment_Classification")

# get_experiment_by_name returns None when no experiment has that name; fail
# with a clear message instead of an AttributeError on the next line.
if experiment is None:
    raise ValueError(
        "MLflow experiment '/Sentiment_Classification' was not found; "
        "run the training cells first."
    )

experiment_id = experiment.experiment_id

In [0]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Fetch the single run with the highest logged F1 score.
# `experiment_ids` is documented as a list of experiment IDs, so the ID is
# wrapped in a list rather than passed as a bare string.
ranked_runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.f1_score DESC"],
    max_results=1,
)

# An experiment without runs yields an empty result; report that explicitly
# instead of failing with an IndexError.
if not ranked_runs:
    raise RuntimeError(
        f"No runs found in experiment {experiment_id}; "
        "run the training cells before selecting the best run."
    )

best_run = ranked_runs[0]


In [0]:
# Summarise the winning run: its ID, hyperparameters and evaluation metrics.
best_run_summary = [
    ("Best Run ID:", best_run.info.run_id),
    ("Best Params:", best_run.data.params),
    ("Best Metrics:", best_run.data.metrics),
]
for heading, value in best_run_summary:
    print(heading, value)
