# Iteración 1: Modelo básico

In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ALS-Reco")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.enabled", "false")
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)

print("Spark version:", spark.version)


Spark version: 3.5.0


In [3]:
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

df = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet")
print("df loaded:", df.count())

ratings = (
    df.select("userId", "filmId", "rating")
    .dropna()
    .withColumn("userId", F.col("userId").cast("int"))
    .withColumn("filmId", F.col("filmId").cast("int"))
    .withColumn("rating", F.col("rating").cast("float"))
    .dropDuplicates(["userId", "filmId"])
)
print("ratings:", ratings.count())

train, test = ratings.randomSplit([0.8, 0.2], seed=42)
print("train:", train.count(), "test:", test.count())

als = ALS(
    userCol="userId",
    itemCol="filmId",
    ratingCol="rating",
    rank=20,
    maxIter=15,
    regParam=0.1,
    coldStartStrategy="drop",
    nonnegative=True
)

model = als.fit(train)
preds = model.transform(test)

rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction").evaluate(preds)
print(f"RMSE: {rmse:.4f}")

model.recommendForAllUsers(5).show(10, truncate=False)


df loaded: 1000209
ratings: 1000209
train: 799983 test: 200226
RMSE: 0.8627
+------+-------------------------------------------------------------------------------------------+
|userId|recommendations                                                                            |
+------+-------------------------------------------------------------------------------------------+
|1     |[{572, 5.4786415}, {318, 4.6444197}, {3233, 4.636641}, {527, 4.6278057}, {953, 4.5249}]    |
|3     |[{572, 5.399514}, {37, 4.7515473}, {811, 4.6856203}, {110, 4.5912414}, {318, 4.5528603}]   |
|5     |[{3338, 4.3667197}, {1743, 4.30837}, {2309, 4.2805104}, {557, 4.1735425}, {771, 4.094903}] |
|6     |[{572, 5.6090546}, {687, 5.005961}, {985, 4.822717}, {1164, 4.805526}, {2197, 4.75691}]    |
|9     |[{572, 4.7859836}, {318, 4.5096383}, {1851, 4.474278}, {2905, 4.4189277}, {50, 4.398362}]  |
|12    |[{572, 5.2123504}, {858, 4.692564}, {2309, 4.6253304}, {318, 4.5641427}, {2905, 4.5414977}]|
|13    |[{572, 

In [3]:
from pyspark.ml.recommendation import ALSModel

model_path = "/home/jovyan/work/Modelo/Modelo_als/als1"  # ajusta si cambiaste la ruta
loaded = ALSModel.load(model_path)

to_score = spark.createDataFrame(
    [(1, 10), (1, 20), (2, 10), (3, 50)],
    ["userId", "filmId"]
)

loaded.transform(to_score).show(truncate=False)


+------+------+----------+
|userId|filmId|prediction|
+------+------+----------+
|1     |10    |3.3299048 |
|1     |20    |2.7066665 |
|2     |10    |3.3457944 |
|3     |50    |4.482435  |
+------+------+----------+



---

# Iteración 2: Modelo básico con aviso de porcentaje de entrenamiento estimado

In [None]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ALS-Reco")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

print("Spark version:", spark.version)


Spark version: 3.5.0


In [2]:
spark.sparkContext.setLogLevel("INFO")

In [13]:
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

df = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet")
print("df loaded:", df.count())

ratings = (
    df.select("userId", "filmId", "rating")
    .dropna()
    .withColumn("userId", F.col("userId").cast("int"))
    .withColumn("filmId", F.col("filmId").cast("int"))
    .withColumn("rating", F.col("rating").cast("float"))
    .dropDuplicates(["userId", "filmId"])
)
print("ratings:", ratings.count())

train, test = ratings.randomSplit([0.8, 0.2], seed=42)
print("train:", train.count(), "test:", test.count())

als = ALS(
    userCol="userId",
    itemCol="filmId",
    ratingCol="rating",
    rank=20,
    maxIter=15,
    regParam=0.1,
    coldStartStrategy="drop",
    nonnegative=True
)


df loaded: 1000209
ratings: 1000209
train: 799983 test: 200226


In [19]:
import threading, time, requests

stop_flag = False

def monitor_jobs(ui_port=4040, interval=0.5):
    base = f"http://localhost:{ui_port}/api/v1"
    try:
        app_id = requests.get(f"{base}/applications").json()[0]["id"]
    except Exception:
        return
    while not stop_flag:
        try:
            jobs = requests.get(f"{base}/applications/{app_id}/jobs").json()
            active = [j for j in jobs if j.get("status") == "RUNNING"]
            if active:
                j = active[0]
                done = j.get("numCompletedTasks", 0)
                total = j.get("numTasks", 0) or 1
                pct = 100 * done / total
                print(f"[job] {j['jobId']} {pct:.1f}% ({done}/{total})")
            time.sleep(interval)
        except Exception:
            break

t = threading.Thread(target=monitor_jobs, daemon=True)
t.start()

model = als.fit(train)

stop_flag = True
t.join()


[job] 194 0.0% (0/2)
[job] 196 0.0% (0/16)
[job] 196 0.0% (0/16)
[job] 196 6.2% (1/16)
[job] 196 25.0% (4/16)
[job] 197 12.5% (2/16)
[job] 198 3.1% (10/318)
[job] 198 7.2% (23/318)
[job] 198 10.4% (33/318)
[job] 198 12.6% (40/318)
[job] 198 15.7% (50/318)
[job] 198 18.9% (60/318)
[job] 198 22.0% (70/318)
[job] 198 25.2% (80/318)
[job] 198 27.0% (86/318)
[job] 198 29.9% (95/318)
[job] 198 33.0% (105/318)
[job] 198 36.8% (117/318)
[job] 198 39.9% (127/318)
[job] 198 42.8% (136/318)
[job] 198 45.6% (145/318)
[job] 198 48.4% (154/318)
[job] 198 51.9% (165/318)
[job] 198 55.0% (175/318)
[job] 198 56.6% (180/318)
[job] 198 59.7% (190/318)
[job] 198 62.9% (200/318)
[job] 198 66.4% (211/318)
[job] 198 69.2% (220/318)
[job] 198 72.3% (230/318)
[job] 198 75.5% (240/318)
[job] 198 78.6% (250/318)
[job] 198 81.8% (260/318)
[job] 198 84.9% (270/318)
[job] 198 86.8% (276/318)
[job] 198 89.9% (286/318)
[job] 198 91.8% (292/318)
[job] 198 95.0% (302/318)
[job] 199 0.0% (0/308)


In [22]:
import threading, time, requests

stop_flag = False

def monitor_jobs(ui_port=4040, interval=0.5):
    base = f"http://localhost:{ui_port}/api/v1"
    
    # 1. Mapa de traducción: De "Código Feo" a "Humano"
    # Estos números (1090, 1095) son las líneas de código de Spark donde ocurre la magia.
    translations = {
        "ALS.scala:1090": "Entrenando (Iteraciones Usuarios)",
        "ALS.scala:1095": "Finalizando (Matriz Items)",
        "ALS.scala:988":  "Pre-procesamiento (Bloques)",
        "ALS.scala:995":  "Pre-procesamiento (Estadísticas)"
    }

    try:
        app_id = requests.get(f"{base}/applications").json()[0]["id"]
    except Exception:
        return

    while not stop_flag:
        try:
            jobs = requests.get(f"{base}/applications/{app_id}/jobs").json()
            # Filtramos solo los RUNNING
            active = [j for j in jobs if j.get("status") == "RUNNING"]
            
            if active:
                j = active[0]
                done = j.get("numCompletedTasks", 0)
                total = j.get("numTasks", 0) or 1
                pct = 100 * done / total
                
                # Obtenemos el nombre técnico original
                raw_name = j.get("name", "")
                
                # 2. Buscamos si el nombre técnico contiene alguna de nuestras claves
                readable_name = "Procesando..." # Default
                for key, val in translations.items():
                    if key in raw_name:
                        readable_name = val
                        break
                
                # Si no encontramos traducción, usamos el nombre original recortado
                if readable_name == "Procesando...":
                     readable_name = raw_name[:30]

                # Imprimimos bonito
                # \r al principio permite sobrescribir la línea (opcional, si te gusta efecto barra de carga)
                print(f"[Job {j['jobId']}] {readable_name} -> {pct:.1f}% ({done}/{total})")
            
            time.sleep(interval)
        except Exception:
            break

t = threading.Thread(target=monitor_jobs, daemon=True)
t.start()

model = als.fit(train)

stop_flag = True
t.join()


[Job 206] rdd at ALS.scala:727 -> 0.0% (0/2)
[Job 207] isEmpty at ALS.scala:975 -> 0.0% (0/3)
[Job 208] Pre-procesamiento (Bloques) -> 0.0% (0/16)
[Job 208] Pre-procesamiento (Bloques) -> 6.2% (1/16)
[Job 208] Pre-procesamiento (Bloques) -> 37.5% (6/16)
[Job 209] Pre-procesamiento (Estadísticas) -> 12.5% (2/16)
[Job 210] Entrenando (Iteraciones Usuarios) -> 4.4% (14/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 6.9% (22/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 10.4% (33/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 12.6% (40/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 13.8% (44/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 16.7% (53/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 19.2% (61/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 22.3% (71/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 25.2% (80/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 28.3% (90/318)
[Job 210] Entrenando (Iteraciones Usuarios) -> 31.4% (100/318)
[Job 2

In [23]:
preds = model.transform(test)

rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction").evaluate(preds)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.8621


---

# Iteración 3: Modelo mejorado con mejores hiperparámetros

## Búsqueda de hiperparámetros

In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ALS-Reco")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

print("Spark version:", spark.version)

Spark version: 3.5.0


In [3]:
import pyspark.sql.functions as F
from pyspark.ml.evaluation import RegressionEvaluator

df = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet")
print("df loaded:", df.count())

ratings = (
    df.select("userId", "filmId", "rating")
    .dropna()
    .withColumn("userId", F.col("userId").cast("int"))
    .withColumn("filmId", F.col("filmId").cast("int"))
    .withColumn("rating", F.col("rating").cast("float"))
    .dropDuplicates(["userId", "filmId"])
)
print("ratings:", ratings.count())

df loaded: 1000209
ratings: 1000209


In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

als = ALS(
    userCol="userId",
    itemCol="filmId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)

paramGrid = (ParamGridBuilder()
             .addGrid(als.rank, [10, 20])
             .addGrid(als.regParam, [0.05, 0.1])
             .addGrid(als.maxIter, [10, 15])
             .build())

tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction"),
    trainRatio=0.8, 
    parallelism=2    
)

model_tuned = tvs.fit(ratings)
best_model = model_tuned.bestModel
print("Mejores hiperparámetros:", best_model.rank, best_model._java_obj.parent().getRegParam(), best_model._java_obj.parent().getMaxIter())


Mejores hiperparámetros: 20 0.1 15


Casualmente (o no tan casualmente) los hiperparámetros resultantes son los que ya usaba así que no tengo que reentrenar el modelo. Aún así, este es un ejemplo pequeño, lo suyo sería probar con un grid mucho mayor, más hiperparámetros y, sobre todo, más iteraciones, pero Codespace da para lo que da.