In [1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import itertools
import math
import os
import sys
import time
from functools import reduce
from itertools import islice
from itertools import product

import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
import seaborn as sns
from google.cloud import storage
from matplotlib import pyplot as plt
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StringIndexerModel
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Window, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number, expr
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, IntegerType, LongType, StringType,StructType, StructField
from pyspark.sql.types import StructType, StructField
from pyspark.storagelevel import StorageLevel


# Obtain the Spark session for this notebook. getOrCreate() hands back an
# already-running session when one exists instead of building a second one.
print("Creating new Spark session...")
session_builder = SparkSession.builder.appName("ALS")
spark = session_builder.getOrCreate()

spark_master = spark.sparkContext.master
print("Spark master:", spark_master)


Creating new Spark session...
Spark master: yarn


25/04/21 14:17:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# spark.stop()
# Snapshot the effective Spark configuration as a plain dict and report the
# resource-related settings, one "key: value" line each.
conf = dict(spark.sparkContext.getConf().getAll())

for conf_key in (
    "spark.executor.instances",
    "spark.executor.cores",
    "spark.executor.memory",
    "spark.driver.memory",
    "spark.sql.shuffle.partitions",
    "spark.kryoserializer.buffer.max",
    "spark.kryoserializer.buffer",
):
    print(conf_key + ":", conf.get(conf_key))


spark.executor.instances: 2
spark.executor.cores: 4
spark.executor.memory: 25g
spark.driver.memory: 25g
spark.sql.shuffle.partitions: 100
spark.kryoserializer.buffer.max: 1024m
spark.kryoserializer.buffer: 64m


In [3]:
# Default column names
# These are the fallback column names used as default arguments by the
# evaluator classes and the relevance helper functions defined in this notebook.
DEFAULT_USER_COL = "userID"
DEFAULT_ITEM_COL = "itemID"
DEFAULT_RATING_COL = "rating"
DEFAULT_LABEL_COL = "label"
DEFAULT_TITLE_COL = "title"
DEFAULT_GENRE_COL = "genre"
DEFAULT_RELEVANCE_COL = "relevance"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
DEFAULT_SIMILARITY_COL = "sim"
DEFAULT_ITEM_FEATURES_COL = "features"
DEFAULT_ITEM_SIM_MEASURE = "item_cooccurrence_count"


# Keyword-argument bundle mapping the evaluators' col_* parameter names to the
# default column names above.
COL_DICT = {
    "col_user": DEFAULT_USER_COL,
    "col_item": DEFAULT_ITEM_COL,
    "col_rating": DEFAULT_RATING_COL,
    "col_prediction": DEFAULT_PREDICTION_COL,
}

# Filtering variables
# DEFAULT_K is the default number of items kept per user; DEFAULT_THRESHOLD is
# the default cut-off used by the "by_threshold" relevancy method.
DEFAULT_K = 10
DEFAULT_THRESHOLD = 10

# Other
# Random seed constant for reproducible runs.
SEED = 42

# Google Cloud Storage client and a handle to the project bucket. The bucket
# name is also interpolated into the gs:// paths used by the cells below.
client = storage.Client()
bucket_name = "team15-storage"
bucket = client.bucket(bucket_name)


In [5]:
def process_split_ratio(ratio):
    """Validate a split ratio and normalize it when several ratios are given.

    Args:
        ratio (float or list): a float number that indicates split ratio or a list of float
        numbers that indicate split ratios (if it is a multi-split).

    Returns:
        tuple:
        - bool: A boolean variable multi that indicates if the splitting is multi or single.
        - float or list: The validated ratio for a single split, or the list of
          split ratios normalized to sum to 1 for a multi-split.

    Raises:
        ValueError: If a single ratio is not strictly between 0 and 1, if the
            ratio list is empty, or if any ratio in the list is not positive.
        TypeError: If ratio is neither a float nor a list.
    """
    if isinstance(ratio, float):
        if ratio <= 0 or ratio >= 1:
            raise ValueError("Split ratio has to be between 0 and 1")
        return False, ratio

    if isinstance(ratio, list):
        # An empty list would otherwise pass validation and produce no splits.
        if not ratio:
            raise ValueError("Split ratio list should not be empty.")
        if any(x <= 0 for x in ratio):
            raise ValueError(
                "All split ratios in the ratio list should be larger than 0."
            )

        # Normalize split ratios if they are not summed to 1. The total is
        # computed once instead of once per element.
        total = math.fsum(ratio)
        if total != 1.0:
            ratio = [x / total for x in ratio]
        return True, ratio

    raise TypeError("Split ratio should be either float or a list of floats.")

In [6]:
def spark_random_split(data, ratio=0.75, seed=42):
    """Randomly partition a Spark DataFrame into splits.

    Args:
        data (pyspark.sql.DataFrame): Spark DataFrame to be split.
        ratio (float or list): A single float gives a two-way split in which
            `ratio` is the share of the first (training) split. A list of
            floats gives one split per entry; the list is normalized when it
            does not sum to 1.
        seed (int): Seed passed to the random split.

    Returns:
        list: Splits of the input data as pyspark.sql.DataFrame.
    """
    is_multi, weights = process_split_ratio(ratio)

    # A single ratio is expanded to the pair [ratio, remainder].
    if not is_multi:
        weights = [weights, 1 - weights]

    return data.randomSplit(weights, seed=seed)



In [7]:
class SparkRatingEvaluation:
    """Spark Rating Evaluator.

    Pairs true ratings with predicted ratings on (user, item) and exposes
    regression metrics computed over the matched pairs.
    """

    def __init__(
        self,
        rating_true,
        rating_pred,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
    ):
        """Initializer.

        Args:
            rating_true (pyspark.sql.DataFrame): True labels.
            rating_pred (pyspark.sql.DataFrame): Predicted labels.
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.

        Raises:
            TypeError: If either input is not a Spark DataFrame.
            ValueError: If a required column is missing or an input is empty.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction

        # Check if inputs are Spark DataFrames with the expected columns.
        self._validate_dataframes()

        # Repartition both sides by user (200 partitions) ahead of the join.
        self.rating_true = self.rating_true.repartition(200, col(self.col_user))
        self.rating_pred = self.rating_pred.repartition(200, col(self.col_user))

        # Select necessary columns and cast types for consistency.
        self.rating_true = self.rating_true.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_rating).cast("double").alias("label"),
        )
        self.rating_pred = self.rating_pred.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_prediction).cast("double").alias("prediction"),
        )

        # Inner join keeps only (user, item) pairs present on both sides; the
        # key columns are dropped so only `label` and `prediction` remain.
        self.y_pred_true = self.rating_true.join(
            self.rating_pred, [self.col_user, self.col_item], "inner"
        ).drop(self.col_user).drop(self.col_item)

        # Persist the joined data for repeated access by the metric methods.
        self.y_pred_true.persist(StorageLevel.MEMORY_AND_DISK)

        # RegressionMetrics consumes (prediction, observation) pairs.
        self.metrics = RegressionMetrics(self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label)))

    def _validate_dataframes(self):
        """Validate the type, schema and non-emptiness of both inputs."""
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError("rating_true should be a Spark DataFrame.")
        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError("rating_pred should be a Spark DataFrame.")

        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if self.col_user not in true_columns or self.col_item not in true_columns or self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true is missing one or more required columns.")
        if self.col_user not in pred_columns or self.col_item not in pred_columns or self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred is missing one or more required columns.")

        # take(1) fetches at most one row, so emptiness is detected without a
        # full count() over each input.
        if not self.rating_true.take(1) or not self.rating_pred.take(1):
            raise ValueError("Empty input DataFrame.")

    def rmse(self):
        """Calculate Root Mean Squared Error."""
        return self.metrics.rootMeanSquaredError

    def mae(self):
        """Calculate Mean Absolute Error."""
        return self.metrics.meanAbsoluteError

    def rsquared(self):
        """Calculate R squared."""
        return self.metrics.r2

    def exp_var(self):
        """Calculate explained variance.

        Returns:
            float: 1 - variance(label - prediction) / variance(label), or
            -inf when either variance is undefined or the label variance is 0.
        """
        # Both variances come from one aggregation, so the joined data is
        # scanned once instead of once per statistic.
        variance_row = self.y_pred_true.selectExpr(
            "variance(label - prediction)", "variance(label)"
        ).collect()[0]
        variance_diff, variance_label = variance_row[0], variance_row[1]

        if variance_diff is None or variance_label is None:
            return -np.inf
        else:
            return 1 - (variance_diff / variance_label) if variance_label != 0 else -np.inf


In [8]:
class SparkRankingEvaluation:
    """Spark Ranking Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        k=DEFAULT_K,
        relevancy_method="top_k",
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD,
    ):
        """Build the evaluator and compute the underlying ranking metrics.

        The predictions are first reduced to one list of items per user using
        the chosen relevancy method, then compared with each user's list of
        ground-truth items through Spark MLlib's RankingMetrics.

        Args:
            rating_true (pyspark.sql.DataFrame): DataFrame of true rating data
                (user, item, rating).
            rating_pred (pyspark.sql.DataFrame): DataFrame of predicted rating
                data (user, item, prediction).
            k (int): number of items to recommend to each user.
            relevancy_method (str): how the per-user item list is selected
                from the predictions. One of "top_k", "by_time_stamp" or
                "by_threshold".
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.
            threshold (float): cut-off on the predicted value; only used when
                `relevancy_method` is "by_threshold".

        Raises:
            TypeError: If either input is not a Spark DataFrame.
            ValueError: If a required column is missing or `relevancy_method`
                is not recognised.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction
        self.threshold = threshold

        # Both inputs must be Spark DataFrames.
        type_checks = (
            (self.rating_true, "rating_true should be but is not a Spark DataFrame"),
            (self.rating_pred, "rating_pred should be but is not a Spark DataFrame"),
        )
        for frame, type_message in type_checks:
            if not isinstance(frame, DataFrame):
                raise TypeError(type_message)  # pragma : No Cover

        # Every required column must be present; checks run in a fixed order
        # so the first missing column decides the error message.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns
        schema_checks = (
            (
                self.col_user,
                true_columns,
                "Schema of rating_true not valid. Missing User Col: "
                + str(true_columns),
            ),
            (
                self.col_item,
                true_columns,
                "Schema of rating_true not valid. Missing Item Col",
            ),
            (
                self.col_rating,
                true_columns,
                "Schema of rating_true not valid. Missing Rating Col",
            ),
            (
                self.col_user,
                pred_columns,
                "Schema of rating_pred not valid. Missing User Col",
            ),
            (
                self.col_item,
                pred_columns,
                "Schema of rating_pred not valid. Missing Item Col",
            ),
            (
                self.col_prediction,
                pred_columns,
                "Schema of rating_pred not valid. Missing Prediction Col",
            ),
        )
        for required_column, available_columns, schema_message in schema_checks:
            if required_column not in available_columns:
                raise ValueError(schema_message)

        self.k = k

        relevant_func = {
            "top_k": _get_top_k_items,
            "by_time_stamp": _get_relevant_items_by_timestamp,
            "by_threshold": _get_relevant_items_by_threshold,
        }

        if relevancy_method not in relevant_func:
            raise ValueError(
                "relevancy_method should be one of {}".format(
                    list(relevant_func.keys())
                )
            )

        # All selectors share these arguments; the predicted value plays the
        # role of the "rating" the selector ranks or filters on.
        selector = relevant_func[relevancy_method]
        selector_kwargs = {
            "dataframe": self.rating_pred,
            "col_user": self.col_user,
            "col_item": self.col_item,
            "col_rating": self.col_prediction,
        }
        # The threshold selector takes a cut-off, the others a list length.
        if relevancy_method == "by_threshold":
            selector_kwargs["threshold"] = self.threshold
        else:
            selector_kwargs["k"] = self.k
        self.rating_pred = selector(**selector_kwargs)

        self._metrics = self._calculate_metrics()

    def _calculate_metrics(self):
        """Assemble (predicted items, ground-truth items) pairs and wrap them
        in a RankingMetrics object."""
        self._items_for_user_pred = self.rating_pred

        # One row per user holding the list of items found in rating_true.
        ground_truth_expr = expr("collect_list(" + self.col_item + ") as ground_truth")
        truth_by_user = self.rating_true.groupBy(self.col_user).agg(ground_truth_expr)
        self._items_for_user_true = truth_by_user.select(self.col_user, "ground_truth")

        # Only users present on both sides are evaluated; dropping the user
        # column leaves the predicted list followed by the ground-truth list.
        paired = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user
        )
        self._items_for_user_all = paired.drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd)

    def precision_at_k(self):
        """Get precision@k, as computed by RankingMetrics.precisionAt.

        Return:
            float: precision at k (min=0, max=1)
        """
        cutoff = self.k
        return self._metrics.precisionAt(cutoff)

    def recall_at_k(self):
        """Get recall@k, as computed by RankingMetrics.recallAt.

        Return:
            float: recall at k (min=0, max=1).
        """
        cutoff = self.k
        return self._metrics.recallAt(cutoff)

    def ndcg_at_k(self):
        """Get Normalized Discounted Cumulative Gain (NDCG) at k, as computed
        by RankingMetrics.ndcgAt.

        Return:
            float: nDCG at k (min=0, max=1).
        """
        cutoff = self.k
        return self._metrics.ndcgAt(cutoff)

    def map(self):
        """Get mean average precision over the full lists.

        Return:
            float: MAP (min=0, max=1).
        """
        metrics = self._metrics
        return metrics.meanAveragePrecision

    def map_at_k(self):
        """Get mean average precision at k, as computed by
        RankingMetrics.meanAveragePrecisionAt.

        Return:
            float: MAP at k (min=0, max=1).
        """
        cutoff = self.k
        return self._metrics.meanAveragePrecisionAt(cutoff)



In [9]:

def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    Note:
        if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): name of the output column holding each user's item list.
        k (int): number of items for each user.

    Return:
        pyspark.sql.DataFrame: one row per user with `col_user` and
        `col_prediction`, the latter being the user's top k items ordered from
        highest to lowest rating.
    """
    # Rank each user's items by rating. The item column is a secondary sort key
    # so that items with equal ratings rank the same way on every run.
    window_spec = Window.partitionBy(col_user).orderBy(
        col(col_rating).desc(), col(col_item)
    )

    # collect_list gives no ordering guarantee, so the rank is collected with
    # each item and the list is sorted by rank before the items are extracted.
    # Rows with a null item are skipped, as a plain collect_list would do.
    ranked_entry = F.when(
        col(col_item).isNotNull(),
        F.struct(col("rank").alias("position"), col(col_item).alias("item")),
    )

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.sort_array(F.collect_list(ranked_entry)).alias("ranked_items"))
        .select(col_user, col("ranked_items").getField("item").alias(col_prediction))
    )

    return items_for_user


def _get_relevant_items_by_threshold(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    threshold=DEFAULT_THRESHOLD,
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are those whose rating is greater than or equal to
    `threshold`.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): name of the output column holding each user's item list.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        pyspark.sql.DataFrame: one row per user with `col_user` and
        `col_prediction`, the latter being the user's relevant items ordered
        from highest to lowest rating.
    """
    # The rating travels with each item so the collected list can be sorted
    # afterwards; collect_list alone gives no ordering guarantee, and a global
    # sort before grouping does not survive the shuffle. Rows with a null item
    # are skipped, as a plain collect_list would do.
    scored_entry = F.when(
        col(col_item).isNotNull(),
        F.struct(col(col_rating).alias("score"), col(col_item).alias("item")),
    )

    items_for_user = (
        dataframe.where(col(col_rating) >= threshold)
        .groupBy(col_user)
        # Descending sort on (score, item): highest rating first.
        .agg(F.sort_array(F.collect_list(scored_entry), asc=False).alias("ranked_items"))
        .select(col_user, col("ranked_items").getField("item").alias(col_prediction))
    )

    return items_for_user


def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (pyspark.sql.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating. The column must exist in
            `dataframe`; it is selected but does not affect the ranking.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): name of the output column holding each user's item list.
        k: number of relevant items to be filtered by the function.

    Return:
        pyspark.sql.DataFrame: one row per user with `col_user` and
        `col_prediction`, the latter being the user's k most recent items
        ordered from newest to oldest.
    """
    # Rank each user's items from newest to oldest. The item column is a
    # secondary sort key so equal timestamps rank the same way on every run.
    window_spec = Window.partitionBy(col_user).orderBy(
        col(col_timestamp).desc(), col(col_item)
    )

    # collect_list gives no ordering guarantee, so the rank is collected with
    # each item and the list is sorted by rank before the items are extracted.
    # Rows with a null item are skipped, as a plain collect_list would do.
    ranked_entry = F.when(
        col(col_item).isNotNull(),
        F.struct(col("rank").alias("position"), col(col_item).alias("item")),
    )

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupBy(col_user)
        .agg(F.sort_array(F.collect_list(ranked_entry)).alias("ranked_items"))
        .select(col_user, col("ranked_items").getField("item").alias(col_prediction))
    )

    return items_for_user

In [10]:
# Names of the raw review categories, each prefixed with "raw_review_".
# NOTE(review): this list is not referenced by any live code in the cells shown
# in this notebook — confirm whether it is still needed.
categories = [
    "raw_review_All_Beauty",
    "raw_review_Toys_and_Games",
    "raw_review_Cell_Phones_and_Accessories",
    "raw_review_Industrial_and_Scientific",
    "raw_review_Gift_Cards",
    "raw_review_Musical_Instruments",
    "raw_review_Electronics",
    "raw_review_Handmade_Products",
    "raw_review_Arts_Crafts_and_Sewing",
    "raw_review_Baby_Products",
    "raw_review_Health_and_Household",
    "raw_review_Office_Products",
    "raw_review_Digital_Music",
    "raw_review_Grocery_and_Gourmet_Food",
    "raw_review_Sports_and_Outdoors",
    "raw_review_Home_and_Kitchen",
    "raw_review_Subscription_Boxes",
    "raw_review_Tools_and_Home_Improvement",
    "raw_review_Pet_Supplies",
    "raw_review_Video_Games",
    "raw_review_Kindle_Store",
    "raw_review_Clothing_Shoes_and_Jewelry",
    "raw_review_Patio_Lawn_and_Garden",
    "raw_review_Unknown",
    "raw_review_Books",
    "raw_review_Automotive",
    "raw_review_CDs_and_Vinyl",
    "raw_review_Beauty_and_Personal_Care",
    "raw_review_Amazon_Fashion",
    "raw_review_Magazine_Subscriptions",
    "raw_review_Software",
    "raw_review_Health_and_Personal_Care",
    "raw_review_Appliances",
    "raw_review_Movies_and_TV"
]


# Explicit schema with four nullable string columns (user, item, timestamp,
# rating); the `asin` field is intentionally left commented out.
schema = StructType([
    StructField("user_id", StringType(), True),
    # StructField("asin", StringType(), True),
    StructField("parent_asin", StringType(), True),
    StructField("timestamp", StringType(), True), 
    StructField("rating", StringType(), True), 
])
# Name of a single category dataset; only the commented-out path below uses it.
category = '5core_rating_only_All_Beauty'
# parquet_path = f"gs://{bucket_name}/dataset/{category}/*.parquet"



In [28]:
# ALS hyperparameters, passed to the ALS estimator as rank, maxIter and regParam.
RANK = 10
MAX_ITER = 5
REG_PARAM = 0.1
# Placeholder for the ratings DataFrame; it is expected to be rebound to a real
# Spark DataFrame before the indexing cell runs.
df_spark = None 

# Column names of the ratings data used in this notebook.
COL_USER = "user_id"
COL_ITEM = "parent_asin"
COL_RATING = "rating"
COL_TIMESTAMP = "timestamp"
COL_PREDICTION = "prediction"

# Names of the numeric index columns produced by the StringIndexer models.
COL_USER_INDEX = "user_index"
COL_ITEM_INDEX = "item_index"

# NOTE(review): batch_size, total_loaded and num_partitions are not referenced
# by the cells shown in this notebook — confirm whether they are still needed.
batch_size = 10000
total_loaded = 0
num_partitions = 8  

# Cut-off k passed to the ranking evaluation.
K = 5

In [None]:
# Read the consolidated ratings from Parquet.
#
# One-time preparation that produced the consolidated file (kept for reference):
# parquet_path = f"gs://{bucket_name}/dataset/5core_rating_only_*/*.parquet"
# df_spark = spark.read.parquet(parquet_path).select("user_id", "parent_asin", "rating")
# df_spark.write.mode("overwrite").parquet(f"gs://{bucket_name}/processed/5core_rating.parquet")

# The load must be live: the indexing cell below calls methods on df_spark,
# which is otherwise still the None placeholder on a fresh kernel.
df_spark = spark.read.parquet(f"gs://{bucket_name}/processed/5core_rating.parquet")
df_spark = df_spark.persist(StorageLevel.DISK_ONLY)
df_spark.show(5)




In [None]:

# Create the indexers: one StringIndexer per ID column, fitted on the distinct
# values of that column.
COL_USER_model = StringIndexer(inputCol=COL_USER, outputCol=COL_USER_INDEX).fit(
    df_spark.select(COL_USER).distinct()
)

COL_ITEM_model = StringIndexer(inputCol=COL_ITEM, outputCol=COL_ITEM_INDEX).fit(
    df_spark.select(COL_ITEM).distinct()
)

# Save the fitted indexer models, replacing any previously saved copy.
COL_USER_model.write().overwrite().save(f"gs://{bucket_name}/indexer_user")
COL_ITEM_model.write().overwrite().save(f"gs://{bucket_name}/indexer_item")

# To reuse the saved indexers instead of refitting:
# COL_USER_model = StringIndexerModel.load(f"gs://team15-storage/indexer_user")
# COL_ITEM_model = StringIndexerModel.load(f"gs://team15-storage/indexer_item")

# Apply the indexers to the DataFrame, cast the rating to float and drop the
# original string ID columns. The COL_* constants are used throughout so the
# cast and the drop always target the same columns the indexers were fitted on.
df_spark = (
    COL_ITEM_model.transform(COL_USER_model.transform(df_spark))
    .withColumn(COL_RATING, col(COL_RATING).cast(FloatType()))
    .drop(COL_USER, COL_ITEM)
)

df_spark.show(5)


In [None]:
# Write the indexed ratings as Parquet, replacing what is already stored there.
# NOTE(review): mode("overwrite") targets the whole `dataset` prefix, and the
# commented-out loader paths in this notebook point at category folders under
# that same prefix — confirm this write cannot delete the raw inputs.
df_spark.write.mode("overwrite").parquet(f"gs://{bucket_name}/dataset")

In [None]:
# df_spark = spark.read.parquet(f"gs://{bucket_name}/dataset")

# df_spark.show()

25/04/21 14:18:23 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
[Stage 1:>                                                          (0 + 1) / 1]

+------+----------+----------+
|rating|user_index|item_index|
+------+----------+----------+
|   5.0| 4067184.0| 5114046.0|
|   5.0| 4067184.0| 5066546.0|
|   5.0| 4067184.0| 2067665.0|
|   5.0| 4067184.0| 5026158.0|
|   5.0| 4067184.0| 5120623.0|
|   4.0| 5834754.0| 2379309.0|
|   4.0| 5834754.0| 3588803.0|
|   5.0| 5834754.0| 1034071.0|
|   5.0| 5834754.0| 5137683.0|
|   5.0| 5834754.0| 4445808.0|
|   5.0| 3135153.0| 1307667.0|
|   5.0| 3135153.0| 3326480.0|
|   5.0| 3135153.0| 5079509.0|
|   5.0| 3135153.0| 2862256.0|
|   4.0| 3135153.0| 4784473.0|
|   5.0| 3135153.0| 4760205.0|
|   5.0| 3135153.0| 3730944.0|
|   5.0| 3135153.0| 3734809.0|
|   5.0| 1043684.0| 2185788.0|
|   5.0| 1043684.0|  677014.0|
+------+----------+----------+
only showing top 20 rows



                                                                                

In [None]:
# Random two-way split: 75% of the rows for training, the remainder for testing.
split_frames = spark_random_split(df_spark, ratio=0.75, seed=42)
dfs_train, dfs_test = split_frames

In [None]:
# Configure ALS on the indexed user/item columns. coldStartStrategy="drop"
# removes rows whose prediction would be NaN. The seed is fixed so the factor
# initialisation, and therefore the trained model, is reproducible.
als = ALS(
    maxIter=MAX_ITER,
    rank=RANK,
    regParam=REG_PARAM,
    userCol=COL_USER_INDEX,
    itemCol=COL_ITEM_INDEX,
    ratingCol=COL_RATING,
    coldStartStrategy="drop",
    seed=SEED,
)

# Train on the training split.
model = als.fit(dfs_train)

# Score the test split; the true rating is dropped so only the prediction remains.
dfs_pred = model.transform(dfs_test).drop(COL_RATING)


In [None]:
# Save the trained ALS model to the bucket, replacing any previously saved copy.
# bucket_name is re-assigned here with the same value used earlier in the notebook.
bucket_name = "team15-storage"
model_path = f"gs://{bucket_name}/als_model" 
model.write().overwrite().save(model_path)
# The message below is Vietnamese for "ALS model has been saved to: <path>".
print(f"Mô hình ALS đã được lưu vào: {model_path}")

In [None]:
# Keep only the predictions for (user, item) pairs that were not seen in
# training. A left anti join returns exactly the prediction rows with no
# matching training pair, without materialising a full outer join of both sets.
# (Equivalent to the outer-join-and-filter-on-null-rating approach as long as
# training ratings are never null.)
seen_pairs = dfs_train.select(COL_USER_INDEX, COL_ITEM_INDEX)

dfs_pred_final = dfs_pred.join(
    seen_pairs,
    on=[COL_USER_INDEX, COL_ITEM_INDEX],
    how="left_anti",
).select(COL_USER_INDEX, COL_ITEM_INDEX, COL_PREDICTION)

dfs_pred_final.show()


In [25]:
# Reload the predictions from Parquet. This rebinds dfs_pred_final, so the
# stored copy replaces whatever the previous cell computed.
# NOTE(review): the write that produces this copy is commented out below; the
# read fails unless it has been run at least once against this bucket.
# dfs_pred_final.write.mode("overwrite").parquet(f"gs://{bucket_name}/predict")
dfs_pred_final = spark.read.parquet(f"gs://{bucket_name}/predict")

dfs_pred_final.show()

25/04/21 14:19:21 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
[Stage 3:>                                                          (0 + 1) / 1]

+----------+----------+----------+
|user_index|item_index|prediction|
+----------+----------+----------+
|      16.0| 1401953.0| 4.5411286|
|      23.0| 2945575.0|  5.065992|
|      64.0| 4817381.0|  4.502164|
|      65.0|  915115.0|  4.400567|
|      69.0| 3146871.0|  4.442011|
|      74.0| 1193036.0| 2.1374714|
|      83.0| 2317153.0| 4.5951037|
|      83.0| 3623070.0|  4.737312|
|      83.0| 4244158.0|  4.670016|
|      97.0| 1308857.0|  4.599385|
|     105.0| 4919669.0| 3.1340566|
|     145.0| 4212876.0|  4.211695|
|     153.0| 1553354.0|  3.707421|
|     168.0| 1331337.0|  4.152615|
|     168.0| 2799882.0|  4.357819|
|     168.0| 3038503.0| 4.4673553|
|     182.0| 1092397.0| 2.7466314|
|     194.0| 2087264.0| 4.2487354|
|     223.0| 3776419.0|  4.800173|
|     256.0| 4165538.0| 4.7921405|
+----------+----------+----------+
only showing top 20 rows



                                                                                

In [29]:
# Evaluate the ranked predictions against the test split at cut-off K.
evaluations = SparkRankingEvaluation(
    rating_true=dfs_test,
    rating_pred=dfs_pred_final,
    k=K,
    col_user=COL_USER_INDEX,
    col_item=COL_ITEM_INDEX,
    col_rating=COL_RATING,
    col_prediction=COL_PREDICTION,
)

# One metric per line; the last line reports MAP at k.
metric_lines = [
    f"Precision@k = {evaluations.precision_at_k()}",
    f"Recall@k = {evaluations.recall_at_k()}",
    f"NDCG@k = {evaluations.ndcg_at_k()}",
    f"Mean average precision = {evaluations.map_at_k()}",
]
print("\n".join(metric_lines))

25/04/21 15:19:53 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
25/04/21 15:20:05 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
25/04/21 15:39:28 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
25/04/21 15:40:54 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
25/04/21 16:05:56 WARN DAGScheduler: Broadcasting large task binary with size 295.7 MiB
25/04/21 16:06:56 WARN DAGScheduler: Broadcasting large task binary with size 591.5 MiB
25/04/21 16:07:21 WARN DAGScheduler: Broadcasting large task binary with size 591.5 MiB
25/04/21 16:08:33 WARN DAGScheduler: Broadcasting large task binary with size 591.5 MiB
25/04/21 16:09:47 WARN DAGScheduler: Broadcasting large task binary with size 591.5 MiB
25/04/21 16:10:59 WARN DAGScheduler: Broadcasting large task binary with size 591.5 MiB

Precision@k = 0.6091560059645985
Recall@k = 0.827802918653073
NDCG@k = 0.9811180097721983
Mean average precision = 0.974869135836306


                                                                                

In [None]:
# Indexed user ids to generate recommendations for.
users = [
    2715975.0,
    541078.0,
    1717487.0,
    4142059.0,
    377882.0,
    5525177.0,
    4429053.0,
    1753193.0,
    4157055.0,
    1726869.0
]

# recommendForUserSubset expects a DataFrame whose user column has the same
# name as the model's userCol; a plain Python list is not accepted.
users_df = spark.createDataFrame(
    [(user_id,) for user_id in users], [COL_USER_INDEX]
)

# Top 10 recommended items for each of the selected users.
dfs_rec_subset = model.recommendForUserSubset(users_df, 10)
dfs_rec_subset.show(10)

In [None]:
# Clean up: stop the Spark session used by this notebook. Cells that use
# `spark` cannot be re-run after this point without creating a session again.
spark.stop()