Use cross-validation (CV) to get the best hyperparameters for the LSH model:

- `bucketLength` $\in [0.5, 2.0]$ based on some small research 

- `numHashTables` $\in [1, 10]$ so the number of hash tables is not too large

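- `approxSimilarityJoin` threshold $\in [0, 1.41]$: the threshold is a Euclidean distance $d$ between L2-normalized vectors, and $\cos = 1 - d^2/2$, so $d \le \sqrt{2} \approx 1.41$ keeps the cosine similarity between 0 and 1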

The idea is to create a JSON file for keeping track of the parameters, and to split the data into 10 folds but run a partial cross-validation using only 5 of them until we get an idea of the best parameters, then run the full 10-fold CV to get the final model.

---

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, struct
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as sql_sum, col
from pyspark.ml.evaluation import RegressionEvaluator

# Treat the data

In [None]:
# Spark session shared by the rest of the notebook (4 GB of driver memory).
spark = (
    SparkSession.builder
    .appName("ItemItemCF")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Load the ratings CSV and keep only the three columns used downstream.
data = (
    spark.read
    .csv("data/100k.csv", header=True, inferSchema=True)
    .select("userId", "movieId", "rating")
)

In [None]:
# Randomly split the ratings into 10 equally weighted folds; the fixed seed
# keeps the split reproducible between runs.
folds = data.randomSplit(weights=[0.1] * 10, seed=42)

# Create functions to automate the CV

In [None]:
def item_item_cf_similarities(ratings, bucket_length=1.5, num_hash_tables=3, threshold=1.0):
    """Compute approximate item-item cosine similarities with random-projection LSH.

    Each movie becomes a sparse vector of user ratings (the rating of user
    ``u`` sits at position ``u - 1``). The vectors are L2-normalized, hashed
    with ``BucketedRandomProjectionLSH``, and self-joined with
    ``approxSimilarityJoin``. The Euclidean distance of every retained pair is
    converted to cosine similarity.

    Relies on the module-level ``spark`` session to build the intermediate
    DataFrame.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            User ids are assumed to be integers starting at 1, because
            ``userId - 1`` is used as the vector index.
        bucket_length: ``bucketLength`` passed to the LSH estimator.
        num_hash_tables: ``numHashTables`` passed to the LSH estimator.
        threshold: Maximum Euclidean distance between normalized vectors for
            a pair to be kept by ``approxSimilarityJoin``.

    Returns:
        DataFrame with columns ``movie_i``, ``movie_j`` and ``cosine_sim``.
        Every retained pair appears in both orders, (i, j) and (j, i).

    Raises:
        ValueError: If ``ratings`` contains no rows.
    """
    # Size the vectors by the largest user id rather than the number of
    # distinct users: a training split can miss some users entirely, and then
    # ``userId - 1`` would fall outside a vector sized by the distinct count.
    max_user_id = ratings.agg({"userId": "max"}).first()[0]
    if max_user_id is None:
        raise ValueError("ratings is empty; cannot compute item similarities")
    vector_size = int(max_user_id)

    # create the sparse vector for movie function
    def to_sparse_vector(user_ratings, size):
        # Sort by userId to get strictly increasing indices
        sorted_pairs = sorted(user_ratings, key=lambda x: x.userId)
        indices = [x.userId - 1 for x in sorted_pairs]
        values = [x.rating for x in sorted_pairs]
        return Vectors.sparse(size, indices, values)

    # group by movieId and collect user ratings
    item_user = ratings.groupBy("movieId") \
        .agg(collect_list(struct("userId", "rating")).alias("user_ratings"))

    # convert that to a sparse vector
    item_vector_rdd = item_user.rdd.map(
        lambda row: Row(
            movieId=row["movieId"],
            features=to_sparse_vector(row["user_ratings"], vector_size)
        )
    )

    # convert to DataFrame because of Normalizer (MLlib)
    item_vectors = spark.createDataFrame(item_vector_rdd)

    # normalizing with L2 (Euclidean) norm (p=2)
    normalizer = Normalizer(inputCol="features", outputCol="norm_features", p=2.0)
    normalized = normalizer.transform(item_vectors)

    # create the LSH model with the caller-supplied hyperparameters
    lsh = BucketedRandomProjectionLSH(
        inputCol="norm_features",
        outputCol="hashes",
        bucketLength=bucket_length,
        numHashTables=num_hash_tables
    )

    # fit the model
    lsh_model = lsh.fit(normalized)

    # get the approximate neighbors
    neighbors = lsh_model.approxSimilarityJoin(
        normalized,
        normalized,
        threshold=threshold,  # distance threshold
        distCol="distance"
    ).filter(col("datasetA.movieId") < col("datasetB.movieId"))  # avoid bottom triangle (reverse + self)

    # convert the distance to cosine similarity: for unit vectors,
    # distance^2 = 2 - 2 * cos, hence cos = 1 - distance^2 / 2
    neighbors_cosine = neighbors.withColumn(
        "cosine_sim",
        1 - (col("distance") ** 2) / 2
    ).select(
        col("datasetA.movieId").alias("movie_i"),
        col("datasetB.movieId").alias("movie_j"),
        "cosine_sim"
    )

    # add reverse pairs: (i,j) -> (i,j) and (j,i)
    reverse = neighbors_cosine.selectExpr("movie_j as movie_i", "movie_i as movie_j", "cosine_sim")
    similarities = neighbors_cosine.union(reverse)

    return similarities

In [None]:
def item_item_cf_predictions(ratings, similarities, test):
    """Predict test ratings as a similarity-weighted average of neighbour ratings.

    For each (user, movie) pair in ``test``, the neighbours of the movie are
    looked up in ``similarities`` and matched with the ratings the same user
    gave to those neighbours in ``ratings``. The prediction is
    ``sum(sim * rating) / sum(sim)`` over the matched neighbours.

    All joins are inner joins, so test pairs with no rated neighbour do not
    appear in the result. Pairs whose similarities sum to zero are also
    dropped, because the weighted average is undefined for them.

    Args:
        ratings: Training DataFrame with ``userId``, ``movieId`` and
            ``rating`` columns.
        similarities: DataFrame with ``movie_i``, ``movie_j`` and
            ``cosine_sim`` columns.
        test: DataFrame with ``userId``, ``movieId`` and ``rating`` columns
            holding the pairs to predict.

    Returns:
        DataFrame with columns ``userId``, ``movieId``, ``pred_rating`` and
        ``actual_rating``.
    """
    # get the neighbors of the target movies that the same user has rated
    test_with_ratings = test.alias("t") \
        .join(similarities.alias("s"), col("t.movieId") == col("s.movie_i")) \
        .join(ratings.alias("r"), (col("t.userId") == col("r.userId")) & (col("s.movie_j") == col("r.movieId"))) \
        .select(
            col("t.userId"),
            col("t.movieId").alias("target_movie"),
            col("s.movie_j").alias("neighbor_movie"),
            col("s.cosine_sim"),
            col("r.rating").alias("neighbor_rating")
        )

    # aggregate numerator and denominator separately so the division can be
    # guarded: a zero denominator would otherwise give a null prediction (or
    # an error under ANSI SQL mode) and break the downstream evaluation
    weighted = test_with_ratings.groupBy("userId", "target_movie").agg(
        sql_sum(col("cosine_sim") * col("neighbor_rating")).alias("weighted_sum"),
        sql_sum(col("cosine_sim")).alias("sim_sum")
    )

    # get the predicted rating, only where the weighted average is defined
    predictions = weighted.filter(col("sim_sum") != 0).select(
        col("userId"),
        col("target_movie"),
        (col("weighted_sum") / col("sim_sum")).alias("pred_rating")
    )

    # attach the actual rating from the test set to every prediction
    final = predictions.alias("p").join(
        test.alias("t"),
        (col("p.userId") == col("t.userId")) & (col("p.target_movie") == col("t.movieId"))
    ).select(
        col("t.userId"),
        col("t.movieId"),
        col("p.pred_rating"),
        col("t.rating").alias("actual_rating")
    )

    return final

In [None]:
def item_item_cf_results(final):
    """Score predictions against the actual ratings.

    Args:
        final: DataFrame with ``pred_rating`` and ``actual_rating`` columns.

    Returns:
        Tuple ``(rmse, mae)`` computed with ``RegressionEvaluator``.
    """
    def score(metric):
        # One evaluator per metric, always comparing the same two columns.
        return RegressionEvaluator(
            metricName=metric,
            labelCol="actual_rating",
            predictionCol="pred_rating",
        ).evaluate(final)

    return score("rmse"), score("mae")

# CV Function