In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import pandas as pd

In [None]:
# Build the notebook-wide Spark session, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("Anime Recommendation System")
    .getOrCreate()
)

In [None]:
# Load the anime dataset
def load_anime_data(filepath):
    """Read the ratings CSV and derive the frame used for modelling.

    The file is read with a header row and with Spark's schema inference
    switched on.

    Args:
        filepath: Location of the CSV file, passed to ``spark.read.csv``.

    Returns:
        A ``(full_df, ratings_df)`` tuple: the DataFrame exactly as read, and
        a projection holding only ``user_id``, ``anime_id`` and ``rating``,
        with ``rating`` cast to float.
    """
    full_df = spark.read.csv(filepath, header=True, inferSchema=True)

    # Project the three rating columns and force ``rating`` to float in the
    # same step, whatever type schema inference picked for it.
    ratings_df = full_df.select(
        "user_id",
        "anime_id",
        col("rating").cast("float").alias("rating"),
    )

    return full_df, ratings_df

Split the data into training and test sets

In [None]:
def split_data(ratings_df, train_ratio=0.8):
    """Randomly partition the ratings into training and test sets.

    Args:
        ratings_df: Object exposing ``randomSplit(weights, seed=...)``; in
            this notebook, the ratings DataFrame.
        train_ratio: Weight given to the training part; the test part gets
            ``1 - train_ratio``.

    Returns:
        Whatever ``randomSplit`` returns for the two weights. The seed is
        fixed at 42 so repeated runs use the same seed.
    """
    split_weights = [train_ratio, 1 - train_ratio]
    return ratings_df.randomSplit(split_weights, seed=42)

Create & train ALS Model

We chose the ALS (Alternating Least Squares) model for the following reasons:

ALS is a matrix factorization algorithm designed for large-scale recommendation systems. It’s particularly useful when:

- The dataset is sparse (i.e., most users have rated only a small fraction of all available items).
- There are implicit or explicit ratings (ALS can handle both).
- Scalability is a concern (Spark’s ALS is optimized for distributed computing).
Given that the Anime Dataset (2023) consists of user ratings for anime titles, ALS is a strong choice because:

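1. It learns latent factors that generalize to unseen user–item pairs (entirely new users or items remain a cold-start case, which the model below handles with `coldStartStrategy="drop"`).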
2. It works well with sparse data, which is common in recommendation problems.
3. It’s optimized for large datasets, making it a good fit for Spark.

Compared to other methods:
- User-based and item-based collaborative filtering (kNN-based methods) don’t scale well to large datasets.
- Content-based filtering doesn’t generalize well if metadata is missing or inconsistent.
- ALS balances scalability and predictive performance better than most traditional models.

In [None]:
def train_als_model(train_df, max_iter=10, reg_param=0.1, rank=10):
    """Fit a Spark ALS recommender on the training ratings.

    Args:
        train_df: Training DataFrame with ``user_id``, ``anime_id`` and
            ``rating`` columns.
        max_iter: Value passed to ALS as ``maxIter``.
        reg_param: Value passed to ALS as ``regParam``.
        rank: Value passed to ALS as ``rank``.

    Returns:
        The fitted model produced by ``ALS.fit``.
    """
    als_params = {
        "maxIter": max_iter,
        "regParam": reg_param,
        "rank": rank,
        "userCol": "user_id",
        "itemCol": "anime_id",
        "ratingCol": "rating",
        # Handle missing values by dropping them during evaluation.
        "coldStartStrategy": "drop",
        # Constrain the factors to be non-negative.
        "nonnegative": True,
    }

    return ALS(**als_params).fit(train_df)

Model evaluation

In [None]:
def evaluate_model(model, test_df):
    """Score the held-out ratings and measure the error as RMSE.

    Args:
        model: Fitted model exposing ``transform``.
        test_df: Held-out ratings DataFrame with a ``rating`` column.

    Returns:
        A ``(rmse, predictions)`` tuple: the root mean squared error between
        ``rating`` and ``prediction``, and the scored DataFrame after rows
        containing null/NaN values were removed.
    """
    # Remove rows with missing values that scoring might have introduced.
    scored = model.transform(test_df).dropna()

    rmse_evaluator = RegressionEvaluator(
        labelCol="rating",
        predictionCol="prediction",
        metricName="rmse",
    )

    return rmse_evaluator.evaluate(scored), scored

Generate recommendations for a user

In [1]:
def get_user_recommendations(model, user_id, num_recommendations=10):
    """Return the top recommendations for one user.

    Args:
        model: Fitted model exposing ``recommendForUserSubset``.
        user_id: Identifier of the user to recommend for.
        num_recommendations: How many recommendations to request.

    Returns:
        The DataFrame returned by ``recommendForUserSubset``.
    """
    # The model is queried with a DataFrame of users, so wrap the single id
    # in a one-row frame.
    single_user_df = spark.createDataFrame([(user_id,)], ["user_id"])
    return model.recommendForUserSubset(single_user_df, num_recommendations)


Generate recommendations for an anime (the top users predicted to rate it highest)

In [None]:
def get_anime_recommendations(model, anime_id, num_recommendations=10):
    """Return the top users recommended for one anime.

    Args:
        model: Fitted model exposing ``recommendForItemSubset``.
        anime_id: Identifier of the anime to find users for.
        num_recommendations: How many users to request.

    Returns:
        The DataFrame returned by ``recommendForItemSubset``.
    """
    # The model is queried with a DataFrame of items, so wrap the single id
    # in a one-row frame.
    single_anime_df = spark.createDataFrame([(anime_id,)], ["anime_id"])
    return model.recommendForItemSubset(single_anime_df, num_recommendations)

Main function to run the recommendation system

In [None]:
def _show_user_recommendations(user_recs, full_df, limit=10):
    """Print a user's recommendations, with anime names when available."""
    from pyspark.sql.functions import explode

    # Each row carries a nested ``recommendations`` array; explode it so that
    # every recommendation becomes its own row.
    flattened = user_recs.select(
        "user_id",
        explode("recommendations").alias("rec"),
    ).select(
        "user_id",
        col("rec.anime_id").alias("anime_id"),
        col("rec.rating").alias("predicted_rating"),
    )

    # Attach names only when the source data has them; otherwise still show
    # the ids and predicted ratings instead of printing nothing.
    if "anime_name" in full_df.columns:
        anime_names = full_df.select("anime_id", "anime_name").distinct()
        flattened = flattened.join(anime_names, on="anime_id")

    print("Top recommended anime:")
    flattened.orderBy(col("predicted_rating").desc()).show(limit, truncate=False)


# Main function to run the recommendation system
def run_anime_recommendation_system(filepath, user_id_to_recommend=None):
    """Run the full pipeline: load, split, train, evaluate and recommend.

    Progress, dataset statistics and the test RMSE are printed along the way.

    Args:
        filepath: Path of the ratings CSV file.
        user_id_to_recommend: User to print recommendations for. ``None``
            (the default) skips the recommendation step; any other value,
            including ``0``, is treated as a user id.

    Returns:
        A ``(model, full_df, ratings_df)`` tuple: the fitted model, the
        DataFrame as loaded, and the ratings projection used for training.
    """
    print("Loading data...")
    full_df, ratings_df = load_anime_data(filepath)

    print(f"Total ratings: {ratings_df.count()}")
    print(f"Unique users: {ratings_df.select('user_id').distinct().count()}")
    print(f"Unique anime: {ratings_df.select('anime_id').distinct().count()}")

    print("Splitting data into training and test sets...")
    train_df, test_df = split_data(ratings_df)

    print("Training ALS model...")
    model = train_als_model(train_df)

    print("Evaluating model...")
    rmse, _ = evaluate_model(model, test_df)
    print(f"Root Mean Squared Error (RMSE): {rmse}")

    # Compare against None explicitly: a plain truthiness test would
    # silently skip a legitimate user id of 0.
    if user_id_to_recommend is not None:
        print(f"Generating recommendations for user {user_id_to_recommend}...")
        user_recs = get_user_recommendations(model, user_id_to_recommend)
        _show_user_recommendations(user_recs, full_df)

    return model, full_df, ratings_df

Example use case

In [None]:
if __name__ == "__main__":
    # Placeholder for path
    filepath = "anime_ratings_2023.csv"

    try:
        model, full_df, ratings_df = run_anime_recommendation_system(
            filepath, user_id_to_recommend=123
        )

        # Optional: Save the model. Going through the writer with overwrite()
        # lets the script be re-run; a plain save() fails once the target
        # path already exists.
        model.write().overwrite().save("anime_als_model")
    finally:
        # Release the Spark session even when loading, training or saving
        # raised an error.
        spark.stop()

Completed tasks:
- Setting up the recommendation system models
- Reading the dataset and performing the train/test split
- Explaining the reasons for choosing the ALS model

Remaining tasks:
- Integrating Spark with the Redis database
- Solving the problem of handling real-time data in Redis while using ALS with micro-batch processing