In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, lit, split, array
from pyspark.ml.linalg import DenseVector
import os
import pickle
import shutil

In [2]:
spark = SparkSession.builder \
    .appName("Content-Based Filtering") \
    .getOrCreate()

In [3]:
data_path = "./data/cleaned/anime_data_cleaned.csv"
data = spark.read.csv(data_path, header=True, inferSchema=True)
data.show(5)

+---+--------------------+--------------------+--------------------+-------+------+-------+----+--------------------+--------------------+-------+-------+----+------------+
| id|               title|                link|               image|episode| views|quality|year|              genres|         description| season|   type|rate|nums_of_vote|
+---+--------------------+--------------------+--------------------+-------+------+-------+----+--------------------+--------------------+-------+-------+----+------------+
|  1|5-toubun no Hanay...|https://animeviet...|https://cdn.anime...|      1|235236|     HD|2024|Shounen, School, ...|Chuyến đi tuần tr...|Mùa thu|phim lẻ| 9.6|         301|
|  2|Overlord Movie 3:...|https://animeviet...|https://cdn.anime...|      1|687666|    CAM|2024|Fantasy, Adventur...|Vương quốc thiêng...|Mùa thu|phim lẻ| 9.4|         115|
|  3|Digimon Adventure...|https://animeviet...|https://cdn.anime...|      1| 27447|     HD|2023|Fantasy, Drama, C...|Hai năm sau trận .

In [4]:
data = data.dropna(subset=['link', 'title', 'genres', 'rate'])
data = data.withColumnRenamed('link', 'item_id')

In [5]:
data = data.withColumn("rate", col("rate").cast("double"))

data = data.withColumn("genres", split(col("genres"), ","))

In [6]:
# Dùng CountVectorizer chuyển thể loại thành vector
vectorizer = CountVectorizer(inputCol="genres", outputCol="genre_vector")
count_vectorizer_model = vectorizer.fit(data)

In [7]:
item_indexer = StringIndexer(inputCol="item_id", outputCol="item_index")

In [8]:
preprocessing_pipeline = Pipeline(stages=[vectorizer, item_indexer])
preprocessed_pipeline_model = preprocessing_pipeline.fit(data)
preprocessed_data = preprocessed_pipeline_model.transform(data)

### xuất model

In [9]:
# # Lưu mô hình đã huấn luyện
# model_path = "./models/genre_vectorizer"
# count_vectorizer_model.save(model_path)
# print(f"Model saved to {model_path}")

In [10]:
def recommend_by_title(title, top_n=10):
    item_data = preprocessed_data.filter(col("title") == title).select("item_id", "genre_vector", "genres").first()

    if not item_data:
        print(f"No anime found with title: {title}")
        return

    item_id = item_data["item_id"]
    genre_vector = DenseVector(item_data["genre_vector"].toArray())

    broadcast_genre_vector = spark.sparkContext.broadcast(genre_vector)

    def calculate_similarity(row):
        target_vector = broadcast_genre_vector.value
        row_vector = DenseVector(row.genre_vector.toArray())
        dot_product = sum(target_vector[i] * row_vector[i] for i in range(len(target_vector)))
        norm_target = sum(x ** 2 for x in target_vector) ** 0.5
        norm_row = sum(x ** 2 for x in row_vector) ** 0.5
        similarity = dot_product / (norm_target * norm_row) if norm_target and norm_row else 0.0
        return row.title, row.item_id, row.genres, similarity

    similar_items_rdd = preprocessed_data.rdd.map(calculate_similarity)

    similar_items = similar_items_rdd.filter(lambda x: x[1] != item_id).takeOrdered(top_n, key=lambda x: -x[3])

    for title, item_id, genres, similarity in similar_items:
        print(f"Title: {title}, Link: {item_id}, Genres: {genres}, Similarity: {similarity}")


#### Ví dụ <br>
1. Naruto
2. Bảy Viên Ngọc Rồng Heroes
3. Gamers!
4. Hội Chứng Tuổi Thanh Xuân Movie

In [11]:
# Example usage
recommend_by_title("Naruto")

Title: Boruto: Đứa Con Ngỗ Nghịch Của Naruto, Link: https://animevietsub.page/phim/boruto-dua-con-ngo-nghich-cua-naruto-r1-a529/, Genres: ['Shounen', ' Super Power', ' Comedy', ' Martial Arts', ' Action'], Similarity: 0.9999999999999998
Title: Naruto: Tòa tháp bị mất, Link: https://animevietsub.page/phim/naruto-toa-thap-bi-mat-i1-a524/, Genres: ['Shounen', ' Super Power', ' Comedy', ' Martial Arts', ' Action'], Similarity: 0.9999999999999998
Title: Naruto: Người kế thừa Hỏa chí, Link: https://animevietsub.page/phim/naruto-nguoi-ke-thua-hoa-chi-i1-a523/, Genres: ['Shounen', ' Super Power', ' Comedy', ' Martial Arts', ' Action'], Similarity: 0.9999999999999998
Title: Naruto: Sức Mạnh Vĩ Thú, Link: https://animevietsub.page/phim/naruto-suc-manh-vi-thu-r2-a5/, Genres: ['Shounen', ' Super Power', ' Comedy', ' Martial Arts', ' Action'], Similarity: 0.9999999999999998
Title: Bảy Viên Ngọc Rồng Heroes, Link: https://animevietsub.page/phim/bay-vien-ngoc-rong-heroes-a3290/, Genres: ['Shounen', '

In [12]:
# Chia tập dữ liệu thành train và test
train_data, test_data = preprocessed_data.randomSplit([0.8, 0.2], seed=42)
print("Train and Test split completed.")

Train and Test split completed.


In [13]:
from pyspark.sql.functions import array_contains

def precision_at_k(test_data, k=10):
    """
    Tính Precision@K cho mô hình gợi ý.
    """
    correct_predictions = 0
    total_predictions = 0

    for title in test_data.select("title").distinct().rdd.map(lambda row: row.title).collect():
        item_data = train_data.filter(col("title") == title).select("item_id", "genre_vector").first()
        if not item_data:
            continue

        # Lấy danh sách gợi ý
        genre_vector = DenseVector(item_data["genre_vector"].toArray())
        broadcast_genre_vector = spark.sparkContext.broadcast(genre_vector)

        def calculate_similarity(row):
            target_vector = broadcast_genre_vector.value
            row_vector = DenseVector(row.genre_vector.toArray())
            dot_product = sum(target_vector[i] * row_vector[i] for i in range(len(target_vector)))
            norm_target = sum(x ** 2 for x in target_vector) ** 0.5
            norm_row = sum(x ** 2 for x in row_vector) ** 0.5
            similarity = dot_product / (norm_target * norm_row) if norm_target and norm_row else 0.0
            return row.title, similarity

        recommendations = train_data.rdd.map(calculate_similarity) \
                                        .filter(lambda x: x[0] != title) \
                                        .takeOrdered(k, key=lambda x: -x[1])
        
        # Danh sách kết quả gợi ý
        recommended_titles = [rec[0] for rec in recommendations]

        # Kiểm tra các anime trong test_data có được gợi ý hay không
        test_items = test_data.filter(col("title") == title).select("genres").rdd.flatMap(lambda x: x.genres).collect()
        for rec_title in recommended_titles:
            rec_genres = train_data.filter(col("title") == rec_title).select("genres").rdd.flatMap(lambda x: x.genres).collect()
            if any(genre in test_items for genre in rec_genres):
                correct_predictions += 1

        total_predictions += len(recommended_titles)

    precision = correct_predictions / total_predictions if total_predictions > 0 else 0.0
    print(f"Precision@{k}: {precision:.4f}")
    return precision

# Đánh giá Precision@10
precision_at_k(test_data, k=10)


Precision@10: 0.7500


0.75