In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, lit, split, array
from pyspark.ml.linalg import DenseVector
import os
import pickle
import shutil

In [11]:
# Create the Spark session used by every cell below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Content-Based Filtering")
    .getOrCreate()
)

In [12]:
# Load the cleaned anime catalogue. The first CSV line is the header, and
# column types are inferred from the values rather than declared up front.
data_path = "./data/cleaned/anime_data_cleaned.csv"
data = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)
data.show(5)

+--------------------+--------------------+--------------------+-------+--------------------+------+-------+----+--------------------+--------------------+-------+-------+----+------------+
|               title|                link|               image|episode|              rating| views|quality|year|              genres|         description| season|   type|rate|nums_of_vote|
+--------------------+--------------------+--------------------+-------+--------------------+------+-------+----+--------------------+--------------------+-------+-------+----+------------+
|5-toubun no Hanay...|https://animeviet...|https://cdn.anime...|      1|9.6 trong số 10 d...|235236|     HD|2024|Shounen, School, ...|Chuyến đi tuần tr...|Mùa thu|phim lẻ| 9.6|         301|
|Overlord Movie 3:...|https://animeviet...|https://cdn.anime...|      1|9.4 trong số 10 d...|687666|    CAM|2024|Fantasy, Adventur...|Vương quốc thiêng...|Mùa thu|phim lẻ| 9.4|         115|
|Digimon Adventure...|https://animeviet...|https:/

In [13]:
# Discard rows missing any of link/title/genres/rate, then rename the page
# link to item_id, the name the recommender cells below refer to.
data = (
    data
    .dropna(subset=['link', 'title', 'genres', 'rate'])
    .withColumnRenamed('link', 'item_id')
)

In [14]:
# Make sure the numeric rating is a double regardless of what inferSchema chose.
data = data.withColumn("rate", col("rate").cast("double"))

# Turn the comma-separated genre string into an array of genre tokens.
# split() takes a regular expression: the whitespace around each comma is
# consumed together with the comma, so "Shounen, School" yields
# ["Shounen", "School"] rather than ["Shounen", " School"]. Splitting on a bare
# "," would leave a leading space on every token but the first, and
# CountVectorizer would then treat "School" and " School" as different genres.
data = data.withColumn("genres", split(col("genres"), r"\s*,\s*"))

In [15]:
# Use CountVectorizer to turn each title's genre list into a count vector:
# fitting learns the vocabulary from the "genres" column, and the fitted model
# writes its vectors to "genre_vector".
vectorizer = CountVectorizer(
    inputCol="genres",
    outputCol="genre_vector",
)
count_vectorizer_model = vectorizer.fit(data)

In [16]:
# Estimator that maps each item_id string to a numeric index in "item_index".
item_indexer = StringIndexer(
    inputCol="item_id",
    outputCol="item_index",
)

In [17]:
# Build the preprocessing pipeline from the CountVectorizer model that was
# already fitted above, not from the unfitted `vectorizer` estimator.
# A Pipeline accepts fitted transformers as stages, so this avoids learning the
# vocabulary a second time on the same data and guarantees that the model
# object saved to disk is the very one that produced `genre_vector` here.
preprocessing_pipeline = Pipeline(stages=[count_vectorizer_model, item_indexer])
preprocessed_pipeline_model = preprocessing_pipeline.fit(data)
preprocessed_data = preprocessed_pipeline_model.transform(data)

### Xuất mô hình

In [18]:
# Persist the fitted CountVectorizer model.
# write().overwrite() replaces an existing model directory; a plain save()
# raises when the path already exists, which made this cell fail on every
# re-run of the notebook.
model_path = "./models/genre_vectorizer"
count_vectorizer_model.write().overwrite().save(model_path)
print(f"Model saved to {model_path}")

Model saved to ./models/genre_vectorizer


In [19]:
def recommend_by_title(title, top_n=10):
    """Print the ``top_n`` items whose genre vectors are closest to ``title``.

    The first row of ``preprocessed_data`` whose ``title`` column equals
    ``title`` is used as the reference item. Every other row (rows with a
    different ``item_id``) is scored by the cosine similarity between its
    ``genre_vector`` and the reference vector, and the highest-scoring rows
    are printed one per line.

    Args:
        title: Exact title to look up in ``preprocessed_data``.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. Results are printed; when no row matches ``title`` a message is
        printed instead.
    """
    item_data = (
        preprocessed_data
        .filter(col("title") == title)
        .select("item_id", "genre_vector", "genres")
        .first()
    )

    # first() yields None when the filter matched nothing.
    if item_data is None:
        print(f"No anime found with title: {title}")
        return

    item_id = item_data["item_id"]
    target_vector = item_data["genre_vector"]
    # The reference norm is the same for every row, so compute it once on the
    # driver instead of once per row inside the map.
    target_norm = float(target_vector.norm(2))

    # Broadcast the reference genre vector to all workers.
    broadcast_genre_vector = spark.sparkContext.broadcast(target_vector)

    def calculate_similarity(row):
        """Return (title, item_id, genres, cosine similarity) for one row."""
        row_vector = row.genre_vector
        row_norm = float(row_vector.norm(2))
        if target_norm and row_norm:
            # The vectors' own dot() works directly on the stored vector, so
            # no dense copy or per-element Python loop is needed.
            dot_product = float(broadcast_genre_vector.value.dot(row_vector))
            similarity = dot_product / (target_norm * row_norm)
        else:
            # A zero vector has no direction; treat it as not similar.
            similarity = 0.0
        return row.title, row.item_id, row.genres, similarity

    try:
        # Drop the reference item before scoring and keep only the columns the
        # scorer reads, so less data is shipped through the RDD.
        candidates = (
            preprocessed_data
            .filter(col("item_id") != item_id)
            .select("title", "item_id", "genres", "genre_vector")
        )
        # Negating the score makes takeOrdered return the highest similarities.
        similar_items = (
            candidates.rdd
            .map(calculate_similarity)
            .takeOrdered(top_n, key=lambda x: -x[3])
        )
    finally:
        # Release the broadcast so repeated calls do not accumulate copies.
        broadcast_genre_vector.destroy()

    # Distinct loop names: the original loop rebound `title` and `item_id`.
    for similar_title, similar_id, genres, similarity in similar_items:
        print(f"Title: {similar_title}, Link: {similar_id}, Genres: {genres}, Similarity: {similarity}")


#### Ví dụ <br>
1. Naruto
2. Bảy Viên Ngọc Rồng Heroes
3. Gamers!
4. Hội Chứng Tuổi Thanh Xuân Movie

In [None]:
# Demo: print recommendations for one of the example titles listed above.
recommend_by_title(title="Naruto")