In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc, round, explode, split, trim

# Create (or reuse) the Spark session that drives every report below.
spark = SparkSession.builder \
    .appName("AnimeAnalysis") \
    .getOrCreate()

# Load the dataset; no schema is inferred, so every column is a string.
# Quoted fields escape an embedded quote by doubling it ("").
# multiLine is needed because quoted free-text fields (e.g. the synopsis)
# can span several physical lines. Without it Spark treats each physical
# line as its own record, and fragments of the text end up in the
# title/genres/score columns.
df = spark.read.option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .option("multiLine", "true") \
    .csv("anime_recommendation_dataset.csv")

df.printSchema()

def split_genres_into_tuples(row):
    """Expand one row into a list of ``(title, genre)`` pairs.

    ``row`` must expose ``title`` and ``genres`` attributes, where ``genres``
    is a comma-separated string. Each genre token is stripped of surrounding
    whitespace. A missing or empty ``genres`` value yields an empty list.
    """
    anime_title, raw_genres = row.title, row.genres
    if not raw_genres:
        return []
    return [(anime_title, piece.strip()) for piece in raw_genres.split(",")]

# Expand every row into (title, genre) pairs. This RDD feeds three separate
# actions below (take, distinct/collect, reduceByKey/collect), so cache it
# to avoid re-reading and re-splitting the source data for each one.
anime_rdd = df.rdd
genre_tuples_rdd = anime_rdd.flatMap(split_genres_into_tuples).cache()

print("=== First 10 Genre-Title Pairs ===")
first_10_genres = genre_tuples_rdd.take(10)
for title, genre in first_10_genres:
    print(f"{title}: {genre}")

unique_genres = genre_tuples_rdd.map(lambda x: x[1]).distinct().collect()

# Drop empty tokens and tokens starting with a digit (numeric values, not
# genre names) BEFORE reporting the total, so the count in the header
# matches the list printed underneath it.
unique_genres = [genre for genre in unique_genres if genre and not genre[0].isdigit()]
print(f"\n=== Unique Genres ({len(unique_genres)} total) ===")
print(sorted(unique_genres))

# Classic word-count pattern: one (genre, 1) pair per anime, summed per genre.
genre_count_rdd = genre_tuples_rdd.map(lambda x: (x[1], 1)).reduceByKey(lambda a, b: a + b)
print("\n=== Number of Anime per Genre ===")
genre_counts = genre_count_rdd.collect()

# Same junk-token filter as for the unique-genre list; then list the
# genres from most to least frequent.
genre_counts = [(genre, genre_count_value) for genre, genre_count_value in genre_counts if genre and not genre[0].isdigit()]
for genre, genre_count_value in sorted(genre_counts, key=lambda x: x[1], reverse=True):
    print(f"{genre}: {genre_count_value} anime")


print("\n=== Top 5 Highest Rated Anime ===")

# Keep only rows that carry a score, add a numeric copy of it (the CSV
# column is a string), and show the five largest values.
scored_anime = df.select("title", "score", "genres").where(col("score").isNotNull())
top_rated = (
    scored_anime
    .withColumn("score_float", col("score").cast("float"))
    .sort(col("score_float").desc())
    .limit(5)
)

top_rated.show(truncate=False)


print("\n=== Average Score per Genre ===")

# One row per (anime, genre): split the comma-separated genres column,
# explode it, and trim whitespace around each token.
# The score column is a string, so cast first and filter on the cast result.
# A value that is non-null but not numeric becomes null after the cast; if
# only the raw string were checked, such rows would be counted in
# anime_count without contributing to the average.
genre_df = df.withColumn("genre", explode(split(col("genres"), ","))) \
    .withColumn("genre", trim(col("genre"))) \
    .withColumn("score_float", col("score").cast("float")) \
    .filter(col("score_float").isNotNull())

# Mean score (2 decimals) and number of scored anime per genre, best first.
avg_score_by_genre = genre_df.groupBy("genre") \
    .agg(round(avg("score_float"), 2).alias("average_score"),
         count("title").alias("anime_count")) \
    .orderBy(desc("average_score"))

avg_score_by_genre.show(truncate=False)

print("\n=== Average Number of Episodes per Genre ===")
# Built from df rather than genre_df: genre_df only keeps rows that have a
# score, which would silently exclude unscored anime from a statistic that
# has nothing to do with scores.
# episodes is a string column, so cast first and filter on the cast result
# so that non-numeric values are not counted in anime_count.
episodes_by_genre = df.withColumn("genre", explode(split(col("genres"), ","))) \
    .withColumn("genre", trim(col("genre"))) \
    .withColumn("episodes_float", col("episodes").cast("float")) \
    .filter(col("episodes_float").isNotNull()) \
    .groupBy("genre") \
    .agg(round(avg("episodes_float"), 1).alias("avg_episodes"),
         count("title").alias("anime_count")) \
    .orderBy(desc("avg_episodes"))

episodes_by_genre.show(truncate=False)

print("\n=== Genre with Highest Average Score (min 5 anime) ===")
# Ignore genres backed by fewer than five anime, then take the single
# best-scoring one. first() returns None when no genre qualifies.
eligible_genres = avg_score_by_genre.where(col("anime_count") >= 5)
top_genre = eligible_genres.sort(col("average_score").desc()).first()

if top_genre:
    print(f"Genre: {top_genre['genre']}")
    print(f"Average Score: {top_genre['average_score']}")
    print(f"Number of Anime: {top_genre['anime_count']}")

print("\n=== Most Common Genre Combinations ===")
# The raw, unsplit genres string is the combination key: count titles per
# distinct string and keep the ten most frequent.
combination_counts = df.groupBy("genres").agg(count("title").alias("count"))
genre_combinations = combination_counts.sort(col("count").desc()).limit(10)

genre_combinations.show(truncate=False)

# All reports are printed; shut the Spark session down.
spark.stop()

root
 |-- title: string (nullable = true)
 |-- synopsis: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- score: string (nullable = true)
 |-- characters: string (nullable = true)

=== First 10 Genre-Title Pairs ===
While traveling: the beautiful and ridiculously poor gambler
(Source: Anime News Network)": 1.0
Oddly enough: there's no proof that he's ever taken a life. In fact
(Source: Funimation)": 26.0
(Source: Funimation)": 26.0
Bouken Ou Beet: Adventure
Bouken Ou Beet: Fantasy
Bouken Ou Beet: Supernatural
(Source: Sentai Filmworks)": 145.0
(Source: Anime News Network)": 24.0

=== Unique Genres (111 total) ===
['Action', 'Adventure', 'Aya Mikage is a descendent of Ceres', 'Ayato Kamina meets Reika Mishima', 'Comedy', 'Dr. Tenma\'s beliefs are shaken as his actions that night are shown to have much broader consequences than he could have imagined. Leaving behind his life as a surgeon he embarks on a journey across the country