<a href="https://colab.research.google.com/github/ItsRickyBoi/Anime-Recommender/blob/main/Anime_Recommender_with_N_fold_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-surprise




In [None]:
!pip install opendatasets



In [None]:
import opendatasets as od

In [None]:
# Fetch the Kaggle "anime-recommendations-database" dataset; the CSV files read
# later in this notebook live in the ./anime-recommendations-database folder.
od.download("https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database")

Skipping, found downloaded files in "./anime-recommendations-database" (use force=True to force download)


In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, NormalPredictor, KNNBasic, KNNWithMeans, SVD, accuracy
from sklearn.neighbors import NearestNeighbors
from surprise.model_selection import cross_validate, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load both CSV files; the anime catalogue is keyed by its anime_id column.
anime = pd.read_csv('anime-recommendations-database/anime.csv', index_col="anime_id")
ratings = pd.read_csv('anime-recommendations-database/rating.csv')

# Show a seeded (hence repeatable) random sample of five rating rows.
display(ratings.sample(n=5, random_state=8))

# Headline size of the ratings table: rows, distinct users, distinct animes.
print(f"""Number of total ratings: {len(ratings)}.
Number of different users: {ratings['user_id'].nunique()}.
Number of different animes: {ratings['anime_id'].nunique()}.""")

Unnamed: 0,user_id,anime_id,rating
4454487,42010,10536,-1
6673063,61547,20031,6
7566588,70874,6547,10
2048209,19831,19769,7
6175887,57778,732,9


Number of total ratings: 7813737. 
Number of different users: 73515. 
Number of different animes: 11200.


In [None]:
# Preview the first rows of the anime catalogue (indexed by anime_id).
anime.head()

Unnamed: 0_level_0,name,genre,type,episodes,rating,members
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Preview the first rows of the raw ratings table (user_id, anime_id, rating).
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


Rating berkisar dari 1 sampai 10: 1 berarti pengguna sama sekali tidak menyukai anime tersebut, dan 10 berarti pengguna sangat menyukainya.

Terdapat rating khusus -1 yang mewakili pengguna yang telah menonton anime tetapi tidak memberi rating.

In [None]:
# Clean data: keep explicit ratings only.
# A rating of -1 marks "watched but not rated". Such rows must not reach the
# model: the Reader below declares a 1-10 scale, and a -1 would otherwise be
# treated as a real (very low) rating both for training and as a "true" rating.
MAX_RATINGS = 50_000  # size of the working subset; keeps cross-validation fast

valid_ratings = ratings[ratings.rating > 0]
users_with_valid_ratings = valid_ratings['user_id'].unique()
cleaned_ratings = valid_ratings.iloc[:MAX_RATINGS, :]

# Every remaining rating must lie on the scale announced to Surprise.
assert cleaned_ratings['rating'].between(1, 10).all(), "unexpected rating outside 1-10"

# data for surprise
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(cleaned_ratings[['user_id', 'anime_id', 'rating']], reader)

In [None]:
# SVD model shared by the cells below (it is re-fitted on every fold).
# A fixed random_state makes the factor initialisation, and therefore the
# reported predictions, repeatable across runs.
svd = SVD(random_state=42)

def _shared_anime_ids(anime_sets, users):
    """Return the set of anime ids that every user in ``users`` has rated."""
    return set.intersection(*(anime_sets[user] for user in users))


# find pair user - anime
def get_common_anime_for_users(cleaned_ratings, n_users=5, n_animes=5, max_attempts=10_000):
    """Pick random users and random animes such that every user rated every anime.

    Parameters
    ----------
    cleaned_ratings : pandas.DataFrame
        Ratings with at least the columns ``user_id`` and ``anime_id``.
    n_users : int
        Number of users to draw (without replacement).
    n_animes : int
        Number of animes to draw from those rated by all drawn users.
    max_attempts : int
        Maximum number of random user groups to try before giving up.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected user ids and the selected anime ids.

    Raises
    ------
    ValueError
        If there are not enough eligible users, or no group sharing
        ``n_animes`` animes is found within ``max_attempts`` draws.
    """
    if n_users < 1:
        raise ValueError("n_users must be at least 1")

    # Only users with at least n_animes rating rows can possibly qualify.
    # transform('size') stays aligned with the rows, so .unique() below keeps
    # the users in order of first appearance.
    rows_per_user = cleaned_ratings.groupby('user_id')['user_id'].transform('size')
    eligible = cleaned_ratings[rows_per_user >= n_animes]
    users_with_ratings = eligible['user_id'].unique()
    if len(users_with_ratings) < n_users:
        raise ValueError(
            f"Only {len(users_with_ratings)} users have at least {n_animes} ratings; "
            f"cannot draw {n_users} of them."
        )

    # Build each user's anime set once instead of re-scanning the frame for
    # every user on every attempt.
    anime_sets = {
        user: set(anime_ids)
        for user, anime_ids in eligible.groupby('user_id')['anime_id']
    }

    # Redraw user groups until one shares enough animes; bounded so that an
    # unsatisfiable request fails loudly instead of looping forever.
    for _ in range(max_attempts):
        random_users = np.random.choice(users_with_ratings, n_users, replace=False)
        common_animes = _shared_anime_ids(anime_sets, random_users)
        if len(common_animes) >= n_animes:
            break
    else:
        raise ValueError(
            f"No group of {n_users} users sharing at least {n_animes} animes "
            f"was found in {max_attempts} attempts."
        )

    random_animes = np.random.choice(list(common_animes), n_animes, replace=False)
    return random_users, random_animes


In [None]:
# Draw the users and the animes that all of those users have rated
random_users, random_animes = get_common_anime_for_users(cleaned_ratings)

# Map every (user, anime) pair to the first matching rating in cleaned_ratings
true_ratings_dict = {
    (uid, aid): cleaned_ratings[
        (cleaned_ratings.user_id == uid) & (cleaned_ratings.anime_id == aid)
    ].rating.values[0]
    for uid in random_users
    for aid in random_animes
}


In [None]:
# Cross-validation setup: a 5-fold splitter and one accumulator per error metric
kf = KFold(n_splits=5)
rmse_list, mse_list, mae_list = [], [], []

# One (true_ratings, predictions) tuple is collected here for each fold
fold_predictions = []

In [None]:
# Start from empty accumulators so that re-running this cell does not
# double-count folds from a previous run.
for accumulator in (rmse_list, mse_list, mae_list, fold_predictions):
    accumulator.clear()

for fold_index, (trainset, testset) in enumerate(kf.split(data)):
    # Train model on this fold's training split
    svd.fit(trainset)

    # Only pairs held out in this fold may be scored: a pair that is part of
    # the training split was seen by the model, so scoring it would report
    # training error rather than cross-validated error.
    held_out_pairs = {(uid, iid) for uid, iid, _ in testset}

    # (user_id, anime_id, true_rating, predicted_rating) for every scored pair
    fold_rows = []
    for anime_id in random_animes:
        for user_id in random_users:
            # Use the pre-selected true rating
            true_rating = true_ratings_dict.get((user_id, anime_id), None)
            if true_rating is None or (user_id, anime_id) not in held_out_pairs:
                continue
            pred = svd.predict(user_id, anime_id).est
            fold_rows.append((user_id, anime_id, true_rating, pred))

    true_ratings = [row[2] for row in fold_rows]
    predictions = [row[3] for row in fold_rows]

    # append fold predictions
    fold_predictions.append((true_ratings, predictions))

    # Print predictions vs true ratings. The ids are stored with each row, so
    # the labels stay correct even when some pairs were skipped.
    print(f"Fold {fold_index + 1}:")
    if fold_rows:
        for user_id, anime_id, true_rating, pred in fold_rows:
            print(f"User: {user_id}, Anime: {anime_id}, True Rating: {true_rating}, Predicted Rating: {pred}")
    else:
        print("No valid ratings found.")

    # Calculate errors for this fold. RMSE is derived from MSE with np.sqrt
    # because the `squared` argument of mean_squared_error is not available in
    # current scikit-learn releases.
    if fold_rows:
        fold_mse = mean_squared_error(true_ratings, predictions)
        rmse_list.append(float(np.sqrt(fold_mse)))
        mse_list.append(fold_mse)
        mae_list.append(mean_absolute_error(true_ratings, predictions))

# The three lists always grow together, so checking one of them is enough
if rmse_list:
    # Mean errors over all folds that had at least one scored pair
    mean_rmse = np.mean(rmse_list)
    mean_mse = np.mean(mse_list)
    mean_mae = np.mean(mae_list)

    print(f"Mean RMSE: {mean_rmse:.4f}")
    print(f"Mean MSE: {mean_mse:.4f}")
    print(f"Mean MAE: {mean_mae:.4f}")
else:
    print("Tidak ada true rating dari koneksi antara user dan anime yang ditemuka di test set.")

print(f"Random Users: {random_users}")
print(f"Random Animes: {random_animes}")

Fold 1:
User: 345, Anime: 3588, True Rating: 8, Predicted Rating: 8.545949478147506
User: 446, Anime: 3588, True Rating: 6, Predicted Rating: 6.053021302437694
User: 273, Anime: 3588, True Rating: 8, Predicted Rating: 6.412610858466069
User: 540, Anime: 3588, True Rating: 10, Predicted Rating: 8.06357909382155
User: 170, Anime: 3588, True Rating: 10, Predicted Rating: 9.731624426163991
User: 345, Anime: 1535, True Rating: 10, Predicted Rating: 10
User: 446, Anime: 1535, True Rating: 9, Predicted Rating: 9.289491393715812
User: 273, Anime: 1535, True Rating: 8, Predicted Rating: 8.408063185613997
User: 540, Anime: 1535, True Rating: 10, Predicted Rating: 9.400796780442235
User: 170, Anime: 1535, True Rating: 9, Predicted Rating: 9.586445587402567
User: 345, Anime: 10620, True Rating: 8, Predicted Rating: 7.873252757995722
User: 446, Anime: 10620, True Rating: 7, Predicted Rating: 6.852732820766009
User: 273, Anime: 10620, True Rating: 8, Predicted Rating: 7.567253466783681
User: 540, An

**Rata-rata RMSE** : akar dari rata-rata selisih kuadrat; secara kasar, prediksi meleset sekitar X poin rating dari rating sebenarnya, dengan kesalahan besar diberi bobot lebih berat.

**Rata-rata MSE** : rata-rata selisih kuadrat antara rating yang diprediksi dan rating sebenarnya adalah X. Ini adalah ukuran seberapa jauh prediksi tersebar dari rating sebenarnya.

**Rata-rata MAE** : rata-rata selisih absolut; prediksi meleset sekitar X poin rating dari rating sebenarnya.

In [None]:
# Function to get user's top 5 favorite genres based on true ratings
def get_favorite_genres(user_id, cleaned_ratings, anime, min_rating=7, top_n=5):
    """Return the genres that occur most often among a user's highly rated animes.

    Parameters
    ----------
    user_id : int
        User whose ratings are inspected.
    cleaned_ratings : pandas.DataFrame
        Ratings with the columns ``user_id``, ``anime_id`` and ``rating``.
    anime : pandas.DataFrame
        Catalogue indexed by anime id, with a comma-separated ``genre`` column.
    min_rating : int
        Lowest rating that counts as "highly rated".
    top_n : int
        Maximum number of genres to return.

    Returns
    -------
    list of str
        Up to ``top_n`` genre names, most frequent first. The list is empty
        when the user has no highly rated anime with a known genre.
    """
    is_liked = (cleaned_ratings.user_id == user_id) & (cleaned_ratings.rating >= min_rating)
    liked_ids = cleaned_ratings.loc[is_liked, 'anime_id']

    # Looking up a label that is missing from the catalogue index makes .loc
    # raise KeyError, so drop rated ids the catalogue does not contain.
    liked_ids = liked_ids[liked_ids.isin(anime.index)]

    genre_counts = (
        anime.loc[liked_ids, 'genre']
        .dropna()            # animes without genre information carry no signal
        .str.split(', ')
        .explode()
        .value_counts()
    )
    return genre_counts.head(top_n).index.tolist()

In [None]:
# Function to recommend animes based on favorite genres and predict ratings
def recommend_and_predict_animes(user_id, svd, anime, cleaned_ratings, favorite_genres, top_n=5):
    """Recommend unrated animes from the user's favorite genres, best prediction first.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    svd : object
        Model whose ``predict(user_id, anime_id)`` result has an ``est`` attribute.
    anime : pandas.DataFrame
        Catalogue indexed by anime id, with a comma-separated ``genre`` column.
    cleaned_ratings : pandas.DataFrame
        Ratings with the columns ``user_id`` and ``anime_id``; animes the user
        already rated are never recommended.
    favorite_genres : list of str
        Genres of which a candidate must carry at least one.
    top_n : int
        Number of recommendations wanted.

    Returns
    -------
    list of (anime_id, predicted_rating)
        The ``top_n`` highest predictions, or an empty list when fewer than
        ``top_n`` candidates exist.
    """
    wanted_genres = {genre.strip() for genre in favorite_genres}
    rated_animes = set(cleaned_ratings.loc[cleaned_ratings.user_id == user_id, 'anime_id'])

    # One genre string per anime id, walked in catalogue order so that ties
    # between equal predictions are always broken the same way.
    genre_by_anime = anime.loc[~anime.index.duplicated(), 'genre']

    predictions = []
    for anime_id, anime_genres in genre_by_anime.items():
        if anime_id in rated_animes or not isinstance(anime_genres, str):
            continue
        # Compare whole genre names: a substring test on the raw string would
        # let "Shounen" match "Shounen Ai".
        anime_genre_names = {name.strip() for name in anime_genres.split(',')}
        if wanted_genres & anime_genre_names:
            pred = svd.predict(user_id, anime_id).est
            predictions.append((anime_id, pred))

    # Sort the predictions by rating in descending order
    predictions.sort(key=lambda item: item[1], reverse=True)

    # All-or-nothing: callers treat an empty list as "cannot recommend"
    top_animes = predictions[:top_n]
    if len(top_animes) < top_n:
        return []

    return top_animes

In [None]:
# Function to find a valid user who meets the criteria
def find_valid_user(cleaned_ratings, anime, top_n=5, min_rating=7):
    """Pick a random user for whom ``top_n`` genre-based recommendations exist.

    Users are tried in random order, each at most once. A user qualifies when
    they have at least one favorite genre and ``recommend_and_predict_animes``
    returns a non-empty list for them. Predictions come from the module-level
    ``svd`` model (presumably it must already be fitted - confirm against the
    calling cell).

    Parameters
    ----------
    cleaned_ratings : pandas.DataFrame
        Ratings with the columns ``user_id``, ``anime_id`` and ``rating``.
    anime : pandas.DataFrame
        Catalogue indexed by anime id, with a ``genre`` column.
    top_n : int
        Number of favorite genres and of recommendations required.
    min_rating : int
        Lowest rating that counts towards a favorite genre.

    Returns
    -------
    (user_id, list of str)
        The chosen user and their favorite genres, or ``(None, [])`` when no
        user qualifies.
    """
    candidate_users = cleaned_ratings['user_id'].unique()

    # A single shuffle visits every user exactly once, instead of re-drawing
    # until an unseen id happens to come up.
    for user_id in np.random.permutation(candidate_users):
        favorite_genres = get_favorite_genres(user_id, cleaned_ratings, anime, min_rating, top_n)
        if not favorite_genres:
            continue

        top_animes = recommend_and_predict_animes(user_id, svd, anime, cleaned_ratings, favorite_genres, top_n)
        if top_animes:
            return user_id, favorite_genres

    # Nobody qualified: do not hand back the genres of the last user checked
    return None, []

In [None]:
def _mean_rating_by_others(cleaned_ratings, anime_id, excluded_user):
    """Mean explicit rating that users other than ``excluded_user`` gave ``anime_id`` (None if there is none)."""
    others = cleaned_ratings[
        (cleaned_ratings.anime_id == anime_id)
        & (cleaned_ratings.user_id != excluded_user)
        & (cleaned_ratings.rating > 0)  # -1 means "watched, not rated"; it is not a score
    ]
    return others['rating'].mean() if not others.empty else None


def _print_recommendations(recommendations):
    """Print one numbered line per (anime_id, genre, predicted_rating, true_rating) tuple."""
    for i, (anime_id, genre, predicted_rating, true_rating) in enumerate(recommendations):
        print(f"{i+1}. Anime ID: {anime_id}, Genre: {genre}, Predicted Rating: {predicted_rating:.2f}")


# Function to test recommendations and calculate errors using N-fold cross-validation
def test_recommendations_by_genre_cv(data, anime, cleaned_ratings, top_n=5, n_splits=5, min_rating=7):
    """Recommend animes to one random user on every CV fold and report the errors.

    For each fold the module-level ``svd`` model is re-fitted on the training
    split and the user's ``top_n`` genre-based recommendations are computed.
    The user has not rated the recommended animes, so each prediction is
    compared with the mean rating that *other* users gave that anime. This is
    a proxy for the true rating, and the fold's test split is not used.

    Parameters
    ----------
    data : surprise Dataset
        Dataset that ``KFold.split`` is applied to.
    anime : pandas.DataFrame
        Catalogue indexed by anime id, with a ``genre`` column.
    cleaned_ratings : pandas.DataFrame
        Ratings with the columns ``user_id``, ``anime_id`` and ``rating``.
    top_n : int
        Number of recommendations per fold.
    n_splits : int
        Number of cross-validation folds.
    min_rating : int
        Lowest rating that counts towards a favorite genre.

    Returns
    -------
    (user_id, list of str)
        The evaluated user and their favorite genres, or ``(None, [])`` when
        no suitable user exists.
    """
    kf = KFold(n_splits=n_splits)

    user_rmse_list = []
    user_mse_list = []
    user_mae_list = []

    valid_user, favorite_genres = find_valid_user(cleaned_ratings, anime, top_n, min_rating)

    # `is None` rather than truthiness, so that a user id of 0 is accepted
    if valid_user is None:
        print("No valid user found.")
        # Keep the return shape stable: callers unpack two values
        return None, []

    best_fold_index = -1
    best_fold_rmse = float('inf')
    best_fold_mse = float('inf')
    best_fold_mae = float('inf')
    best_fold_recommendations = []

    for fold_index, (trainset, testset) in enumerate(kf.split(data)):
        # Train the model
        svd.fit(trainset)

        # Recommend animes based on favorite genres
        top_animes = recommend_and_predict_animes(valid_user, svd, anime, cleaned_ratings, favorite_genres, top_n)
        if not top_animes:
            continue

        # One record per recommended anime, so that ids, genres and ratings
        # cannot drift out of step when an anime has no proxy rating.
        fold_recommendations = [
            (
                anime_id,
                anime.loc[anime_id, 'genre'],
                predicted_rating,
                _mean_rating_by_others(cleaned_ratings, anime_id, valid_user),
            )
            for anime_id, predicted_rating in top_animes
        ]
        scored = [record for record in fold_recommendations if record[3] is not None]

        if not scored:
            print(f"\nFold {fold_index + 1}: No valid ratings found for this fold.")
            continue

        predicted_ratings = [record[2] for record in scored]
        true_ratings = [record[3] for record in scored]

        # RMSE is derived from MSE with np.sqrt because the `squared` argument
        # of mean_squared_error is not available in current scikit-learn.
        mse = mean_squared_error(true_ratings, predicted_ratings)
        rmse = float(np.sqrt(mse))
        mae = mean_absolute_error(true_ratings, predicted_ratings)

        user_rmse_list.append(rmse)
        user_mse_list.append(mse)
        user_mae_list.append(mae)

        # Update best fold based on RMSE, MSE, and MAE
        if rmse < best_fold_rmse and mse < best_fold_mse and mae < best_fold_mae:
            best_fold_rmse = rmse
            best_fold_mse = mse
            best_fold_mae = mae
            best_fold_index = fold_index
            best_fold_recommendations = fold_recommendations

        # Print this fold's own recommendations and errors
        print(f"\nFold {fold_index + 1}: Top {top_n} anime recommendations for user {valid_user} based on favorite genres:")
        _print_recommendations(fold_recommendations)

        print(f"\nErrors for user {valid_user} on top {top_n} recommendations:")
        print(f"RMSE: {rmse:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"MAE: {mae:.4f}")

    # Calculate and print mean errors across all folds
    if user_rmse_list:
        mean_rmse = np.mean(user_rmse_list)
        mean_mse = np.mean(user_mse_list)
        mean_mae = np.mean(user_mae_list)

        print(f"\nMean Errors for user {valid_user} across {kf.n_splits} folds on top {top_n} recommendations:")
        print(f"Mean RMSE: {mean_rmse:.4f}")
        print(f"Mean MSE: {mean_mse:.4f}")
        print(f"Mean MAE: {mean_mae:.4f}")

        # Print best fold recommendations
        print(f"\nBest fold was fold {best_fold_index + 1} with RMSE: {best_fold_rmse:.4f}, MSE: {best_fold_mse:.4f}, MAE: {best_fold_mae:.4f}")
        print(f"Final top {top_n} anime recommendations for user {valid_user}:")
        _print_recommendations(best_fold_recommendations)
    else:
        print(f"No true ratings found for user {valid_user} in the test sets for the top {top_n} recommendations.")

    return valid_user, favorite_genres



In [None]:
# Example: evaluate genre-based recommendations for a randomly chosen valid user
chosen_user, favorite_genres = test_recommendations_by_genre_cv(
    data=data,
    anime=anime,
    cleaned_ratings=cleaned_ratings,
    top_n=5,
)


Fold 1: Top 5 anime recommendations for user 300 based on favorite genres:
1. Anime ID: 4181, Genre: Drama, Fantasy, Romance, Slice of Life, Supernatural, Predicted Rating: 10.00
2. Anime ID: 11741, Genre: Action, Fantasy, Supernatural, Thriller, Predicted Rating: 10.00
3. Anime ID: 16498, Genre: Action, Drama, Fantasy, Shounen, Super Power, Predicted Rating: 10.00
4. Anime ID: 6746, Genre: Action, Mystery, Supernatural, Predicted Rating: 9.98
5. Anime ID: 431, Genre: Adventure, Drama, Fantasy, Romance, Predicted Rating: 9.98

Errors for user 300 on top 5 recommendations:
RMSE: 1.9205
MSE: 3.6884
MAE: 1.9205

Fold 2: Top 5 anime recommendations for user 300 based on favorite genres:
1. Anime ID: 2001, Genre: Action, Adventure, Comedy, Mecha, Sci-Fi, Predicted Rating: 10.00
2. Anime ID: 431, Genre: Adventure, Drama, Fantasy, Romance, Predicted Rating: 9.81
3. Anime ID: 11061, Genre: Action, Adventure, Shounen, Super Power, Predicted Rating: 9.70
4. Anime ID: 11123, Genre: Comedy, Drama

In [None]:
# Function to print the chosen user and their favorite genres
def print_chosen_user_and_genres(user_id, favorite_genres):
    """Print the user id on one line and the comma-separated favorite genres on the next."""
    print(f"Chosen User ID: {user_id}")
    genre_text = ', '.join(favorite_genres)
    print(f"Favorite Genres: {genre_text}")

# Show which user the evaluation above picked, together with their favorite genres
print_chosen_user_and_genres(chosen_user, favorite_genres)


Chosen User ID: 300
Favorite Genres: Comedy, Romance, Ecchi, Harem, Action


In [None]:
# Function to get the animes rated by a specific user
def get_user_ratings(user_id):
    """Return the user's rows of the global ``cleaned_ratings`` joined with the global ``anime`` table.

    The join matches the ratings' ``anime_id`` column against the index of ``anime``.
    """
    belongs_to_user = cleaned_ratings['user_id'] == user_id
    return cleaned_ratings[belongs_to_user].merge(anime, left_on='anime_id', right_index=True)

# Function to get a specific user's rating for a specific anime
def get_user_anime_rating(user_id, anime_id):
    """Return the rows of ``get_user_ratings(user_id)`` whose ``anime_id`` equals ``anime_id``.

    The result is an empty frame when the user has no row for that anime.
    """
    rated_by_user = get_user_ratings(user_id)
    return rated_by_user[rated_by_user['anime_id'] == anime_id]

# All animes rated by one example user, joined with the catalogue details
example_user_id = 440 # id of the user to inspect
user_ratings = get_user_ratings(example_user_id)

# That user's rating row for one specific anime
example_anime_id = 4181 # id of the anime to look up
user_anime_rating = get_user_anime_rating(example_user_id, example_anime_id)

# Display the specific rating; an empty table means the user has no row for this anime
print(f"Rating by user {example_user_id} for anime {example_anime_id}:")
display(user_anime_rating)

Rating by user 440 for anime 4181:


Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members


In [None]:
def print_anime_details(anime_id, anime_df):
    """Print id, title, genre, type, episode count and rating of one catalogue entry.

    ``anime_df`` is looked up by index label; a "not found" message is printed
    instead when ``anime_id`` is not in the index.
    """
    # Guard clause: unknown ids are reported and nothing else is printed
    if anime_id not in anime_df.index:
        print(f"Anime with ID {anime_id} not found.")
        return

    entry = anime_df.loc[anime_id]
    print(f"Anime ID: {anime_id}")
    print(f"Title: {entry['name']}")
    print(f"Genre: {entry['genre']}")
    print(f"Type: {entry['type']}")
    print(f"Episodes: {entry['episodes']}")
    print(f"Rating: {entry['rating']}")

# Look up the catalogue entry of the example anime and print it
example_anime_id = 4181
print_anime_details(example_anime_id, anime)

Anime ID: 4181
Title: Clannad: After Story
Genre: Drama, Fantasy, Romance, Slice of Life, Supernatural
Type: TV
Episodes: 24
Rating: 9.06
