In [1]:
# Collaborative Filtering for Anime Data (Custom Numpy Implementation)

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load data
ratings_df = pd.read_csv(r"C:\Users\MANAV\PycharmProjects\PythonProject\PinnacleEndTerm\data\ratings.csv")
anime_df = pd.read_csv(r"C:\Users\MANAV\PycharmProjects\PythonProject\PinnacleEndTerm\data\anime.csv")
users_df = pd.read_csv(r"C:\Users\MANAV\PycharmProjects\PythonProject\PinnacleEndTerm\data\users.csv")

In [3]:
# Filter necessary data (reduce size for faster training)
filtered_df = ratings_df[(ratings_df['adjusted_rating'] > 0)].sample(n=5000, random_state=42)

In [4]:
# Encode usernames and anime IDs
user_to_index = {u: i for i, u in enumerate(filtered_df['username'].unique())}
anime_to_index = {a: i for i, a in enumerate(filtered_df['anime_id'].unique())}

filtered_df['user_index'] = filtered_df['username'].map(user_to_index)
filtered_df['anime_index'] = filtered_df['anime_id'].map(anime_to_index)

num_users = len(user_to_index)
num_anime = len(anime_to_index)

In [5]:
# Create rating matrix Y and indicator matrix R
Y = np.zeros((num_anime, num_users))
R = np.zeros((num_anime, num_users))

for row in filtered_df.itertuples():
    Y[row.anime_index, row.user_index] = row.adjusted_rating
    R[row.anime_index, row.user_index] = 1

In [6]:
# Select genre columns
genre_columns = [col for col in anime_df.columns if col in [
    'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem',
    'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
    'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo',
    'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural',
    'Thriller', 'Vampire']]

In [7]:
# Build genre matrix for anime features
anime_df['anime_index'] = anime_df['anime_id'].map(anime_to_index)
genre_matrix = anime_df.set_index('anime_index').reindex(range(num_anime))[genre_columns].fillna(0).astype(float)

In [8]:
# Use genre features as initial X, optionally add learnable noise features
genre_features = genre_matrix.values
extra_features = 5
X_learned = np.random.randn(num_anime, extra_features)
X = np.hstack([genre_features, X_learned])
num_features = X.shape[1]

In [9]:
# Initialize user features and bias
W = np.random.randn(num_users, num_features)
b = np.zeros((1, num_users))

In [10]:
# Cost function
def cofi_cost_func(X, W, b, Y, R, lambda_):
    J = 0
    num_movies, num_users = Y.shape
    for i in range(num_movies):
        for j in range(num_users):
            if R[i, j]:
                pred = np.dot(W[j, :], X[i, :]) + b[0, j]
                J += (pred - Y[i, j]) ** 2
    J = J / 2
    J += (lambda_ / 2) * (np.sum(W ** 2) + np.sum(X ** 2))
    return J

In [11]:
# Gradient Descent function
def gradient_descent(X, W, b, Y, R, lambda_, alpha, num_iters):
    for _ in range(num_iters):
        for i in range(Y.shape[0]):
            for j in range(Y.shape[1]):
                if R[i, j]:
                    error = np.dot(W[j, :], X[i, :]) + b[0, j] - Y[i, j]
                    for k in range(num_features):
                        X[i, k] -= alpha * (error * W[j, k] + lambda_ * X[i, k])
                        W[j, k] -= alpha * (error * X[i, k] + lambda_ * W[j, k])
                    b[0, j] -= alpha * error
    return X, W, b

In [12]:
# Train the model
lambda_ = 0.1
alpha = 0.005
num_iters = 30
X, W, b = gradient_descent(X, W, b, Y, R, lambda_, alpha, num_iters)

In [13]:
# Predictions
predictions = X @ W.T + b

In [14]:
# Evaluate on known entries
y_true = Y[R == 1]
y_pred = predictions[R == 1]

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

MAE: 0.1227
MSE: 0.0187
RMSE: 0.1368


In [15]:
import pickle

In [16]:
# Save the model components
with open("anime_cf_model.pkl", "wb") as f:
    pickle.dump({
        'X': X,
        'W': W,
        'b': b,
        'user_to_index': user_to_index,
        'anime_to_index': anime_to_index,
        'genre_columns': genre_columns
    }, f)

print("Model saved to anime_cf_model.pkl")

Model saved to anime_cf_model.pkl


Testing the model with pickle

In [17]:
# Load the model components
with open("anime_cf_model.pkl", "rb") as f:
    model_data = pickle.load(f)

X = model_data['X']
W = model_data['W']
b = model_data['b']
user_to_index = model_data['user_to_index']
anime_to_index = model_data['anime_to_index']

In [18]:
# Invert anime index for mapping back to anime IDs
inv_anime_index = {v: k for k, v in anime_to_index.items()}

In [19]:
def recommend_anime_for_user(username, X, W, b, user_to_index, anime_to_index, top_n=10):
    if username not in user_to_index:
        raise ValueError("User not found in training data.")

    user_idx = user_to_index[username]
    user_pred = X @ W[user_idx].T + b[0, user_idx]
    top_anime_indices = np.argsort(-user_pred)[:top_n]
    recommended_ids = [inv_anime_index[i] for i in top_anime_indices]

    recommended_titles = anime_df[anime_df['anime_id'].isin(recommended_ids)][['anime_id', 'title']]
    # Sort titles in the order of prediction ranking
    recommended_titles['rank'] = recommended_titles['anime_id'].apply(lambda x: recommended_ids.index(x))
    return recommended_titles.sort_values('rank')[['title']]

In [20]:
import random
# Pick a random user
random_user = random.choice(list(user_to_index.keys()))
print(f"Generating recommendations for random user: {random_user}")

# Get recommendations
recommendations = recommend_anime_for_user(random_user, X, W, b, user_to_index, anime_to_index)
print("Top recommendations:")
print(recommendations.to_string(index=False))

Generating recommendations for random user: leSange
Top recommendations:
                        title
       Yojouhan Shinwa Taikei
               Yuri!!! on Ice
Higurashi no Naku Koro ni Kai
             Kotonoha no Niwa
      Shigatsu wa Kimi no Uso
           Maoyuu Maou Yuusha
             Gatchaman Crowds
              NHK ni Youkoso!
                   Death Note
       Katekyo Hitman Reborn!
