# Collaborative Filtering Model without factor models

In [11]:
import pandas as pd
import math

In [12]:
# Load the data
anime_df = pd.read_csv('assignment_2_data/assignment_2_anime.csv')
test_df = pd.read_csv('assignment_2_data/assignment_2_ratings_test.csv')
train_df = pd.read_csv('assignment_2_data/assignment_2_ratings_train.csv')

# Mean rating per anime, per user and overall, all computed from the training set only
anime_mean_ratings = train_df.groupby('anime_id')['rating'].mean()
user_mean_ratings = train_df.groupby('user_id')['rating'].mean()
global_mean_rating = train_df['rating'].mean()

# Per-row anime mean for the test set. `map` yields NaN for anime that never
# appear in the training set (a direct `anime_mean_ratings[anime_id]` lookup
# would raise KeyError there), so those rows fall back to the global mean.
anime_fallback = test_df['anime_id'].map(anime_mean_ratings).fillna(global_mean_rating)

# Fill the missing values in test_df with the average rating of each anime
test_df['rating'] = test_df['rating'].fillna(anime_fallback)

# Predict the ratings for each user & anime in the test set:
# the user's mean rating when the user is known, otherwise the anime's mean.
# Kept as a plain list so downstream element-wise arithmetic is positional.
predictions = (
    test_df['user_id']
    .map(user_mean_ratings)
    .fillna(anime_fallback)
    .tolist()
)

In [13]:
# Mean squared error (and its root) between the test ratings and the baseline predictions
residuals = test_df['rating'] - predictions
mse = residuals.pow(2).mean()
rmse = math.sqrt(mse)
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

MSE: 1.8632767692499541
RMSE: 1.365018962963502


# Singular Value Decomposition Model

In [2]:
# Import necessary libraries
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error


In [3]:
# Reload the raw CSVs so this section does not depend on frames modified above
train_df = pd.read_csv('assignment_2_data/assignment_2_ratings_train.csv')
test_df = pd.read_csv('assignment_2_data/assignment_2_ratings_test.csv')
anime_df = pd.read_csv('assignment_2_data/assignment_2_anime.csv')

# Split the test frame into its (user, anime) keys and its rating column
test_df_user = test_df[['user_id', 'anime_id']]
test_rating = test_df['rating']

In [4]:
# Define the rating scale based on anime review scores 1-10.
# The Reader is passed to Dataset.load_from_df below for both the train and test frames.
reader = Reader(rating_scale=(1,10))

In [5]:
# Wrap the rating frames as surprise Datasets; load_from_df is given the
# columns in the order user, item, rating.
rating_columns = ['user_id', 'anime_id', 'rating']
train_data = Dataset.load_from_df(train_df[rating_columns], reader)
test_data = Dataset.load_from_df(test_df[rating_columns], reader)

In [6]:
# Split the data into training and validation sets:
# 20% of the training ratings are held out for validation, with a fixed seed
# so the split is the same on every run.
trainset, valset = train_test_split(train_data, test_size=0.2, random_state=42)

In [7]:
# Training model using SVD algorithm.
# SVD initialises its factors randomly and shuffles nothing else, so an explicit
# seed (matching the one used for the train/validation split) makes the
# reported RMSE/MSE reproducible across kernel restarts.
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe1ba6622e0>

In [8]:
# Score the fitted model on the held-out validation ratings.
# accuracy.rmse / accuracy.mse each print their metric and return it.
val_predictions = algo.test(valset)
val_rmse, val_mse = accuracy.rmse(val_predictions), accuracy.mse(val_predictions)

RMSE: 1.1704
MSE: 1.3698


In [9]:
# Score the fitted model on the test ratings.
# A surprise Dataset cannot be passed to algo.test directly, so every test
# rating is first loaded as a "trainset" and then converted into a testset.
full_test_trainset = test_data.build_full_trainset()
full_testset = full_test_trainset.build_testset()
test_predictions = algo.test(full_testset)
test_rmse, test_mse = accuracy.rmse(test_predictions), accuracy.mse(test_predictions)

RMSE: 1.1714
MSE: 1.3721


In [10]:
# Report RMSE first, then MSE, each for the validation and the test set
for label, score in (
    ("Validation RMSE score:", val_rmse),
    ("Test RMSE score:", test_rmse),
    ("Validation MSE score:", val_mse),
    ("Test MSE score:", test_mse),
):
    print(label, score)

Validation RMSE score: 1.1703766064543302
Test RMSE score: 1.1713597838288727
Validation MSE score: 1.369781400935554
Test MSE score: 1.3720837431716233


# Matrix Factorisation Model (Runs very slowly)

In [4]:
import numpy as np
import pandas as pd
import pylab as plt
import copy
import time
import random
import surprise
from surprise import SVD, Reader, Dataset
import math
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [5]:
# Reload the raw CSVs for this section
train_df = pd.read_csv('assignment_2_data/assignment_2_ratings_train.csv')
test_df = pd.read_csv('assignment_2_data/assignment_2_ratings_test.csv')
anime_df = pd.read_csv('assignment_2_data/assignment_2_anime.csv')

# Keep 30% of the training ratings for fitting; the other 70% go to train_test_df
train_df, train_test_df = train_test_split(train_df, test_size=0.7, random_state=42)

# Split the test frame into its (user, anime) keys and its rating column
test_df_user = test_df[['user_id', 'anime_id']]
test_rating = test_df['rating']

# Average duplicate (user, anime) ratings so the pivot has unique cells, then
# build the user x anime rating matrix (NaN where a user has not rated an anime)
train_df = (
    train_df
    .groupby(['user_id', 'anime_id'])
    .mean()
    .reset_index()
)
ratings_matrix = train_df.pivot(index='user_id', columns='anime_id', values='rating')

# Turn the comma-separated genre string into a list of genre names and
# order the anime by id
anime_df['genre'] = anime_df['genre'].str.split(', ')
anime_df = anime_df.sort_values(by=['anime_id'])

In [6]:
# Anime with no genre information get an empty genre list
missing_genre = anime_df['genre'].isnull()
anime_df.loc[missing_genre, 'genre'] = anime_df.loc[missing_genre, 'genre'].apply(lambda _: [])

In [7]:
# One-hot encode the genre lists: one row per anime in anime_df, one column per genre
encoder = MultiLabelBinarizer()
encoded_genres = encoder.fit_transform(anime_df['genre'])

# Build the genre matrix with exactly one row per column of ratings_matrix, in
# the same order. A plain boolean mask over anime_df silently produces fewer
# (or extra) rows when a rated anime is missing from, or duplicated in,
# anime_df, which shifts every later row against the rating matrix.
# Rated anime without genre data get an all-zero row.
genre_by_anime = pd.DataFrame(encoded_genres, index=anime_df['anime_id'].to_numpy())
genre_by_anime = genre_by_anime[~genre_by_anime.index.duplicated()]
encoded_new = (
    genre_by_anime
    .reindex(ratings_matrix.keys(), fill_value=0)
    .reset_index(drop=True)  # positional 0..n-1 index, as the factorisation expects
)

In [8]:
def matrix_factorization(ratings, users, genres, K, steps=5000, alpha=0.0002, beta=0.02):
    '''
    Factorise a rating matrix into user and genre factors with stochastic
    gradient descent over the observed (non-NaN) ratings.

    ratings: Ratings Matrix, a DataFrame with one row per user and one column
             per anime; NaN marks a missing rating
    users: User Matrix, a 2-D NumPy array with one row per row of `ratings`;
           it is updated in place and also returned
    genres: Genres Matrix, a DataFrame with one row per column of `ratings`
            (same order); it is not modified, a float copy is trained instead
    K: Number of Genres (only the first K factor columns are updated)
    steps: maximum number of passes over the observed ratings
    alpha: learning rate
    beta: regularization parameter

    Returns (users, trained_genres.T), where the second item is a DataFrame
    carrying the transposed labels of `genres`.
    '''
    rating_values = np.asarray(ratings, dtype=float)

    # Train on a float copy: writing through `genres.iloc[i][k] = ...` is
    # chained assignment and does not reliably update the DataFrame, and an
    # integer one-hot matrix could not hold fractional factors anyway.
    genre_values = np.array(genres, dtype=float)

    # Positions of the observed ratings, anime-major then user, i.e. the same
    # visiting order as looping over columns and then over rows.
    anime_idx, user_idx = np.nonzero(~np.isnan(rating_values.T))
    observed = rating_values[user_idx, anime_idx]

    for step in range(steps):
        for a, u, rating in zip(anime_idx, user_idx, observed):
            # prediction error for this (user, anime) pair
            eij = rating - np.dot(users[u, :], genre_values[a])

            # gradient step with learning rate alpha and regularization beta;
            # the genre update deliberately uses the already-updated user row
            users[u, :K] += alpha * (2 * eij * genre_values[a, :K] - beta * users[u, :K])
            genre_values[a, :K] += alpha * (2 * eij * users[u, :K] - beta * genre_values[a, :K])

        # regularized squared error over all observed ratings
        predicted = (users[user_idx] * genre_values[anime_idx]).sum(axis=1)
        e = np.sum((observed - predicted) ** 2)
        e = e + (beta / 2) * (
            np.sum(users[user_idx, :K] ** 2) + np.sum(genre_values[anime_idx, :K] ** 2)
        )

        # stopping threshold
        if e < 0.001:
            break

    trained_genres = pd.DataFrame(genre_values, index=genres.index, columns=genres.columns)
    return users, trained_genres.T

In [9]:
# ratings_matrix has one row per user and one column per anime.
# (len(ratings_matrix[1]) would give the number of users again, and fails with
# KeyError when anime_id 1 is not in the sampled training data.)
num_user, num_anime = ratings_matrix.shape

k = encoded_genres.shape[1] # Number of genres

users = np.zeros((num_user, k)) # User feature matrix, one row per user

# Only 10 passes over the ratings: the factorisation is slow on the full matrix
nP, nQ = matrix_factorization(ratings_matrix, users, encoded_new, k, steps = 10)

KeyboardInterrupt: 

In [None]:
# Reconstruct the rating matrix as the product of the two matrices returned by
# matrix_factorization (user factors times transposed genre factors)
nR = np.dot(nP, nQ)