In [1]:
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import csr_matrix
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
anime_data_cleaned = pd.read_csv('csv/2020/anime_2020_cleaned.csv')
rating_data = pd.read_csv('csv/2020/rating_2020.csv')

In [21]:
anime_name = pd.read_csv('csv/2020/anime_2020_name.csv')

In [3]:
counts = rating_data['user_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(counts[counts >= 200].index)]
rating_data.rename(columns={'anime_id':'MAL_ID'}, inplace=True)

In [4]:
# exclude anime from rating data if the id is not in anime_data_cleaned
rating_data = rating_data[rating_data['MAL_ID'].isin(anime_data_cleaned['MAL_ID'])]

In [5]:
anime_rating_data = anime_data_cleaned.merge(rating_data, on='MAL_ID', how='inner')

## SVD collaborative filtering 

In [6]:
anime_ratings = anime_rating_data[['user_id', 'MAL_ID', 'rating']]

### Split data into train and test

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets based on the users
train_data, test_data = train_test_split(rating_data, test_size=0.2, random_state=42)

In [8]:
from scipy.sparse import csr_matrix
user_item_matrix = train_data.pivot_table(index='user_id', columns='MAL_ID', values='rating').fillna(0)
train_data_sparse = csr_matrix(user_item_matrix.fillna(0).values)

In [None]:
%pip install scikit-surprise

In [11]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

### Predict rating funtion

In [26]:
def predict_ratings_svd(user_id, svd_algo):
    predicted_ratings = []
    anime_ids = test_data.loc[test_data['user_id'] == user_id]['MAL_ID'].values
    for anime_id in anime_ids:
        prediction = svd_algo.predict(user_id, anime_id)
        predicted_ratings.append(prediction.est)
    return predicted_ratings

In [13]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data[['user_id', 'MAL_ID', 'rating']], reader)

### Fitting SVD Model

In [14]:
# svd_algo = SVD()
# svd_algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16fe8d520>

Save model

In [None]:
import pickle

In [33]:
# with open('svd_algo.pkl', 'wb') as file:
#     pickle.dump(svd_algo, file)

Load model

In [34]:
with open('models/svd_algo.pkl', 'rb') as file:
    loaded_svd_algo = pickle.load(file)

### Predict ratings with SVD

In [17]:
user_id = test_data['user_id'].sample(1).values[0]
user_id

342921

In [35]:
predicted_ratings = predict_ratings_svd(user_id, loaded_svd_algo)
predicted_ratings

[6.8122146082050055,
 8.138606986311284,
 7.348231835924189,
 9.01550361292686,
 9.621289597597505,
 8.549552609433622,
 7.76763416648424,
 7.851417643733848,
 7.352412167221387,
 7.58355467681487,
 8.199906781283664,
 6.742911811081626,
 9.041983516243471,
 5.375859867800789,
 9.193915970481903,
 7.190840992810774,
 6.811445743626689,
 6.792720081885085,
 5.252062695171045,
 8.132641953138261,
 7.991117904070524,
 6.543084230342891,
 7.354125557116518,
 8.678595838638751,
 7.36015079992048,
 8.259671619451444,
 5.444265530892357,
 8.875389654687812,
 6.294931631528147,
 4.2925777940090395,
 8.21385442854357,
 8.335699834830455,
 8.206733608207054,
 7.455093992602238,
 6.739850498231444,
 7.7681086285594265,
 8.373065503133786,
 8.302529972177602,
 7.600018129110988,
 7.731943796785893,
 7.95154958382408,
 8.103076356772211,
 8.624899420015822,
 8.948434584136164,
 5.358074042076838,
 7.544170121506271,
 6.4416915936595,
 6.74756792120563,
 8.205973838012277,
 7.356321478675884,
 8.476

In [28]:
len(predicted_ratings)

137

In [29]:
actual_ratings = test_data.loc[test_data['user_id']==user_id]['rating'].values
actual_ratings

array([10, 10,  9,  9,  9,  8,  8,  7,  7,  8,  8,  6,  9, 10,  6,  7,  4,
        6,  4, 10,  6,  7,  5,  9,  8, 10,  6, 10,  7,  4,  7,  9,  7,  7,
        7,  9, 10, 10,  6,  7, 10, 10,  8,  7,  4,  7, 10,  8, 10,  6,  9,
        7, 10,  8,  8,  9,  6,  7,  7,  8,  4,  5,  8,  8,  8,  5,  7,  6,
        7,  7,  5,  8, 10,  6,  4,  5,  8,  8,  6, 10,  7,  8,  9,  9,  5,
        9,  8,  6,  6,  4,  6,  6,  8,  7,  7,  6,  8,  8,  6,  3,  8,  7,
        8,  5,  5,  8,  7,  9,  6,  8,  6,  7,  7,  4,  8,  8,  8,  8,  8,
        7,  5,  8,  7,  9,  7,  8, 10, 10, 10, 10,  8,  7,  7, 10,  7,  8,
        7])

In [30]:
actual_ratings.shape

(137,)

### Calculate MAE

In [31]:
def calculate_mae(predicted_ratings, actual_ratings):
    return np.mean(np.abs(predicted_ratings - actual_ratings))

In [32]:
calculate_mae(predicted_ratings, actual_ratings)

1.1394314274006974

In [36]:
def predict_single_rating_svd(user_id, anime_id, svd_algo):
    prediction = svd_algo.predict(user_id, anime_id)
    predicted_rating = prediction.est
    return predicted_rating

In [37]:
def compute_mae_random_samples(data, user_item_matrix, n_samples=100, svd_algo=loaded_svd_algo):
    mae = 0

    for _ in range(n_samples):
        # Randomly sample a user_id and anime_id pair from the data
        sample = data.sample()
        user_id = sample['user_id'].values[0]
        anime_id = sample['MAL_ID'].values[0]

        # Make sure the user has rated the anime
        while user_item_matrix.loc[user_id, anime_id] == 0:
            sample = data.sample()
            user_id = sample['user_id'].values[0]
            anime_id = sample['MAL_ID'].values[0]

        # Predict the rating using the modified function
        predicted_rating = predict_single_rating_svd(user_id, anime_id, svd_algo)

        # Get the actual rating from the sampled data
        actual_rating = sample['rating'].values[0]

        # Update the mean absolute error
        mae += abs(predicted_rating - actual_rating)

    # Calculate the mean absolute error
    mae /= n_samples

    return mae

In [38]:
mae = compute_mae_random_samples(rating_data, user_item_matrix, n_samples=100)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.5708898630526883
