In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from pmf import pmf
from lda import lda_model

Load dataset

In [None]:
# Anime catalogue. Its 'rating' column is renamed to 'avg_rating' so that it
# does not clash with the per-user 'rating' column when the tables are merged.
anime_df = pd.read_csv('data/anime.csv').rename(columns={'rating': 'avg_rating'})

# User ratings table; later cells read its 'user_id', 'anime_id' and 'rating'
# columns.
rating_df = pd.read_csv('data/rating.csv')

Sample a random subset of the ratings to keep the computation tractable

In [None]:
# Down-sample the ratings to keep the experiments cheap to run.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (replace=False), so the request is capped at the size of the frame; with a
# ratings file larger than N_RATING_SAMPLES the result is unchanged.
N_RATING_SAMPLES = 100_000
rating_df = rating_df.sample(
    n=min(N_RATING_SAMPLES, len(rating_df)),
    random_state=42,  # fixed seed so the same subset is drawn on every run
)

Prepare the data for the PMF model

In [None]:
# Drop every rating equal to -1 (presumably a "not rated" marker -- confirm
# against the dataset documentation).
rating_df = rating_df.loc[rating_df['rating'].ne(-1)]

# Attach the anime metadata to each remaining rating; the inner join keeps only
# ratings whose anime_id exists in the catalogue.
combined_df = anime_df.merge(rating_df, on='anime_id', how='inner')

# Distinct ids as plain Python lists, together with their counts.
uniq_users = combined_df['user_id'].unique().tolist()
n_users = len(uniq_users)
uniq_animes = combined_df['anime_id'].unique().tolist()
n_animes = len(uniq_animes)

# Reproducible 75/25 split: sample the training rows, everything else is test.
df_copy = combined_df.copy()
train_set = df_copy.sample(frac=0.75, random_state=0)
test_set = df_copy.drop(index=train_set.index)

Prepare the data for the LDA model

In [None]:
# Expose the genre text under the name 'description'
# (presumably the column lda_model reads as its text input -- confirm in lda.py).
anime_df = anime_df.rename(columns={'genre': 'description'})

# Every anime / user id that appears in either of the PMF splits.
used_anime_ids = set(train_set['anime_id']) | set(test_set['anime_id'])
used_user_ids = set(train_set['user_id']) | set(test_set['user_id'])

# Anime that occur in the PMF data and have a non-missing description.
is_used_anime = anime_df['anime_id'].isin(used_anime_ids)
has_description = anime_df['description'].notna()
anime_filtered = anime_df[is_used_anime & has_description]

# Ratings restricted to those anime and to the users seen in the PMF data.
rating_on_kept_anime = rating_df['anime_id'].isin(anime_filtered['anime_id'])
rating_by_kept_user = rating_df['user_id'].isin(used_user_ids)
rating_for_lda = rating_df[rating_on_kept_anime & rating_by_kept_user]

# One row per rating, carrying the description of the rated anime.
lda_df = anime_filtered[['anime_id', 'description']].merge(
    rating_for_lda, on='anime_id', how='inner'
)

# Reproducible 75/25 split, built the same way as the PMF split.
lda_copy = lda_df.copy()
lda_train_set = lda_copy.sample(frac=0.75, random_state=0)
lda_test_set = lda_copy.drop(index=lda_train_set.index)

In [None]:
# Build the PMF model, sized to the users and anime present in the merged data.
# NOTE(review): 'n_dimesions' is kept exactly as spelled in the call; presumably
# it matches the parameter name in pmf.py -- confirm before "correcting" it.
pmf_model = pmf(
    n_users=n_users,
    n_animes=n_animes,
    lambda_U=5,
    lambda_V=5,
    uniq_users=uniq_users,
    uniq_animes=uniq_animes,
    n_dimesions=10,
)

In [None]:
# Fit the PMF model for 20 epochs. train() returns three histories: the values
# labelled 'MAP' when plotted, plus the RMSE on the train and test sets.
log_ps, rmse_train, rmse_test = pmf_model.train(
    train_set=train_set,
    test_set=test_set,
    n_epochs=20,
)

In [None]:
# Side-by-side training curves: the MAP series on the left, train/test RMSE on
# the right. The figure is bound to `fig` rather than `_` so IPython's
# last-output variable is not overwritten.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# plt.title() labels only the current axes (the right-hand panel here), so the
# overall heading is attached to the figure instead.
fig.suptitle('Training results')

ax1.plot(np.arange(len(log_ps)), log_ps, label='MAP')
ax1.legend()

ax2.plot(np.arange(len(rmse_train)), rmse_train, label='RMSE train')
ax2.plot(np.arange(len(rmse_test)), rmse_test, label='RMSE test')
ax2.legend()

plt.show()

In [None]:
# Report the final PMF error on each split: evaluate, then print, one split at
# a time.
for label, split in (
    ('RMSE of training set:', train_set),
    ('RMSE of testing set:', test_set),
):
    print(label, pmf_model.evaluate(split))

In [None]:
# LDA-based recommender configured with 20 topics and max_iter=20.
lda_recommender = lda_model(
    n_topics=20,
    max_iter=20,
)

In [None]:
# Fit the LDA recommender on the LDA-specific splits (the ones that carry the
# 'description' column).
lda_recommender.train(
    train_set=lda_train_set,
    test_set=lda_test_set,
)

In [None]:
# Evaluate the fitted LDA recommender on the train split first, then the test
# split, and print both scores.
lda_rmse_train, lda_rmse_test = (
    lda_recommender.evaluate(split) for split in (lda_train_set, lda_test_set)
)
print('RMSE Train:', lda_rmse_train)
print('RMSE Test:', lda_rmse_test)

In [None]:
# Bar chart comparing the LDA model's RMSE on the train and test splits.
fig, ax = plt.subplots(figsize=(8, 6))
x = ['Train', 'Test']
y = [lda_rmse_train, lda_rmse_test]
bars = ax.bar(x, y, color=['skyblue', 'salmon'])

# Leave 10% headroom above the taller bar so its value label fits.
ax.set(
    ylim=(0, max(y) * 1.1),
    title="RMSE for LDA Model (Topic-based)",
    ylabel="RMSE",
)
ax.bar_label(bars, fmt='%.2f')

ax.grid(axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Sweep the number of LDA topics and record train/test RMSE for each setting.
#
# The sweep uses lda_train_set / lda_test_set, the same splits the single LDA
# model is trained and evaluated on. Those frames carry the 'description'
# column; the PMF splits (train_set / test_set) were built before 'genre' was
# renamed and therefore do not.
topic_counts = [5, 10, 20, 30, 40]
train_rmse_list = []
test_rmse_list = []

for n in topic_counts:
    # A fresh model per topic count so no fitted state carries over.
    model = lda_model(n_topics=n, max_iter=100)
    model.train(lda_train_set, lda_test_set)
    train_rmse_list.append(model.evaluate(lda_train_set))
    test_rmse_list.append(model.evaluate(lda_test_set))

In [None]:
# Line plot of train and test RMSE against the number of LDA topics.
sweep_fig, sweep_ax = plt.subplots(figsize=(10, 6))
sweep_ax.plot(topic_counts, train_rmse_list, label='Train RMSE')
sweep_ax.plot(topic_counts, test_rmse_list, label='Test RMSE')
sweep_ax.set_xlabel('Number of Topics')
sweep_ax.set_ylabel('RMSE')
sweep_ax.set_title('RMSE vs Number of Topics (LDA)')
sweep_ax.legend()
sweep_ax.grid(True)
plt.show()