fastFMでアニメのレコメンドシステムを実装

dataset:
https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database?resource=download

In [2]:
pip install fastFM



In [3]:
pip install category_encoders



In [5]:
pwd

'/content'

In [4]:
import numpy as np
import pandas as pd

# Input files, relative to the working directory.
rating_csv = "./sample_data/rating.csv"
anime_csv = r"./sample_data/anime.csv"

# Per-user ratings; later cells read its user_id, anime_id and rating columns.
df_rating = pd.read_csv(rating_csv)
# Anime catalogue; later cells read its anime_id, name and members columns.
df_label = pd.read_csv(anime_csv)

In [5]:
# A cell only renders its last bare expression, so a lone `df_rating` line shows
# nothing. Pass both frames to display() to actually see both.
display(df_rating, df_label)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [6]:
# The full catalogue is too large to work with, so keep only popular anime:
# rows whose `members` value is strictly greater than 500000.
is_popular = df_label["members"].gt(500000)
df_label = df_label.loc[is_popular]

In [7]:
# Inner-join the ratings with the (filtered) anime table on anime_id and keep only
# the columns used later. Both inputs have a `rating` column, so pandas suffixes
# them: `rating_x` is the one coming from df_rating (the left side of the join).
df = (
    df_rating
    .merge(df_label, on="anime_id", how="inner")
    .loc[:, ["user_id", "anime_id", "name", "rating_x"]]
)


In [8]:
import category_encoders as ce
from sklearn.model_selection import train_test_split

# Aggregate each user's rating history: one row per user, one column per popular
# title. In this dataset a rating of -1 means "watched but did not rate", so those
# rows are excluded instead of being averaged in as if they were a score.
user_data = df[df["rating_x"] != -1].pivot_table(index="user_id", columns="name", values="rating_x")

# Rows to predict: only the first 10000 ratings (the join below is memory hungry),
# again without the -1 placeholders, which are not valid regression targets.
targets = df_rating.iloc[:10000, :]
targets = targets[targets["rating"] != -1]

# Attach every user's history to each of that user's ratings (inner join on user_id).
test_df = pd.merge(targets, user_data, on="user_id", how="inner")

# A row's history also contains the score of the very title the row is about,
# i.e. the target itself. Blank that entry so the model cannot read the answer
# straight from its own features.
title_by_id = df.drop_duplicates(subset="anime_id").set_index("anime_id")["name"]
for anime_id, title in title_by_id.items():
    if title in user_data.columns:
        test_df.loc[test_df["anime_id"] == anime_id, title] = float("nan")

# Drop the user id column and fill missing (unrated) entries with 0.
test_df = test_df.drop("user_id", axis=1).fillna(0)

# Target: the rating of the row; features: anime_id plus the user's history columns.
data_y = test_df["rating"]
data_X = test_df.drop(["rating"], axis=1)

In [9]:
# One-hot encode the anime_id column; every other feature column is passed through.
enc = ce.OneHotEncoder(cols=["anime_id"])
X, y = enc.fit_transform(data_X), data_y

# Hold out 33% of the rows for testing; the split is reproducible via the fixed seed.
split = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split

In [10]:
from fastFM import als
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Build the model: factorization-machine regressor from fastFM's `als` module.
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=8, l2_reg_w=0.5, l2_reg_V=0.5, random_state=1)

# Convert each split to a sparse matrix once and reuse it for fitting and scoring.
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)
fm.fit(X_train_sparse, y_train)

# Evaluation (mean absolute error).

## Fit on the training data
train_loss = mean_absolute_error(y_train, fm.predict(X_train_sparse))

## Error on the held-out test data
test_loss = mean_absolute_error(y_test, fm.predict(X_test_sparse))

# Report both numbers; assignments alone produce no cell output.
print(f"train MAE: {train_loss:.4f}")
print(f"test MAE:  {test_loss:.4f}")