In [1]:
import torch
# Sanity check: report whether torch can see a CUDA device in this runtime.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)

CUDA available: True


In [2]:
REPO = "https://github.com/HenryNVP/anime_recsys.git"
!git clone $REPO

import os
%cd anime_recsys

!pip -q install -r requirements.txt

Cloning into 'anime_recsys'...
remote: Enumerating objects: 156, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 156 (delta 57), reused 100 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (156/156), 105.10 KiB | 15.01 MiB/s, done.
Resolving deltas: 100% (57/57), done.
/content/anime_recsys


In [4]:
import os, sys, json, pickle, numpy as np, pandas as pd

# Resolve the project layout relative to the current working directory.
PROJECT_ROOT = os.getcwd()  # adjust if needed
DATA_RAW, DATA_CLEAN, OUTPUTS = (
    os.path.join(PROJECT_ROOT, sub) for sub in ("data_raw", "data_clean", "outputs")
)

# Create the working directories up front; existing ones are left untouched.
for folder in (DATA_RAW, DATA_CLEAN, OUTPUTS):
    os.makedirs(folder, exist_ok=True)

# Make the project root importable so `recsys/` resolves as a package.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)
print("Paths:", {"data_raw": DATA_RAW, "data_clean": DATA_CLEAN, "outputs": OUTPUTS})

Project root: /content/anime_recsys
Paths: {'data_raw': '/content/anime_recsys/data_raw', 'data_clean': '/content/anime_recsys/data_clean', 'outputs': '/content/anime_recsys/outputs'}


In [None]:
# Upload anime.csv, rating.csv
os.makedirs("data_raw", exist_ok=True)

from google.colab import files
print("Upload anime.csv and rating.csv")
uploaded = files.upload()
for name in uploaded.keys():
    if name.endswith(".csv"):
        !mv $name data/
!ls -lh data || true

In [None]:
# Preprocess to data_clean/ (70/15/15)
!python scripts/preprocess.py --raw_dir data_raw --clean_dir data_clean --min_user_inter 5 --min_item_inter 5 --seed 42 --train_ratio 0.70 --val_ratio 0.15

In [None]:
# Show the preprocessing summary (truncated to 800 characters), then preview
# the first rows of the training split.
# The `with` block closes stats.json deterministically instead of leaving an
# open handle behind for the garbage collector.
with open(os.path.join(DATA_CLEAN, "stats.json")) as stats_file:
    stats = json.load(stats_file)
print(json.dumps(stats, indent=2)[:800])
pd.read_parquet(os.path.join(DATA_CLEAN, "train.parquet")).head()

## Baseline models

### Popularity model

In [None]:
# Preprocess data and run baseline popularity model
!python -m recsys.train --trainer popularity --data_dir data_clean --outputs outputs

### Item-based collaborative filtering

In [None]:
# Train itemknn model
!python -m recsys.train --trainer itemknn --data_dir data_clean --outputs outputs --max_neighbors 200

In [None]:
#@title Sweep k neighbors and plot (val)
!python scripts/tune_itemknn.py --outputs outputs --split val --eval_k 10 --max_k 200 --step 10 --csv_out itemknn_tuning.csv

In [None]:
# Pick the neighbor count from the sweep row with the highest NDCG@10.
tuning_results = pd.read_csv("itemknn_tuning.csv")
ranked = tuning_results.sort_values(by="NDCG@10", ascending=False)
top_row = ranked.iloc[0]
best_k = int(top_row["use_k"])
best_k

In [None]:
#@title Test ItemKNN with best_k
!python -m recsys.eval --model itemknn --outputs outputs --split test --k 10 --use_k_neighbors {best_k}

## NeuMF

In [None]:
#@title Train NeuMF
!python -m recsys.train --trainer neumf --data_dir data_clean --outputs outputs --epochs 8 --batch_size 131072 --neg_k 4 --emb_gmf 64 --emb_mlp 64 --mlp_layers 256,128,64 --patience 2 --k_eval 10

In [None]:
#@title Eval NeuMF on val and test
!python -m recsys.eval --model neumf --outputs outputs --split val --k 10
!python -m recsys.eval --model neumf --outputs outputs --split test --k 10

In [None]:
#@title Train Hybrid NeuMF (optional)
!python -m recsys.train --trainer hybrid --data_dir data_clean --outputs outputs --epochs 8 --batch_size 131072 --neg_k 4 --emb_gmf 32 --emb_mlp 32 --mlp_layers 256,128,64 --patience 2 --k_eval 10


In [None]:
#@title Eval Hybrid NeuMF
!python -m recsys.eval --model hybrid --outputs outputs --split val --k 10
!python -m recsys.eval --model hybrid --outputs outputs --split test --k 10


In [None]:
#@title NeuMF tuning sweep (small grid; early stopping on val NDCG@10)
!python scripts/tune_neumf.py \
  --data_dir data_clean --outputs outputs \
  --epochs 8 --patience 2 --k_eval 10 \
  --emb_gmf_grid 32,64 \
  --emb_mlp_grid 32,64 \
  --mlp_grid 256-128-64,512-256-128 \
  --lr_grid 0.003,0.001 \
  --negk_grid 2,4


## Compare all

In [None]:
#@title Compare popularity, itemknn(best_k), neumf (and hybrid if trained)
!python scripts/compare_all.py --outputs outputs --Ks 5,10,20 --use_k_neighbors {best_k}
