In [1]:
import torch

# Report whether a CUDA device is visible to PyTorch in this runtime.
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: True


In [2]:
REPO = "https://github.com/HenryNVP/anime_recsys.git"
!git clone $REPO

import os
%cd anime_recsys

!pip -q install -r requirements.txt

Cloning into 'anime_recsys'...
remote: Enumerating objects: 119, done.[K
remote: Counting objects: 100% (119/119), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 119 (delta 41), reused 54 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (119/119), 60.35 KiB | 3.77 MiB/s, done.
Resolving deltas: 100% (41/41), done.
/content/anime_recsys


In [None]:
import json
import pickle
import shutil
import sys

import numpy as np
import pandas as pd

# Every working directory hangs off the current working directory (the repo checkout).
PROJECT_ROOT = os.getcwd()  # adjust if needed
DATA_RAW, DATA_CLEAN, OUTPUTS = (
    os.path.join(PROJECT_ROOT, sub) for sub in ("data_raw", "data_clean", "outputs")
)

# Create the directories up front; existing ones are left untouched.
for directory in (DATA_RAW, DATA_CLEAN, OUTPUTS):
    os.makedirs(directory, exist_ok=True)

# Make the project importable from the notebook without duplicating the entry.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)
print("Paths:", dict(data_raw=DATA_RAW, data_clean=DATA_CLEAN, outputs=OUTPUTS))

Project root: /content/anime_recsys
Paths: {'data_raw': '/content/anime_recsys/data_raw', 'data_clean': '/content/anime_recsys/data_clean', 'outputs': '/content/anime_recsys/outputs'}


In [9]:
from google.colab import files

# Colab does not overwrite an existing file: a repeated upload is saved under a new
# name such as "anime (4).csv". Moving "anime.csv" out of the working directory would
# therefore pick up a stale copy from an earlier run (or fail once it has been moved).
# Write the bytes returned by upload() instead, so data_raw/ always holds exactly what
# was selected in this run.
# NOTE(review): relies on upload() returning {selected file name: file content bytes}.
uploaded = files.upload()   # select anime.csv and rating.csv

os.makedirs("data_raw", exist_ok=True)
for required_name in ("anime.csv", "rating.csv"):
    if required_name not in uploaded:
        raise FileNotFoundError(
            f"{required_name} was not uploaded; received: {sorted(uploaded)}"
        )
    with open(os.path.join("data_raw", required_name), "wb") as raw_file:
        raw_file.write(uploaded[required_name])

Saving anime.csv to anime (4).csv
Saving rating.csv to rating (4).csv


'data_raw/rating.csv'

In [10]:
# Preprocess to data_clean/ (70/15/15)
# Reads the raw CSVs from data_raw/ and, per the summary the script prints, writes
# train/val/test parquet splits plus anime.csv, rating.csv, mappings.pkl and stats.json
# into data_clean/. The test share is whatever remains after train (0.70) and val (0.15).
# NOTE(review): --min_user_inter / --min_item_inter presumably drop users and items with
# fewer than 5 interactions — confirm in scripts/preprocess.py.
!python scripts/preprocess.py --raw_dir data_raw --clean_dir data_clean --min_user_inter 5 --min_item_inter 5 --seed 42 --train_ratio 0.70 --val_ratio 0.15


=== Preprocess Summary ===
Raw ratings:      1,677
Clean ratings:    1,655
Users (kept):     40
Items (kept):     199
Clean folder:     data_clean
- train.parquet   1,144
- val.parquet     229
- test.parquet    282
Files: anime.csv, rating.csv, mappings.pkl, stats.json
Done ✅


In [11]:
# Print the preprocessing summary (capped at 800 characters) and preview the train split.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open(os.path.join(DATA_CLEAN, "stats.json"), encoding="utf-8") as stats_file:
    stats = json.load(stats_file)
print(json.dumps(stats, indent=2)[:800])

# Last expression of the cell: rendered as a rich table by the notebook.
pd.read_parquet(os.path.join(DATA_CLEAN, "train.parquet")).head()

{
  "raw_interactions": 1677,
  "clean_interactions": 1655,
  "num_users": 40,
  "num_items": 199,
  "min_user_inter": 5,
  "min_item_inter": 5,
  "train_ratio": 0.7,
  "val_ratio": 0.15,
  "test_ratio": 0.15000000000000005,
  "seed": 42,
  "notes": "episodes coerced to numeric (NaN kept), duplicates removed"
}


Unnamed: 0,u,i
0,0,29
1,0,34
2,0,44
3,0,23
4,0,26


## Baseline models

### Popularity model

In [12]:
# Fit the popularity baseline on the already-preprocessed data in data_clean/.
# (Preprocessing itself happens in the earlier scripts/preprocess.py cell.)
# The trainer reports saving its item counts to outputs/popularity.npy.
!python -m recsys.train --trainer popularity --data_dir data_clean --outputs outputs

✅ Saved popularity counts to outputs/popularity.npy


### Item-based collaborative filtering

In [14]:
# Train itemknn model
# The trainer reports writing outputs/itemknn_topk.npz and ui_csr.npz.
# NOTE(review): --max_neighbors 200 presumably caps how many neighbours are stored per
# item; with only 199 items in this dataset that keeps every neighbour — confirm.
!python -m recsys.train --trainer itemknn --data_dir data_clean --outputs outputs --max_neighbors 200

ItemKNN (cosine, top-N):   0% 0/1 [00:00<?, ?it/s]ItemKNN (cosine, top-N): 100% 1/1 [00:00<00:00, 51.82it/s]
✅ Saved ItemKNN to outputs/itemknn_topk.npz and ui_csr.npz


In [None]:
# Sweep k neighbors and plot (val)
# Evaluates HR@10 / NDCG@10 on the validation split for use_k_neighbors = 10, 20, ...
# (step 10, --max_k 200).
# NOTE(review): the following cell reads itemknn_tuning.csv from the working directory,
# presumably written by this script — confirm the output location.
!python -m scripts.tune_itemknn --data_dir data_clean --outputs outputs --split val --eval_k 10 --max_k 200 --step 10



use_k_neighbors=10
Eval ItemKNN (val):   0% 0/229 [00:00<?, ?it/s]Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 4870.35it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=20
Eval ItemKNN (val):   0% 0/229 [00:00<?, ?it/s]Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 4960.49it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=30
Eval ItemKNN (val):   0% 0/229 [00:00<?, ?it/s]Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 4804.16it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=40
Eval ItemKNN (val):   0% 0/229 [00:00<?, ?it/s]Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 5023.57it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=50
Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 4947.13it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=60
Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 5012.37it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=70
Eval ItemKNN (val): 100% 229/229 [00:00<00:00, 5065.37it/s]
HR@10=0.1310  NDCG@10=0.0564

use_k_neighbors=80
Eval ItemKNN (val): 1

In [24]:
# Choose the neighbourhood size with the highest validation NDCG@10.
# Ties are common on small data (every k in the sweep can score identically), and a
# single-key sort leaves the order of tied rows unspecified. Break ties explicitly in
# favour of the smallest use_k so the choice is deterministic and the cheapest.
df = pd.read_csv("itemknn_tuning.csv")
if df.empty:
    raise ValueError("itemknn_tuning.csv contains no sweep results")

best_k = int(
    df.sort_values(by=["NDCG@10", "use_k"], ascending=[False, True]).iloc[0]["use_k"]
)
best_k

10

In [None]:
# Test ItemKNN with best_k
# {best_k} is substituted by IPython with the Python variable chosen from the
# validation sweep, so the cell that defines best_k must have run first.
!python -m recsys.eval --model itemknn --outputs outputs --split test --k 10 --use_k_neighbors {best_k}

Eval ItemKNN (test):   0% 0/282 [00:00<?, ?it/s]Eval ItemKNN (test): 100% 282/282 [00:00<00:00, 4928.12it/s]
HR@10=0.1702  NDCG@10=0.0797


## NeuMF

In [None]:
# Train NeuMF
# Short 2-epoch run; the trainer logs train BCE and validation HR@10 / NDCG@10 after each
# epoch and saves the model whenever it reports a new best.
# NOTE(review): --neg_k presumably sets the number of sampled negatives per positive and
# --patience the early-stopping tolerance — confirm in recsys/train.py.
!python -m recsys.train --trainer neumf --data_dir data_clean --outputs outputs --epochs 2 --batch_size 131072 --neg_k 4 --emb_gmf 64 --emb_mlp 64 --mlp_layers 256,128,64 --patience 2 --k_eval 10

Build train pairs: 100% 1144/1144 [00:00<00:00, 15665.92it/s]
[NeuMF] epoch 1/2: 100% 1/1 [00:00<00:00,  1.38it/s]
  train_bce=0.6927  val HR@10=0.0786  NDCG@10=0.0332
  ✅ Saved best NeuMF
[NeuMF] epoch 2/2: 100% 1/1 [00:00<00:00,  3.24it/s]
  train_bce=0.6774  val HR@10=0.1048  NDCG@10=0.0453
  ✅ Saved best NeuMF


In [None]:
# Eval NeuMF on val and test
# Pass the same architecture parameters as used during training
# (--emb_gmf, --emb_mlp and --mlp_layers below mirror the values in the training cell).
!python -m recsys.eval --model neumf --outputs outputs --data_dir data_clean --split val --k 10 --emb_gmf 64 --emb_mlp 64 --mlp_layers 256,128,64
!python -m recsys.eval --model neumf --outputs outputs --data_dir data_clean --split test --k 10 --emb_gmf 64 --emb_mlp 64 --mlp_layers 256,128,64

Eval NeuMF (val): 100% 229/229 [00:00<00:00, 1162.17it/s]
HR@10=0.1048  NDCG@10=0.0453
Eval NeuMF (test): 100% 282/282 [00:00<00:00, 1298.46it/s]
HR@10=0.1596  NDCG@10=0.0859


In [None]:
# Train Hybrid NeuMF (optional)
# Same hyper-parameters as the plain NeuMF run, with --trainer switched to hybrid.
# NOTE(review): the hybrid model presumably adds item features on top of NeuMF — confirm
# in recsys/models/hybrid_neumf.py.
!python -m recsys.train --trainer hybrid --data_dir data_clean --outputs outputs --epochs 2 --batch_size 131072 --neg_k 4 --emb_gmf 64 --emb_mlp 64 --mlp_layers 256,128,64 --patience 2 --k_eval 10


Build train pairs: 100% 1144/1144 [00:00<00:00, 15709.21it/s]
[HybridNeuMF] epoch 1/2: 100% 1/1 [00:00<00:00,  1.38it/s]
  train_bce=0.7373  val HR@10=0.1092  NDCG@10=0.0566
  ✅ Saved best HybridNeuMF
[HybridNeuMF] epoch 2/2: 100% 1/1 [00:00<00:00,  3.16it/s]
  train_bce=0.6236  val HR@10=0.1092  NDCG@10=0.0555


In [None]:
# Eval Hybrid NeuMF
# Evaluates the saved hybrid model on the validation and test splits at k=10.
# NOTE(review): unlike the NeuMF eval cell, no --data_dir or architecture flags are passed
# here; this presumably relies on recsys.eval defaults matching the training run — confirm.
!python -m recsys.eval --model hybrid --outputs outputs --split val --k 10
!python -m recsys.eval --model hybrid --outputs outputs --split test --k 10


Eval HybridNeuMF (val): 100% 229/229 [00:00<00:00, 1025.18it/s]
HR@10=0.1092  NDCG@10=0.0566
Eval HybridNeuMF (test): 100% 282/282 [00:00<00:00, 1165.05it/s]
HR@10=0.0993  NDCG@10=0.0471


In [None]:
# NeuMF tuning sweep (small grid; early stopping on val NDCG@10)
# Grid size: 2 (emb_gmf) x 2 (emb_mlp) x 2 (mlp layers) x 2 (lr) x 2 (neg_k) = 32 configs,
# matching the "Total configs: 32" line the script prints.
# Each config trains for at most 8 epochs and stops early with patience 2.
!python -m scripts.tune_neumf_hybrid \
  --data_dir data_clean --outputs outputs \
  --epochs 8 --patience 2 --k_eval 10 \
  --emb_gmf_grid 32,64 \
  --emb_mlp_grid 32,64 \
  --mlp_grid 256-128-64,512-256-128 \
  --lr_grid 0.003,0.001 \
  --negk_grid 2,4


Total configs: 32
Build pairs (neg_k=2): 100% 1144/1144 [00:00<00:00, 14524.80it/s]
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 1/8: 100% 1/1 [00:00<00:00,  1.96it/s]
  ✅ improved; saved best
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 2/8: 100% 1/1 [00:00<00:00,  9.77it/s]
  ✅ improved; saved best
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 3/8: 100% 1/1 [00:00<00:00, 10.08it/s]
  ✅ improved; saved best
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 4/8: 100% 1/1 [00:00<00:00,  9.59it/s]
  ✅ improved; saved best
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 5/8: 100% 1/1 [00:00<00:00, 10.31it/s]
  ✅ improved; saved best
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 6/8: 100% 1/1 [00:00<00:00,  9.56it/s]
[gmf32_mlp32_layers256-128-64_lr0.003_neg2] epoch 7/8: 100% 1/1 [00:00<00:00, 10.06it/s]
⏹ early stopping (patience=2)
Build pairs (neg_k=4): 100% 1144/1144 [00:00<00:00, 15155.62it/s]
[gmf32_mlp32_layers256-128-64_lr0.003_neg4] epoch 1/8: 100% 1/1 [00:00<0

## Compare all

In [45]:
#@title Compare popularity, itemknn(best_k), neumf (and hybrid if trained)
# Reports HR@K / NDCG@K on the test split for K = 5, 10 and 20.
# {best_k} is substituted by IPython with the value selected from the ItemKNN sweep,
# so that cell must have run in this kernel first.
!python -m scripts.compare_all --outputs outputs --Ks 5,10,20 --use_k_neighbors {best_k}



=== K=5 on test ===
HR@5=0.0922  NDCG@5=0.0680
Eval ItemKNN (test):   0% 0/282 [00:00<?, ?it/s]Eval ItemKNN (test): 100% 282/282 [00:00<00:00, 5295.27it/s]
HR@5=0.0816  NDCG@5=0.0515
Eval NeuMF (test): 100% 282/282 [00:00<00:00, 1230.65it/s]
HR@5=0.0851  NDCG@5=0.0552
Eval HybridNeuMF (test): 100% 282/282 [00:00<00:00, 1777.08it/s]
HR@5=0.0567  NDCG@5=0.0336

=== K=10 on test ===
HR@10=0.1667  NDCG@10=0.0921
Eval ItemKNN (test): 100% 282/282 [00:00<00:00, 5044.18it/s]
HR@10=0.1702  NDCG@10=0.0797
Eval NeuMF (test): 100% 282/282 [00:00<00:00, 2163.57it/s]
HR@10=0.1418  NDCG@10=0.0740
Eval HybridNeuMF (test): 100% 282/282 [00:00<00:00, 1889.37it/s]
HR@10=0.0993  NDCG@10=0.0471

=== K=20 on test ===
HR@20=0.2447  NDCG@20=0.1118
Eval ItemKNN (test): 100% 282/282 [00:00<00:00, 5165.62it/s]
HR@20=0.2695  NDCG@20=0.1048
Eval NeuMF (test): 100% 282/282 [00:00<00:00, 2151.18it/s]
HR@20=0.2340  NDCG@20=0.0972
Eval HybridNeuMF (test): 100% 282/282 [00:00<00:00, 1897.87it/s]
HR@20=0.1702  NDCG@

In [62]:
# Bring the local checkout up to date with the remote before committing.
!git pull

Already up to date.


In [63]:
# Show uncommitted changes and how far the local branch is ahead of / behind origin.
!git status

On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [64]:
# List the configured remotes with their fetch/push URLs.
!git remote -v

origin	https://github.com/HenryNVP/anime_recsys.git (fetch)
origin	https://github.com/HenryNVP/anime_recsys.git (push)


In [76]:
# Commit the code folders and push them to GitHub with a personal access token.
# The token is requested with masked input and is redacted from everything printed.
# NOTE: Entering your token directly in an input field might not be as secure as using Colab Secrets.

import getpass
import subprocess
from urllib.parse import quote

repo_url = "https://github.com/HenryNVP/anime_recsys.git"
username = "HenryNVP"  # Your GitHub username


def _run_git(*args, secrets=()):
    """Run ``git <args>`` and return ``(exit_code, combined_output)``.

    The command is executed without a shell, so arguments are passed verbatim.
    stdout and stderr are concatenated, every non-empty string in ``secrets`` is
    replaced by ``***``, and the result is printed when it is not empty.
    """
    proc = subprocess.run(["git", *args], capture_output=True, text=True)
    output = (proc.stdout + proc.stderr).strip()
    for secret in secrets:
        if secret:
            output = output.replace(secret, "***")
    if output:
        print(output)
    return proc.returncode, output


# Ask the user to enter their GitHub token with masked input
token = getpass.getpass("Please enter your GitHub personal access token: ")

if token:
    # Build the authenticated URL from repo_url (single source of truth); the token is
    # percent-encoded so unusual characters cannot break the URL.
    quoted_token = quote(token, safe="")
    authenticated_repo_url = repo_url.replace(
        "https://", f"https://{username}:{quoted_token}@", 1
    )
    hidden = (token, quoted_token)

    add_rc, _ = _run_git("add", "notebooks", "recsys", "scripts")

    # `git diff --cached --quiet` exits 0 when nothing is staged and 1 when there are
    # staged changes. Exit codes are used instead of matching output text, which missed
    # e.g. "1 file changed" and could match unrelated messages.
    staged_rc, _ = _run_git("diff", "--cached", "--quiet")
    if add_rc != 0:
        commit_ok = False
    elif staged_rc == 0:
        # Nothing new to commit, but earlier local commits may still need pushing.
        print("Nothing to commit; pushing any existing local commits.")
        commit_ok = True
    else:
        commit_rc, _ = _run_git("commit", "-m", "colab run")
        commit_ok = commit_rc == 0

    if commit_ok:
        # Use the authenticated URL for pushing; redact the token from git's output.
        push_rc, _ = _run_git("push", authenticated_repo_url, "main", secrets=hidden)
        if push_rc == 0:
            print("\n✅ Push successful!")
        else:
            print("\n❌ Push failed.")
    else:
        print("Commit failed. Skipping push.")
else:
    print("No GitHub token entered. Skipping push.")

Please enter your GitHub personal access token: ··········
[main 32aa4be] colab run
 21 files changed, 11338 deletions(-)
 delete mode 100644 .gitignore
 delete mode 100644 LICENSE
 delete mode 100644 README.md
 delete mode 100644 anime (1).csv
 delete mode 100644 anime (2).csv
 delete mode 100644 anime (3).csv
 delete mode 100644 anime (4).csv
 delete mode 100644 data_clean/anime.csv
 delete mode 100644 data_clean/rating.csv
 delete mode 100644 data_clean/stats.json
 delete mode 100644 data_clean/test.parquet
 delete mode 100644 data_clean/train.parquet
 delete mode 100644 data_clean/val.parquet
 delete mode 100644 data_raw/anime.csv
 delete mode 100644 data_raw/rating.csv
 delete mode 100644 itemknn_tuning.csv
 delete mode 100644 rating (1).csv
 delete mode 100644 rating (2).csv
 delete mode 100644 rating (3).csv
 delete mode 100644 rating (4).csv
 delete mode 100644 requirements.txt
Enumerating objects: 3, done.
Counting objects:  33% (1/3)
Counting objects:  66% (2/3)
Counting obje

In [67]:
# Confirm the working directory; the git and path-relative cells assume the repo root.
!pwd

/content/anime_recsys


In [71]:

# (Re)write the repository's .gitignore.
# The data section also covers the directories this notebook actually produces
# (data_raw/, data_clean/), the CSVs uploaded into the repo root (including Colab's
# "name (N).csv" duplicates) and the ItemKNN sweep results, so that datasets and
# generated artifacts are not committed by accident.
gitignore_text = """\
# python
__pycache__/
*.pyc
.venv/
.env


# data & artifacts
/data/
/data_mini/
/data_raw/
/data_clean/
/anime*.csv
/rating*.csv
/itemknn_tuning.csv
/checkpoints/
/outputs/
/output_mini/
/results/
/scripts/make_mini.py
/kaggle.json
*.pt
*.pkl
*.npz
*.npy


# notebooks
.ipynb_checkpoints/
"""

with open(".gitignore", "w", encoding="utf-8") as f:
    f.write(gitignore_text)

In [73]:
# List the repo root to see which generated / uploaded files are lying around.
!ls

'anime (1).csv'   data_raw	      'rating (1).csv'	 recsys
'anime (2).csv'   itemknn_tuning.csv  'rating (2).csv'	 requirements.txt
'anime (3).csv'   LICENSE	      'rating (3).csv'	 scripts
'anime (4).csv'   notebooks	      'rating (4).csv'
 data_clean	  outputs	       README.md


In [75]:
!git rm -r --cached .


rm '.gitignore'
rm 'LICENSE'
rm 'README.md'
rm 'anime (1).csv'
rm 'anime (2).csv'
rm 'anime (3).csv'
rm 'anime (4).csv'
rm 'data_clean/anime.csv'
rm 'data_clean/rating.csv'
rm 'data_clean/stats.json'
rm 'data_clean/test.parquet'
rm 'data_clean/train.parquet'
rm 'data_clean/val.parquet'
rm 'data_raw/anime.csv'
rm 'data_raw/rating.csv'
rm 'itemknn_tuning.csv'
rm 'notebooks/anime_recsys.ipynb'
rm 'notebooks/eda.ipynb'
rm 'rating (1).csv'
rm 'rating (2).csv'
rm 'rating (3).csv'
rm 'rating (4).csv'
rm 'recsys/data.py'
rm 'recsys/eval.py'
rm 'recsys/features.py'
rm 'recsys/metrics.py'
rm 'recsys/models/hybrid_neumf.py'
rm 'recsys/models/itemknn.py'
rm 'recsys/models/knn.py'
rm 'recsys/models/neumf.py'
rm 'recsys/models/popularity.py'
rm 'recsys/train.py'
rm 'requirements.txt'
rm 'scripts/check_features.py'
rm 'scripts/compare_all.py'
rm 'scripts/preprocess.py'
rm 'scripts/tune_itemknn.py'
rm 'scripts/tune_neumf_hybrid.py'


In [78]:
import os
import zipfile
from google.colab import files

# Define the folders to archive
folders_to_archive = ["notebooks", "recsys", "scripts"]
archive_name = "project_folders"
archive_path = f"{archive_name}.zip"

# Build the zip from the listed folders only. Archiving the whole working directory
# would also pack datasets, model outputs and git metadata, and would contradict the
# message printed below.
skipped_dirs = {"__pycache__", ".ipynb_checkpoints"}
archived_folders = []
with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for folder in folders_to_archive:
        if not os.path.isdir(folder):
            print(f"⚠️ Skipping missing folder: {folder}")
            continue
        archived_folders.append(folder)
        for dirpath, dirnames, filenames in os.walk(folder):
            # Prune cache directories in place so os.walk does not descend into them.
            dirnames[:] = sorted(d for d in dirnames if d not in skipped_dirs)
            for filename in sorted(filenames):
                # Paths are relative to the repo root, so the folder layout is kept.
                archive.write(os.path.join(dirpath, filename))

# Download the zip archive
files.download(archive_path)

print(f"✅ Downloaded '{archive_path}' containing the folders: {', '.join(archived_folders)}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Downloaded 'project_folders.zip' containing the folders: notebooks, recsys, scripts
