Book Recommendation System - Neural Collaborative Filtering (NCF) Training

In [1]:
# Attach Google Drive to the Colab VM so files stored there are reachable by path
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Set the project path
PROJECT_PATH = "/content/drive/MyDrive/book_recommendation_project"
%cd {PROJECT_PATH}

/content/drive/MyDrive/book_recommendation_project


In [3]:
# Install required packages
!pip install torch pandas scipy scikit-learn tqdm
!apt-get install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (83.1 kB/s)
Selecting previously unselected package tree.
(Reading database ... 123597 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [4]:
# Display the project structure
!tree -L 2

[01;34m.[0m
├── [00mbest_ncf_model.pth[0m
├── [00mconfig.py[0m
├── [01;34mdata[0m
│   └── [01;34mprocessed[0m
├── [00mmatrix_factor.ipynb[0m
├── [00mneural_collab.ipynb[0m
├── [01;34m__pycache__[0m
│   └── [00mconfig.cpython-310.pyc[0m
└── [01;34msrc[0m
    ├── [01;34mdata[0m
    ├── [01;34mevaluation[0m
    ├── [00m__init__.py[0m
    ├── [00mmatrix_factor_train.py[0m
    ├── [01;34mmodels[0m
    ├── [00mneural_collab_train.py[0m
    └── [01;34m__pycache__[0m

8 directories, 8 files


In [5]:
# Modify config.py to use the correct paths
import os

CONFIG_FILE = 'config.py'
# The file-relative definition in config.py that this cell swaps for the Drive path
DYNAMIC_ROOT_DIR = "ROOT_DIR = os.path.dirname(os.path.abspath(__file__))"

with open(CONFIG_FILE, 'r') as file:
    config_content = file.read()

# Replace the ROOT_DIR definition.
# repr() produces a valid, correctly escaped Python string literal, so a path
# containing quotes or backslashes cannot corrupt config.py.
new_root_dir = f"ROOT_DIR = {PROJECT_PATH!r}"

if DYNAMIC_ROOT_DIR in config_content:
    config_content = config_content.replace(DYNAMIC_ROOT_DIR, new_root_dir)
    # Only rewrite the file when something actually changed
    with open(CONFIG_FILE, 'w') as file:
        file.write(config_content)
    print(f"Updated ROOT_DIR in {CONFIG_FILE}")
elif new_root_dir in config_content:
    # Re-running the cell after a successful patch is a no-op
    print(f"{CONFIG_FILE} already points at {PROJECT_PATH}; nothing to do")
else:
    # Neither the original nor the patched line is present: surface it instead of
    # silently continuing with whatever ROOT_DIR the file currently defines.
    print(f"WARNING: no known ROOT_DIR definition found in {CONFIG_FILE}; file left unchanged")

In [6]:
# Print config.py verbatim so its current ROOT_DIR definition can be inspected
with open('config.py') as config_file:
    print(config_file.read(), end='')

import os

# Project root directory
ROOT_DIR = '/content/drive/MyDrive/book_recommendation_project'

# Data directories
RAW_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'raw')
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# Data file paths
BOOKS_RATING_FILE = os.path.join(RAW_DATA_DIR, 'Books_rating.csv')
BOOKS_DATA_FILE = os.path.join(RAW_DATA_DIR, 'books_data.csv')

# Processed data file paths
TRAIN_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, 'train_data.csv')
TEST_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, 'test_data.csv')
USER_ITEM_MATRIX_FILE = os.path.join(PROCESSED_DATA_DIR, 'user_item_matrix.npz')

# New paths for encoder files
USER_ENCODER_FILE = os.path.join(PROCESSED_DATA_DIR, 'user_encoder.pkl')
BOOK_ENCODER_FILE = os.path.join(PROCESSED_DATA_DIR, 'book_encoder.pkl')

# Model parameters
EMBEDDING_DIM = 100
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 10

# Random seed for reproducibility
RANDOM_SEED = 42

# Evaluation metrics
TOP_K = 10  # For prec

In [7]:
# `main` from src/neural_collab_train.py is the training entry point; it is aliased
# to `train_model` so the call below reads naturally.
from src.neural_collab_train import main as train_model

# Run the training and get the model
# NOTE(review): in the recorded run, evaluation MSE was lowest at epoch 2 (1.3415) and
# rose on every later epoch while train loss kept falling — looks like overfitting;
# consider early stopping or fewer epochs.
# NOTE(review): NDCG@10 is reported as exactly 1.0000 on every epoch, which is
# suspicious — verify the metric implementation.
# NOTE(review): the recorded run ended with KeyboardInterrupt during epoch 7, so in that
# session this assignment never completed and `trained_model` was left undefined.
trained_model = train_model()

Loading data...
Data loaded successfully.
Number of users: 1008973, Number of items: 221998
Preparing tensors...
Tensors prepared.
Datasets created.
DataLoaders created.
Using device: cuda
Initializing model...
Model initialized.
Starting training loop...
Epoch [1/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 1.4144
Evaluating...




Evaluation metrics:
MSE: 1.3660
RMSE: 1.1688
MAE: 0.8937
NDCG@10: 1.0000
Saving best model...

Epoch [2/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 1.2575
Evaluating...




Evaluation metrics:
MSE: 1.3415
RMSE: 1.1582
MAE: 0.8776
NDCG@10: 1.0000
Saving best model...

Epoch [3/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 1.0524
Evaluating...




Evaluation metrics:
MSE: 1.3651
RMSE: 1.1684
MAE: 0.8329
NDCG@10: 1.0000

Epoch [4/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 0.8435
Evaluating...


                                                                

Evaluation metrics:
MSE: 1.4250
RMSE: 1.1937
MAE: 0.8637
NDCG@10: 1.0000

Epoch [5/10]




Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 0.6680
Evaluating...




Evaluation metrics:
MSE: 1.4807
RMSE: 1.2168
MAE: 0.8373
NDCG@10: 1.0000

Epoch [6/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

Train Loss: 0.5359
Evaluating...




Evaluation metrics:
MSE: 1.5279
RMSE: 1.2361
MAE: 0.8649
NDCG@10: 1.0000

Epoch [7/10]


Training:   0%|          | 0/9375 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Ensure <PROJECT_PATH>/models is present; an already-existing directory is not an error
models_dir = os.path.join(PROJECT_PATH, 'models')
os.makedirs(name=models_dir, exist_ok=True)

In [None]:
import datetime

import torch  # imported here because no earlier cell brings torch into scope

# Save the model
# `trained_model` is only bound if the training cell ran to completion; an interrupted
# run leaves it undefined, so fail with an actionable message instead of a NameError.
if globals().get('trained_model') is None:
    raise RuntimeError(
        "trained_model is not available - the training cell must finish "
        "(it was not interrupted) before the model can be saved."
    )

# This notebook does not show whether main() returns a wrapper exposing the network
# as `.model` or the network itself; use `.model` when present, else the object.
network = getattr(trained_model, 'model', trained_model)

# `datetime` is the module here, so the class must be reached as datetime.datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f'trained_NCF_model_{timestamp}.pth'
model_save_path = os.path.join(models_dir, model_filename)
torch.save(network.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")