Book Recommendation System - Matrix Factorization Training

In [1]:
# Attach Google Drive to this Colab runtime so the project files are reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Set the project path
PROJECT_PATH = "/content/drive/MyDrive/book_recommendation_project"
%cd {PROJECT_PATH}

/content/drive/MyDrive/book_recommendation_project


In [3]:
# Install required packages
!pip install torch pandas scipy scikit-learn tqdm
!apt-get install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (74.4 kB/s)
Selecting previously unselected package tree.
(Reading database ... 123597 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [4]:
# Display the project structure
# Shell escape: lists the current working directory as a tree, limited to
# two levels of depth (-L 2). Requires the `tree` binary to be installed.
!tree -L 2

[01;34m.[0m
├── [00mconfig.py[0m
├── [01;34mdata[0m
│   └── [01;34mprocessed[0m
├── [00mmatrix_factor.ipynb[0m
├── [01;34m__pycache__[0m
│   └── [00mconfig.cpython-310.pyc[0m
└── [01;34msrc[0m
    ├── [01;34mdata[0m
    ├── [01;34mevaluation[0m
    ├── [00m__init__.py[0m
    ├── [00mmatrix_factor_train.py[0m
    ├── [01;34mmodels[0m
    └── [00mneural_collab_train.py[0m

7 directories, 6 files


In [5]:
# Modify config.py to use the correct paths
#
# Rewrites the ROOT_DIR assignment in ./config.py so that it holds
# PROJECT_PATH as a literal instead of being derived from __file__.
# The file is only written when the original line is actually found, and the
# cell reports what happened otherwise, so re-running it is harmless.
import os  # not used by this cell; kept because it is a notebook-level import

config_file = 'config.py'
old_root_dir = "ROOT_DIR = os.path.dirname(os.path.abspath(__file__))"
# !r renders PROJECT_PATH as a valid Python string literal, so a quote or
# backslash in the path cannot produce a syntactically broken config.py.
new_root_dir = f"ROOT_DIR = {PROJECT_PATH!r}"

with open(config_file, 'r', encoding='utf-8') as file:
    config_content = file.read()

if old_root_dir in config_content:
    # Replace the ROOT_DIR definition
    config_content = config_content.replace(old_root_dir, new_root_dir)
    with open(config_file, 'w', encoding='utf-8') as file:
        file.write(config_content)
elif new_root_dir in config_content:
    # Already patched by a previous run of this cell.
    print(f"{config_file}: ROOT_DIR already set to {PROJECT_PATH!r}; nothing to do.")
else:
    # Neither form is present: leave the file untouched, but say so instead of
    # silently pretending the patch was applied.
    print(f"WARNING: no ROOT_DIR definition to replace in {config_file}; file left unchanged.")

In [6]:
# Verify the changes
# Shell escape: prints config.py from the current working directory so the
# rewritten ROOT_DIR line can be checked by eye.
!cat config.py

import os

# Project root directory
ROOT_DIR = '/content/drive/MyDrive/book_recommendation_project'

# Data directories
RAW_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'raw')
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# Data file paths
BOOKS_RATING_FILE = os.path.join(RAW_DATA_DIR, 'Books_rating.csv')
BOOKS_DATA_FILE = os.path.join(RAW_DATA_DIR, 'books_data.csv')

# Processed data file paths
TRAIN_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, 'train_data.csv')
TEST_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, 'test_data.csv')
USER_ITEM_MATRIX_FILE = os.path.join(PROCESSED_DATA_DIR, 'user_item_matrix.npz')

# New paths for encoder files
USER_ENCODER_FILE = os.path.join(PROCESSED_DATA_DIR, 'user_encoder.pkl')
BOOK_ENCODER_FILE = os.path.join(PROCESSED_DATA_DIR, 'book_encoder.pkl')

# Model parameters
EMBEDDING_DIM = 100
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 10

# Random seed for reproducibility
RANDOM_SEED = 42

# Evaluation metrics
TOP_K = 10  # For prec

In [7]:
# Run the training script
# Shell escape: starts src/matrix_factor_train.py in a separate `python`
# process; the path is relative to the current working directory, and the
# script's console log is streamed into this cell's output.
# NOTE(review): presumably the script reads its settings from config.py —
# the script itself is not visible in this notebook; confirm.
!python src/matrix_factor_train.py

Starting data loading...
Loading data...
Data loaded successfully.
Data loaded. Train shape: (600000, 21), Test shape: (150000, 21)
Number of users: 1008973, Number of items: 221998
Preparing tensors...
Tensors prepared.
Datasets created.
DataLoaders created.
Using device: cuda
Initializing model...
Model initialized.
Starting training loop...
Starting epoch 1/10
Training batch 1/9375
Training batch 101/9375
Training batch 201/9375
Training batch 301/9375
Training batch 401/9375
Training batch 501/9375
Training batch 601/9375
Training batch 701/9375
Training batch 801/9375
Training batch 901/9375
Training batch 1001/9375
Training batch 1101/9375
Training batch 1201/9375
Training batch 1301/9375
Training batch 1401/9375
Training batch 1501/9375
Training batch 1601/9375
Training batch 1701/9375
Training batch 1801/9375
Training batch 1901/9375
Training batch 2001/9375
Training batch 2101/9375
Training batch 2201/9375
Training batch 2301/9375
Training batch 2401/9375
Training batch 2501/9