## Imports

In [1]:
import os
import pandas as pd
import torch

In [2]:
# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    print(f"CUDA is available with {torch.cuda.device_count()} GPU(s).")

    # Iterate through available GPUs and print their details
    for i in range(torch.cuda.device_count()):
        print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
        props = torch.cuda.get_device_properties(i)
        print(f"  Compute Capability: {props.major}.{props.minor}")
        print(f"  Total Memory: {props.total_memory / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Using CPU.")

# To get the current device PyTorch is configured to use (e.g., for tensors)
current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nCurrent default device for tensors: {current_device}")

CUDA is available with 1 GPU(s).

GPU 0: NVIDIA GeForce GTX 1650
  Compute Capability: 7.5
  Total Memory: 4.00 GB

Current default device for tensors: cuda


## Data Exploration

In [3]:
data_dir = os.path.join("data", "books","raw")
users_dir = os.path.join(data_dir, "Users.csv")
ratings_dir = os.path.join(data_dir, "Ratings.csv")
books_dir = os.path.join(data_dir, "Books.csv")

In [4]:
df_users = pd.read_csv(users_dir)
df_ratings = pd.read_csv(ratings_dir)
df_books = pd.read_csv(books_dir, low_memory=False)

In [5]:
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
df_users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [7]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [8]:
df_users.describe()

Unnamed: 0,User-ID,Age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


In [9]:
df_users["User-ID"].nunique()

278858

In [10]:
df_users.shape

(278858, 3)

In [11]:
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [12]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [13]:
df_books.describe()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
count,271360,271360,271358,271360,271358,271360,271360,271357
unique,271360,242135,102022,118,16807,271044,271044,271041
top,195153448,Selected Poems,Agatha Christie,2002,Harlequin,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/225307649X.0...
freq,1,27,632,17627,7535,2,2,2


In [14]:
df_books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [15]:
df_books["ISBN"].nunique()

271360

In [17]:
df_books["Book-Author"].unique()

array(['Mark P. O. Morford', 'Richard Bruce Wright', "Carlo D'Este", ...,
       'David Biggs', 'Teri Sloat', 'Christopher  Biffle'],
      shape=(102023,), dtype=object)

In [18]:
df_books["Publisher"].unique()

array(['Oxford University Press', 'HarperFlamingo Canada',
       'HarperPerennial', ..., 'Tempo', 'Life Works Books', 'Connaught'],
      shape=(16808,), dtype=object)

In [20]:
df_books[df_books["Publisher"] == "Oxford University Press"].head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
357,195086295,What a Wonderful World: A Lifetime of Recordings,Bob Thiele,1995,Oxford University Press,http://images.amazon.com/images/P/0195086295.0...,http://images.amazon.com/images/P/0195086295.0...,http://images.amazon.com/images/P/0195086295.0...
397,198320264,Julius Caesar (Oxford School Shakespeare),William Shakespeare,2001,Oxford University Press,http://images.amazon.com/images/P/0198320264.0...,http://images.amazon.com/images/P/0198320264.0...,http://images.amazon.com/images/P/0198320264.0...
521,192815318,Cranford (The World's Classics),Elizabeth Gaskell,1982,Oxford University Press,http://images.amazon.com/images/P/0192815318.0...,http://images.amazon.com/images/P/0192815318.0...,http://images.amazon.com/images/P/0192815318.0...
817,198604025,How Not to Say What You Mean: A Dictionary of ...,R. W. Holder,2003,Oxford University Press,http://images.amazon.com/images/P/0198604025.0...,http://images.amazon.com/images/P/0198604025.0...,http://images.amazon.com/images/P/0198604025.0...


In [None]:
df_books.shape

(271360, 8)

In [None]:
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [None]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [None]:
df_ratings.describe()

Unnamed: 0,User-ID,Book-Rating
count,1149780.0,1149780.0
mean,140386.4,2.86695
std,80562.28,3.854184
min,2.0,0.0
25%,70345.0,0.0
50%,141010.0,0.0
75%,211028.0,7.0
max,278854.0,10.0


In [None]:
df_ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [None]:
df_ratings["User-ID"].nunique()

105283

In [None]:
df_ratings["ISBN"].nunique()

340556

In [None]:
df_ratings.shape

(1149780, 3)

## Data Cleaning

In [None]:
df_books_cleaned = df_books.dropna()

In [None]:
df_books_cleaned.shape

(271353, 8)

In [None]:
df_ratings_cleaned_user = df_ratings[(df_ratings["User-ID"] >= 1) & (df_ratings["User-ID"] <= 278858)]

In [None]:
df_ratings_cleaned_user.shape

(1149780, 3)

In [None]:

valid_isbns = set(df_books_cleaned.ISBN)
df_ratings_cleaned_books = df_ratings_cleaned_user[df_ratings_cleaned_user["ISBN"].isin(valid_isbns)]

In [None]:
df_ratings_cleaned_books.shape

(1031128, 3)

In [None]:
df_ratings_cleaned_books.reset_index(inplace=True, drop=True)

In [None]:
df_ratings_cleaned_books.head(10)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
5,276733,2080674722,0
6,276744,038550120X,7
7,276746,0425115801,0
8,276746,0449006522,0
9,276746,0553561618,0


In [None]:
df_ratings_cleaned_books["Book-Rating"].value_counts()

Book-Rating
0     647291
8      91803
10     71224
7      66401
9      60776
5      45355
6      31687
4       7617
3       5118
2       2375
1       1481
Name: count, dtype: int64

## Data Preprocessing

In [10]:
import json
import os

author_encoder_dir = os.path.join("data", "books", "processed", "author_encoder.json")
publisher_encoder_dir = os.path.join("data", "books", "processed", "publisher_encoder.json")

In [None]:
with open(author_encoder_dir) as f:
    author_tokenizer = json.load(f)

In [None]:
with open(author_encoder_dir, "w") as f:
    json.dump(author_tokenizer, f, indent=2)

In [11]:
with open(publisher_encoder_dir) as f:
    publisher_encoder = json.load(f)

In [12]:
with open(publisher_encoder_dir, "w") as f:
    json.dump(publisher_encoder, f, indent=2)

In [2]:
import os
import torch

In [3]:
data = torch.load(os.path.join("data", "books", "processed", "graph.pt"), weights_only=False)

In [4]:
data.node_items

<bound method HeteroData.node_items of HeteroData(
  user={ x=[278858, 384] },
  book={ x=[271356, 387] },
  (user, rates, book)={
    edge_index=[2, 1031132],
    edge_attr=[1031132, 1],
  },
  (book, rev_rates, user)={
    edge_index=[2, 1031132],
    edge_attr=[1031132, 1],
  }
)>

In [5]:
x_dict = {ntype: data[ntype].x for ntype in data.node_types}

In [11]:
x_dict

{'user': tensor([[ 0.1397, -0.0587,  0.0500,  ..., -0.0511, -0.0218,  0.0576],
         [ 0.1021, -0.0035, -0.0734,  ..., -0.0033,  0.0481,  0.0693],
         [ 0.0464,  0.0170, -0.0566,  ..., -0.0160, -0.0526, -0.0423],
         ...,
         [ 0.0159, -0.0044,  0.0020,  ..., -0.0326,  0.0167,  0.0328],
         [ 0.0228, -0.0321,  0.0312,  ...,  0.0540, -0.0238, -0.0140],
         [ 0.0601, -0.0011,  0.0112,  ..., -0.0253, -0.0271, -0.0007]]),
 'book': tensor([[ 1.0000e+00,  1.0000e+00,  2.0020e+03,  ...,  2.2009e-02,
           6.2922e-02,  2.6392e-02],
         [ 2.0000e+00,  2.0000e+00,  2.0010e+03,  ..., -4.1043e-02,
          -3.6608e-02,  8.6906e-02],
         [ 3.0000e+00,  3.0000e+00,  1.9910e+03,  ..., -1.4034e-02,
          -8.5957e-03,  2.8013e-02],
         ...,
         [ 1.7166e+04,  3.0300e+02,  2.0040e+03,  ...,  8.7461e-02,
           4.9498e-02,  4.3295e-02],
         [ 8.1200e+02,  1.0000e+00,  1.9960e+03,  ...,  6.5374e-03,
           3.1408e-02, -1.2539e-02],
   

In [12]:
edge_index_dict = {etype: data[etype].edge_index for etype in data.edge_types}

In [13]:
edge_index_dict

{('user',
  'rates',
  'book'): tensor([[276724, 276725, 276726,  ..., 276705, 276708, 276720],
         [  2966, 225812,  11053,  ...,  52540,  15978,  56814]]),
 ('book',
  'rev_rates',
  'user'): tensor([[  2966, 225812,  11053,  ...,  52540,  15978,  56814],
         [276724, 276725, 276726,  ..., 276705, 276708, 276720]])}