In [1]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
#import matplotlib.pyplot as plt 
#import seaborn as sns 
from torch.nn import Conv1d, MaxPool1d, Linear, Dropout, BCEWithLogitsLoss
from sklearn.metrics import roc_auc_score, average_precision_score

import torch
import networkx as nx
import  torch_geometric
import numpy as np
import dill as pickle

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [None]:
url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')

In [None]:
p = ""
ratings = pd.read_csv(p+'BX-Book-Ratings.csv', sep=';', encoding='latin-1')
users = pd.read_csv(p+'BX-Users.csv', sep=';', encoding='latin-1')
books = pd.read_csv(p+'BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines="skip")

  books = pd.read_csv(p+'BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines="skip")


### Data Preprocessing -> Data Generation

In [4]:
ratings_filtered = ratings.loc[ratings["Book-Rating"] >= 8]
#sns.countplot(x =ratings_filtered["Book-Rating"])
ratings_filtered = ratings_filtered.loc[ratings_filtered['ISBN'].isin(books['ISBN'].unique()) & ratings_filtered['User-ID'].isin(users['User-ID'].unique())]

# Mappings
mapping_user = { user_id: index for index, user_id in enumerate(ratings_filtered["User-ID"].unique())}
mapping_item = { isbn_id: index for index, isbn_id in enumerate(ratings_filtered["ISBN"].unique())}

df_mapping_user = pd.DataFrame()
df_mapping_user["user_id"] = mapping_user.keys()
df_mapping_user["user_id_mapped"] = mapping_user.values()

df_mapping_item = pd.DataFrame()
df_mapping_item["isbn_id"] = mapping_item.keys()
df_mapping_item["isbn_id_mapped"] = mapping_item.values()

print(f'df_mapping_user: ')
print(f'{df_mapping_user.head()}')
print("=="*10)
print(f'df_mapping_item: ')
print(f'{df_mapping_item.head()}')

ratings_filtered_m = ratings_filtered.merge(df_mapping_user, left_on = "User-ID", right_on="user_id", how = "left")
ratings_filtered_m = ratings_filtered_m.merge(df_mapping_item, left_on = "ISBN", right_on="isbn_id", how = "left")
ratings_filtered_m.head()

df_mapping_user: 
   user_id  user_id_mapped
0   276747               0
1   276751               1
2   276754               2
3   276762               3
4   276772               4
df_mapping_item: 
      isbn_id  isbn_id_mapped
0  0060517794               0
1  0671537458               1
2  0679776818               2
3  3596218098               3
4  0684867621               4


Unnamed: 0,User-ID,ISBN,Book-Rating,user_id,user_id_mapped,isbn_id,isbn_id_mapped
0,276747,60517794,9,276747,0,60517794,0
1,276747,671537458,9,276747,0,671537458,1
2,276747,679776818,8,276747,0,679776818,2
3,276751,3596218098,8,276751,1,3596218098,3
4,276754,684867621,8,276754,2,684867621,4


In [5]:
ratings_filtered_m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223807 entries, 0 to 223806
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   User-ID         223807 non-null  int64 
 1   ISBN            223807 non-null  object
 2   Book-Rating     223807 non-null  int64 
 3   user_id         223807 non-null  int64 
 4   user_id_mapped  223807 non-null  int64 
 5   isbn_id         223807 non-null  object
 6   isbn_id_mapped  223807 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 12.0+ MB


In [6]:
grouped = ratings_filtered_m.groupby(["isbn_id"]).mean(numeric_only=True)
grouped = grouped.sort_values(['Book-Rating'], ascending=False)
grouped.to_csv("ISBNS_grouped.csv")
grouped

Unnamed: 0_level_0,User-ID,Book-Rating,user_id,user_id_mapped,isbn_id_mapped
isbn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9971400162,114865.0,10.0,114865.0,19984.0,57432.0
0001821326,201017.0,10.0,201017.0,34204.0,80786.0
0001374869,10067.0,10.0,10067.0,1999.0,6861.0
B0001I1KOG,148258.0,10.0,148258.0,25555.0,66369.0
0001360469,10067.0,10.0,10067.0,1999.0,6860.0
...,...,...,...,...,...
0679803726,93047.0,8.0,93047.0,16189.0,45604.0
075350426X,203456.0,8.0,203456.0,34585.0,81276.0
0753504545,236727.0,8.0,236727.0,40280.0,89131.0
0753505045,183316.0,8.0,183316.0,31357.0,76440.0


In [7]:
# edge_index_user_to_isbn
edge_index_user_to_isbn = ratings_filtered_m[["user_id_mapped", "isbn_id_mapped"]]
edge_index_user_to_isbn_user = torch.from_numpy(edge_index_user_to_isbn.user_id_mapped.values)
edge_index_user_to_isbn_isbn =torch.from_numpy( edge_index_user_to_isbn.isbn_id_mapped.values)
edge_index_user_to_isbn_user
edge_index_user_to_isbn_isbn

edge_index_user_to_isbn = torch.stack([edge_index_user_to_isbn_user, edge_index_user_to_isbn_isbn], dim=0)
edge_index_user_to_isbn

tensor([[    0,     0,     0,  ..., 47071, 47072, 47073],
        [    0,     1,     2,  ..., 98416, 12023, 79963]])

In [8]:
book_x = books[["ISBN", "Year-Of-Publication", "Publisher"]]
book_x = book_x[(book_x["ISBN"].isin(list(ratings_filtered_m.isbn_id.unique())) )]
#ratings_filtered_m.isbn_id.unique()

print(len(book_x["ISBN"].values))
print(len(ratings_filtered_m.isbn_id.unique()))

#df_mapping_item, left_on = "ISBN", right_on="isbn_id", how = "left"
book_x = book_x.merge(df_mapping_item,left_on =  "ISBN",right_on = "isbn_id", how= "left")
book_x = book_x.sort_values(by=['isbn_id_mapped'])

book_x = book_x.set_index('isbn_id_mapped')
book_x = book_x[["Year-Of-Publication", "Publisher"]]

print(book_x.info())

for c in [ "Year-Of-Publication", "Publisher"]:
    
    print(f'--- {c} {book_x[c].nunique()}')
    
book_x = pd.get_dummies(book_x, columns=["Publisher"], prefix=["publisher"])
book_x.replace({True:1, False:0}, inplace=True)
book_x["Year-Of-Publication"]= book_x["Year-Of-Publication"].astype(int)


display(book_x.head(2))


print("After Transformation")
print(book_x.info())


98417
98417
<class 'pandas.core.frame.DataFrame'>
Index: 98417 entries, 0 to 98416
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Year-Of-Publication  98417 non-null  object
 1   Publisher            98415 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB
None
--- Year-Of-Publication 166
--- Publisher 8750


  book_x.replace({True:1, False:0}, inplace=True)


KeyboardInterrupt: 

In [None]:
torch_isbn = torch.tensor(book_x.values)
torch_isbn.shape


data = HeteroData()

data["user"].node_id = torch.from_numpy(ratings_filtered_m.user_id_mapped.unique())
data["isbn"].node_id =torch.from_numpy(ratings_filtered_m.isbn_id_mapped.unique())
data["isbn"].x = torch_isbn
data["user", "review", "isbn"].edge_index  = edge_index_user_to_isbn
data = T.ToUndirected()(data)
print(data)

HeteroData(
  user={ node_id=[47074] },
  isbn={
    node_id=[98417],
    x=[98417, 8751],
  },
  (user, review, isbn)={ edge_index=[2, 223807] },
  (isbn, rev_review, user)={ edge_index=[2, 223807] }
)


### Data Transformation -> Random Splitter + LinkLoader

In [None]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "review", "isbn"),
    rev_edge_types=("isbn", "rev_review", "user"), 
)
train_data, val_data, test_data = transform(data)

In [None]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["user", "review", "isbn"].edge_label_index
edge_label = train_data["user", "review", "isbn"].edge_label


In [None]:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[5, 5],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "review", "isbn"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

val_loader = LinkNeighborLoader(
    
     data=val_data,
    num_neighbors=[5, 5],
    #neg_sampling_ratio=2.0,
    edge_label_index=(("user", "review", "isbn"), val_data["user", "review", "isbn"].edge_label_index),
    edge_label=val_data["user", "review", "isbn"].edge_label,
    batch_size=128,
    shuffle=True,
    )

test_loader = LinkNeighborLoader(
    
     data=test_data,
    num_neighbors=[5, 5],
    #neg_sampling_ratio=2.0,
    edge_label_index=(("user", "review", "isbn"), test_data["user", "review", "isbn"].edge_label_index),
    edge_label=test_data["user", "review", "isbn"].edge_label,
    batch_size=128,
    shuffle=True,
    )
#test_loader = LinkNeighborLoader(test_dataset, batch_size=32)

In [None]:
train_data

HeteroData(
  [1muser[0m={ node_id=[47074] },
  [1misbn[0m={
    node_id=[98417],
    x=[98417, 8751]
  },
  [1m(user, review, isbn)[0m={
    edge_index=[2, 125333],
    edge_label=[53714],
    edge_label_index=[2, 53714]
  },
  [1m(isbn, rev_review, user)[0m={ edge_index=[2, 125333] }
)

In [None]:
sample_data = next(iter(train_loader))
sample_data

HeteroData(
  user={
    node_id=[2010],
    n_id=[2010],
    num_sampled_nodes=[3],
  },
  isbn={
    node_id=[2514],
    x=[2514, 8751],
    n_id=[2514],
    num_sampled_nodes=[3],
  },
  (user, review, isbn)={
    edge_index=[2, 2638],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[2638],
    num_sampled_edges=[2],
    input_id=[128],
  },
  (isbn, rev_review, user)={
    edge_index=[2, 2601],
    e_id=[2601],
    num_sampled_edges=[2],
  }
)


### Model Generation

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

Device: 'cpu'


In [None]:
data.metadata

<bound method HeteroData.metadata of HeteroData(
  user={ node_id=[47074] },
  isbn={
    node_id=[98417],
    x=[98417, 8751],
  },
  (user, review, isbn)={ edge_index=[2, 223807] },
  (isbn, rev_review, user)={ edge_index=[2, 223807] }
)>

In [None]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, x, edge_index) :
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x 
    
class Classifier(torch.nn.Module): 
    def forward(self, x_user, x_movie, edge_label_index) :
        
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)
    
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.movie_lin = torch.nn.Linear(8751, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["isbn"].num_nodes, hidden_channels)
        self.gnn = GNN(hidden_channels)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()
        
    def forward(self, data: HeteroData) :
    
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "isbn": self.movie_lin(data["isbn"].x.float()) + self.movie_emb(data["isbn"].node_id),
        } 
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["isbn"],
            data["user", "review", "isbn"].edge_label_index,
        )
        return pred
    
model = Model(hidden_channels=64)

### Training and Validation

In [None]:


optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
criterion = BCEWithLogitsLoss()

def train():
    model.train()
    total_loss = total_examples = 0

    for sampled_data in tqdm.tqdm(train_loader):
        sampled_data.to(device)
        optimizer.zero_grad()
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "review", "isbn"].edge_label
        loss = criterion(pred, ground_truth)
        
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    #print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")
    return total_loss / total_examples


@torch.no_grad()
def test(loader):
    model.eval()
    y_pred, y_true = [], []
    
    for s_data in tqdm.tqdm(loader):
        
        s_data = s_data.to(device)
        out = model(s_data)
        y_pred.append(out)
        ground_truth = s_data["user", "review", "isbn"].edge_label
        y_true.append(ground_truth)
    auc = roc_auc_score(torch.cat(y_true), torch.cat(y_pred))
    ap = average_precision_score(torch.cat(y_true), torch.cat(y_pred))

    return auc, ap

In [None]:
for epoch in range(30):#30 epochen
    loss = train()
    
    if epoch % 5 == 0:
        val_auc, val_ap = test(val_loader)
        print(f'Epoch {epoch:>2} | Loss: {loss:.4f} | Val AUC: {val_auc:.4f} | Val AP: {val_ap:.4f}')

  0%|          | 0/420 [00:00<?, ?it/s]

100%|██████████| 420/420 [04:53<00:00,  1.43it/s]
100%|██████████| 525/525 [02:30<00:00,  3.50it/s]


Epoch  0 | Loss: 2.1353 | Val AUC: 0.5798 | Val AP: 0.3638


100%|██████████| 420/420 [05:11<00:00,  1.35it/s]
100%|██████████| 420/420 [03:29<00:00,  2.01it/s]
100%|██████████| 420/420 [03:17<00:00,  2.13it/s]
100%|██████████| 420/420 [03:18<00:00,  2.12it/s]
100%|██████████| 420/420 [03:17<00:00,  2.12it/s]
100%|██████████| 525/525 [01:35<00:00,  5.50it/s]


Epoch  5 | Loss: 0.5514 | Val AUC: 0.7475 | Val AP: 0.5820


100%|██████████| 420/420 [04:05<00:00,  1.71it/s]
100%|██████████| 420/420 [03:58<00:00,  1.76it/s]
100%|██████████| 420/420 [05:21<00:00,  1.31it/s]
100%|██████████| 420/420 [04:03<00:00,  1.73it/s]
100%|██████████| 420/420 [03:16<00:00,  2.13it/s]
100%|██████████| 525/525 [01:35<00:00,  5.51it/s]


Epoch 10 | Loss: 0.4598 | Val AUC: 0.8204 | Val AP: 0.7004


100%|██████████| 420/420 [03:18<00:00,  2.11it/s]
100%|██████████| 420/420 [03:17<00:00,  2.13it/s]
100%|██████████| 420/420 [03:17<00:00,  2.13it/s]
100%|██████████| 420/420 [03:19<00:00,  2.11it/s]
100%|██████████| 420/420 [03:17<00:00,  2.12it/s]
100%|██████████| 525/525 [01:36<00:00,  5.45it/s]


Epoch 15 | Loss: 0.4082 | Val AUC: 0.8540 | Val AP: 0.7585


100%|██████████| 420/420 [03:18<00:00,  2.12it/s]
100%|██████████| 420/420 [03:35<00:00,  1.95it/s]
100%|██████████| 420/420 [03:41<00:00,  1.90it/s]
100%|██████████| 420/420 [02:55<00:00,  2.39it/s]
100%|██████████| 420/420 [02:31<00:00,  2.77it/s]
100%|██████████| 525/525 [01:18<00:00,  6.66it/s]


Epoch 20 | Loss: 0.3743 | Val AUC: 0.8722 | Val AP: 0.7856


100%|██████████| 420/420 [02:31<00:00,  2.78it/s]
100%|██████████| 420/420 [02:33<00:00,  2.74it/s]
100%|██████████| 420/420 [02:32<00:00,  2.75it/s]
100%|██████████| 420/420 [02:34<00:00,  2.72it/s]
100%|██████████| 420/420 [02:31<00:00,  2.78it/s]
100%|██████████| 525/525 [01:17<00:00,  6.76it/s]


Epoch 25 | Loss: 0.3481 | Val AUC: 0.8749 | Val AP: 0.7904


100%|██████████| 420/420 [02:30<00:00,  2.79it/s]
100%|██████████| 420/420 [02:29<00:00,  2.80it/s]
100%|██████████| 420/420 [02:32<00:00,  2.76it/s]
100%|██████████| 420/420 [02:40<00:00,  2.61it/s]


In [None]:
val_auc, val_ap = test(val_loader)
print(f'Epoch {epoch:>2} | Loss: {loss:.4f} | Val AUC: {val_auc:.4f} | Val AP: {val_ap:.4f}')

100%|██████████| 525/525 [01:37<00:00,  5.39it/s]


Epoch 29 | Loss: 0.3299 | Val AUC: 0.8807 | Val AP: 0.8026


### Test Model

In [None]:
t_auc, t_ap = test(test_loader)
print(f' Val AUC: {t_auc:.4f} | Val AP: {t_ap:.4f}')

100%|██████████| 525/525 [01:40<00:00,  5.24it/s]

 Val AUC: 0.8723 | Val AP: 0.7713





Val AUC: 0.8724 | Val AP: 0.7822

### Safe Model

In [None]:
PATH = ""
torch.save(model.state_dict(), PATH)

In [None]:
model = Model(hidden_channels=64)
model.load_state_dict(torch.load(PATH, weights_only=True))
model.eval()

Model(
  (movie_lin): Linear(in_features=8751, out_features=64, bias=True)
  (user_emb): Embedding(47074, 64)
  (movie_emb): Embedding(98417, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__review__isbn): SAGEConv(64, 64, aggr=mean)
      (isbn__rev_review__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__review__isbn): SAGEConv(64, 64, aggr=mean)
      (isbn__rev_review__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)