In [None]:
import pandas as pd
import numpy as np
import torch
import pickle
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from torch import Tensor
import tqdm
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
import wandb
import datetime
import os
import yaml
import logging
import h5py

In [None]:
# Set up the root logger: records at INFO level and above are emitted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

# Loading and processing data

In [None]:
def _normalise_genre_list(raw):
    """Turn a stringified list such as "['a', 'b c']" into "a,bc".

    Surrounding brackets, single quotes and every space are removed, so the
    result can be split on "," afterwards.
    """
    return raw.strip("[]").replace("'", "").replace(" ", "")


# Ratings (user -> book interactions), book metadata and the users' genre table.
# The "genres" column of the last two files is normalised while parsing.
users_2_books = pd.read_csv('data/ratings.csv')
books_2_genres = pd.read_csv('data/books_clean.csv', converters={"genres": _normalise_genre_list})
users_2_genres = pd.read_csv('data/user_genres.csv', converters={"genres": _normalise_genre_list})

## Book features from genres

In [None]:
# Multi-hot encode the comma-separated genre string of every book:
# one indicator column per distinct genre.
book_genres = books_2_genres['genres'].str.get_dummies(sep=",")
logger.info(f"Book genres: {book_genres.columns}")

In [None]:
# Float feature matrix for the book nodes (one row per row of `book_genres`).
book_feat = torch.from_numpy(book_genres.to_numpy()).float()
# Sanity check on the expected dataset dimensions.
assert tuple(book_feat.shape) == (10000, 39)

## User features from genres

In [None]:
# Sorted ids of every user that appears in the ratings table.
unique_user_id = np.sort(users_2_books['user_id'].unique())

# Users that rated at least one book but have no row in the genre table.
# They receive an empty genre string, i.e. an all-zero feature row later on.
missing_user_ids = unique_user_id[
    ~np.isin(unique_user_id, users_2_genres["user_id"].to_numpy())
]
counter = len(missing_user_ids)

# Rebuild the genre table with exactly one row per rating user, ordered by
# ascending user_id. The node mapping built further down assigns consecutive
# indices in that same sorted order, so feature row i must describe the i-th
# sorted user; appending the missing users at the end would break that.
# A left merge keeps the order of the left keys and yields NaN genres for the
# missing users, which are then replaced by "".
users_2_genres = pd.DataFrame({"user_id": unique_user_id}).merge(
    users_2_genres.drop_duplicates(subset="user_id"),
    on="user_id",
    how="left",
)
users_2_genres["genres"] = users_2_genres["genres"].fillna("")

logger.info(f"Number of users added: {counter}")

In [None]:
# Multi-hot encode the comma-separated genre string of every user.
user_genres = users_2_genres['genres'].str.get_dummies(",")

# Genres that exist for books but for no user are appended as all-zero columns
# (in the order they appear in `book_genres`), so that every book genre has a
# matching user column.
genres_without_users = [
    genre for genre in book_genres.columns if genre not in user_genres.columns
]
user_genres = user_genres.reindex(
    columns=[*user_genres.columns, *genres_without_users],
    fill_value=0,
)

logger.info(f"User genres: {user_genres.columns}")

# Float feature matrix for the user nodes (one row per row of `user_genres`).
user_feat = torch.from_numpy(user_genres.to_numpy()).to(torch.float)

## Edge Index and mapping between user and book

In [None]:
# Map every distinct user id (sorted ascending) onto a consecutive node index
# in [0, num_user_nodes).
unique_user_id = np.sort(users_2_books['user_id'].unique())
unique_user_id = pd.DataFrame({
    'user_id': unique_user_id,
    'mapped_id': pd.RangeIndex(unique_user_id.size),
})

logger.info(f"Mapping of user IDs to consecutive values: \n {unique_user_id.head()}")


In [None]:
# Map every distinct book id (sorted ascending) onto a consecutive node index
# in [0, num_book_nodes).
unique_book_id = np.sort(users_2_books['book_id'].unique())
unique_book_id = pd.DataFrame({
    'book_id': unique_book_id,
    'mapped_id': pd.RangeIndex(unique_book_id.size),
})

logger.info(f"Mapping of book IDs to consecutive values: \n {unique_book_id.head()}")

In [None]:
# Translate the raw user/book id of every rating into its consecutive node
# index by left-joining against the two mapping tables (rating order is kept).
ratings_user_id = users_2_books[['user_id']].merge(unique_user_id, on='user_id', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mapped_id'].to_numpy())

ratings_book_id = users_2_books[['book_id']].merge(unique_book_id, on='book_id', how='left')
ratings_book_id = torch.from_numpy(ratings_book_id['mapped_id'].to_numpy())

# Row 0 holds the source (user) indices, row 1 the destination (book) indices.
edge_index_user_to_book = torch.stack((ratings_user_id, ratings_book_id), dim=0)

logger.info(f"Final edge indices pointing from users to books: \n {edge_index_user_to_book}")

## Hetero data initialization

In [None]:
data = HeteroData()

# User nodes: consecutive indices plus the genre feature matrix.
data["user"].node_id = torch.arange(len(unique_user_id))
data["user"].x = user_feat

# Book nodes: one node per row of the book table plus the genre feature matrix.
# NOTE(review): edge indices use the sorted book ids found in the ratings,
# while features follow the row order of `books_2_genres` - confirm that both
# orders agree.
data["book"].node_id = torch.arange(len(books_2_genres))
data["book"].x = book_feat

# Rating edges pointing from users to books.
data["user", "rates", "book"].edge_index = edge_index_user_to_book

# Add the reverse (book -> user) edges so that the GNN can pass messages in
# both directions.
data = T.ToUndirected()(data)

# Model definition and helper functions

In [None]:
class GNN(torch.nn.Module):
    """Two stacked `SAGEConv` layers that keep the embedding width unchanged.

    Args:
        hidden_channels: Input and output width of both convolution layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Apply conv1, a ReLU, then conv2 (no activation on the output)."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

class Classifier(torch.nn.Module):
    """Scores candidate (user, book) edges by the dot product of their embeddings."""

    def forward(self, x_user: Tensor, x_book: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of `edge_label_index`.

        Args:
            x_user: User embeddings, indexed by the first row of `edge_label_index`.
            x_book: Book embeddings, indexed by the second row of `edge_label_index`.
            edge_label_index: Row 0 selects users, row 1 selects books.

        Returns:
            The element-wise product of the selected embeddings, summed over the
            last dimension.
        """
        user_rows = edge_label_index[0]
        book_rows = edge_label_index[1]
        pairwise_product = x_user[user_rows] * x_book[book_rows]
        return torch.sum(pairwise_product, dim=-1)

class Model(torch.nn.Module):
    """Link-prediction model for ("user", "rates", "book") edges.

    Each node representation is the sum of a linear projection of its genre
    features and a learned per-node embedding. The representations are refined
    by a heterogeneous version of `GNN`, and candidate edges are scored by
    `Classifier`.

    The constructor reads the notebook-level `data` graph for the node counts,
    the feature widths and the metadata handed to `to_hetero`.

    Args:
        hidden_channels: Width of every projection, embedding and GNN layer.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Project the genre features to the hidden width. The input width is
        # taken from the feature matrices instead of being hard-coded, so the
        # model follows the data if the number of genre columns changes.
        self.book_lin = torch.nn.Linear(data["book"].x.size(-1), hidden_channels)
        self.user_lin = torch.nn.Linear(data["user"].x.size(-1), hidden_channels)
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices, one for users and one for books:
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.book_emb = torch.nn.Embedding(data["book"].num_nodes, hidden_channels)

        # Instantiate the homogeneous GNN and convert it into a heterogeneous one:
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Score the edges listed in `edge_label_index` of the given (sub)graph.

        Args:
            data: Graph providing `x` and `node_id` for "user" and "book" nodes,
                `edge_index_dict`, and `edge_label_index` on the
                ("user", "rates", "book") edge type.

        Returns:
            One raw score (logit) per labelled edge.
        """
        x_dict = {
            "user": self.user_lin(data["user"].x) + self.user_emb(data["user"].node_id),
            "book": self.book_lin(data["book"].x) + self.book_emb(data["book"].node_id),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["book"],
            data["user", "rates", "book"].edge_label_index,
        )
        return pred
        


In [None]:
def validate(model, val_loader, device):
    """Evaluate `model` on every batch of `val_loader`.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Module that maps a sampled batch to one logit per labelled edge.
        val_loader: Iterable of batches carrying `edge_label` on the
            ("user", "rates", "book") edge type.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple `(auc, acc)`: ROC-AUC of the raw logits and balanced accuracy of
        the thresholded predictions.
    """
    model.eval()
    preds = []
    ground_truths = []
    with torch.no_grad():
        for sampled_data in tqdm.tqdm(val_loader):
            sampled_data = sampled_data.to(device)
            preds.append(model(sampled_data))
            ground_truths.append(sampled_data["user", "rates", "book"].edge_label)
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    # sigmoid(logit) >= 0.5 is equivalent to logit >= 0, so the threshold is
    # applied to the whole logit array at once instead of element by element.
    acc = balanced_accuracy_score(ground_truth, pred >= 0.0)

    return auc, acc

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Only exp(-|x|) is evaluated, which lies in (0, 1], so large negative inputs
    no longer overflow inside `np.exp`.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array of the same shape.
    """
    values = np.asarray(x)
    decay = np.exp(-np.abs(values))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) - the same function.
    result = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves arrays as is.
    return result[()]

def save_hetero_data(data: HeteroData, file_path: str) -> None:
    """Write the tensors of a heterogeneous graph to an HDF5 file.

    Layout of the file (an existing file is overwritten):
      * `<node_type>/x` for every node type that has features,
      * `<edge_type>/edge_index` for every edge type,
      * `<edge_type>/edge_label` and `<edge_type>/edge_label_index` when the
        edge type carries them.

    `edge_label_index` is stored next to `edge_label` because the labels can
    only be interpreted together with the node pairs they refer to.

    Args:
        data: Graph to serialise.
        file_path: Destination path of the HDF5 file.
    """
    with h5py.File(file_path, 'w') as f:
        for node_type in data.node_types:
            if 'x' in data[node_type]:
                f.create_dataset(f'{node_type}/x', data=data[node_type].x.cpu().numpy())
        for edge_type in data.edge_types:
            edge_store = data[edge_type]
            f.create_dataset(f'{edge_type}/edge_index', data=edge_store.edge_index.cpu().numpy())
            for key in ('edge_label', 'edge_label_index'):
                if key in edge_store:
                    f.create_dataset(f'{edge_type}/{key}', data=edge_store[key].cpu().numpy())
                
def test(model, test_loader, device):
    """Evaluate `model` on the test loader.

    The test protocol is identical to the validation protocol, so this
    delegates to `validate` instead of duplicating its body.

    Args:
        model: Module that maps a sampled batch to one logit per labelled edge.
        test_loader: Iterable of test batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple `(auc, acc)` as returned by `validate`.
    """
    return validate(model, test_loader, device)

# Wandb Sweep

In [None]:
# Authenticate with Weights & Biases using the token stored in the
# NML_ACCESS_TOKEN environment variable (None when the variable is not set).
wandb_token = os.getenv("NML_ACCESS_TOKEN")
wandb.login(key=wandb_token)

# Initialize wandb
wandb_project_name = 'Final_sweep_Book_user_feat'
# Both labels are derived from a single timestamp so that they can never
# disagree about the day when the cell runs around midnight.
_session_started_at = datetime.datetime.now()
date_time = _session_started_at.strftime("%m_%d_%H_%M_%S")
day_time = _session_started_at.strftime("%m_%d")

In [None]:
def upgrade_file_version(folder_path: str, prefix=None) -> int:
    """Return the next free version number for run folders of one day.

    Run folders are named `<prefix>_v<NNN>`. The function looks at every entry
    of `folder_path` with that shape and returns the highest version found plus
    one, independent of the order in which the entries are listed. Entries
    whose version part is not numeric are ignored.

    Args:
        folder_path: Directory that contains the run folders.
        prefix: Leading part of the folder names. Defaults to the notebook-level
            `day_time` label, which is what the run folders are named after
            (the second-resolution `date_time` label never matches them).

    Returns:
        Highest existing version + 1, or 1 when no matching entry exists.
    """
    if prefix is None:
        prefix = day_time
    marker = f"{prefix}_v"

    latest_version = 0
    for entry in os.listdir(folder_path):
        if not entry.startswith(marker):
            continue
        # Keep only the digits between the marker and an optional extension.
        digits = entry[len(marker):].split(".")[0]
        if digits.isdigit():
            latest_version = max(latest_version, int(digits))
    return latest_version + 1
# Load the sweep definition from sweep_config.yaml and register it with wandb.
with open("sweep_config.yaml") as config_file:
    sweep_config = yaml.safe_load(config_file)

sweep_id = wandb.sweep(sweep_config, project=wandb_project_name)

logger.info(f"Sweep config: {sweep_config}")
logger.info(f"Sweep id: {sweep_id}")

In [None]:
def _make_link_loader(split_data, config, batch_size, shuffle, neg_sampling_ratio=None):
    """Build a `LinkNeighborLoader` over the labelled "rates" edges of one split."""
    rates_store = split_data["user", "rates", "book"]
    loader_kwargs = dict(
        data=split_data,
        num_neighbors=[config.first_num_neighbours, config.second_num_neighbours],
        edge_label_index=(("user", "rates", "book"), rates_store.edge_label_index),
        edge_label=rates_store.edge_label,
        batch_size=batch_size,
        shuffle=shuffle,
    )
    # Only forwarded when requested, so that the loader's own default applies
    # to the evaluation loaders.
    if neg_sampling_ratio is not None:
        loader_kwargs["neg_sampling_ratio"] = neg_sampling_ratio
    return LinkNeighborLoader(**loader_kwargs)


def train(config=None):
    """Run one wandb sweep trial: split, train, validate, checkpoint and test.

    Hyperparameters are read from `wandb.config` after `wandb.init`:
    `negative_sampling_ratio`, `first_num_neighbours`, `second_num_neighbours`,
    `batch_size`, `hidden_channels`, `optimizer` ("Adam" or "SGD"), `lr`,
    `loss` ("CrossEntropyLoss") and `epochs`.

    Side effects: creates `models/book_user_feat/<day_time>_vNNN`, stores the
    test split, one checkpoint per epoch, the final weights and a pickled
    metrics dictionary in it, and logs per-epoch metrics to wandb.

    Args:
        config: Optional configuration forwarded to `wandb.init`.

    Raises:
        ValueError: If the configured optimizer or loss is not supported.
    """
    with wandb.init(config=config):

        config = wandb.config

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Device: {device}")

        # Create splits data
        transform = T.RandomLinkSplit(
            num_val=0.1,
            num_test=0.1,
            disjoint_train_ratio=0.3,
            neg_sampling_ratio=config.negative_sampling_ratio,
            add_negative_train_samples=False,
            edge_types=("user", "rates", "book"),
            rev_edge_types=("book", "rev_rates", "user"),
        )
        train_data, val_data, test_data = transform(data)

        # Create loaders. Negatives for training are sampled on the fly by the
        # loader; validation/test splits already contain their negatives, so
        # their batches are scaled up accordingly. The batch size is cast to
        # int because a fractional sampling ratio would otherwise produce a
        # float batch size.
        eval_batch_size = int((1 + config.negative_sampling_ratio) * config.batch_size)
        train_loader = _make_link_loader(
            train_data,
            config,
            batch_size=config.batch_size,
            shuffle=True,
            neg_sampling_ratio=config.negative_sampling_ratio,
        )
        val_loader = _make_link_loader(val_data, config, batch_size=eval_batch_size, shuffle=False)
        test_loader = _make_link_loader(test_data, config, batch_size=eval_batch_size, shuffle=False)

        model = Model(hidden_channels=config.hidden_channels)
        model = model.to(device)

        if config.optimizer == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
        elif config.optimizer == "SGD":
            optimizer = torch.optim.SGD(model.parameters(), lr=config.lr)
        else:
            raise ValueError("Invalid optimizer")

        # Reject an unsupported loss before any folder or file is created.
        if config.loss != "CrossEntropyLoss":
            raise ValueError("Invalid loss")

        # Pick the run folder: v000 for the first run of the day, otherwise
        # the next version reported by `upgrade_file_version`.
        models_root = 'models/book_user_feat'
        if os.path.exists(f'{models_root}/{day_time}_v000'):
            version = upgrade_file_version(models_root)
        else:
            version = 0

        saving_path = f'{models_root}/{day_time}_v{int(version):03d}'
        os.makedirs(saving_path)
        logger.info(f"Saving path: {saving_path}")

        # Save the test split for future testing of the best model
        # (previously the training split was written to this file).
        test_set_path = saving_path + '/test_data.h5'
        save_hetero_data(test_data, test_set_path)
        logger.info(f"Test data saved at: {test_set_path}")

        metrics_dict = {}

        for epoch in range(0, config.epochs):
            # Training. `validate` leaves the model in eval mode, so training
            # mode is restored at the start of every epoch.
            model.train()
            total_loss = total_examples = 0
            pred_chunks = []
            gt_chunks = []
            for sampled_data in tqdm.tqdm(train_loader):
                optimizer.zero_grad()
                sampled_data.to(device)
                pred = model(sampled_data)
                ground_truth = sampled_data["user", "rates", "book"].edge_label
                loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

                loss.backward()
                optimizer.step()
                # Weight the batch loss by its size to get a per-example mean.
                total_loss += float(loss) * pred.numel()
                total_examples += pred.numel()
                pred_chunks.append(torch.sigmoid(pred).detach().cpu().numpy())
                gt_chunks.append(ground_truth.cpu().numpy())

            loss = total_loss / total_examples
            acc_train = balanced_accuracy_score(
                np.concatenate(gt_chunks),
                np.concatenate(pred_chunks) >= 0.5,
            )

            torch.save(model.state_dict(), saving_path + f'/model_epoch_{epoch}.pth')

            # Validation
            val_auc, acc_val = validate(model, val_loader, device)

            # Log the epoch metrics to wandb
            wandb.log({'epoch': epoch, 'Validation AUC': val_auc, 'Loss': loss ,'training accuracy': acc_train, 'validation_accuracy': acc_val})
            logger.info(f"Epoch {epoch} - Validation AUC: {val_auc:.4f} - Loss: {loss:.4f} - Training Accuracy: {acc_train:.4f} - Validation Accuracy: {acc_val:.4f}")

            metrics_dict[epoch] = {"loss": loss, "auc": val_auc}

            # Persist the metrics after every epoch (the file is overwritten)
            with open(saving_path + '/metrics_dict.pkl', 'wb') as f:
                pickle.dump(metrics_dict, f)

        torch.save(model.state_dict(), saving_path + f'/final_model.pth')

        # Test the model
        with torch.no_grad():
            test_auc, acc_test = test(model, test_loader, device)

        logger.info(f"Test AUC: {test_auc:.4f} - Test Accuracy: {acc_test:.4f}")

        # Finish the wandb run
        wandb.finish()
            

In [None]:
# Launch the sweep agent: `train` is called once per trial, for 20 trials.
wandb.agent(sweep_id, function=train, count=20)