# installing libraries

In [1]:
!pip install torch_geometric



In [2]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


# importing libraries

In [3]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, SAGEConv, GCNConv
from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split, KFold
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim.lr_scheduler import ReduceLROnPlateau
import logging
import json
from typing import Dict, List, Tuple, Optional
import warnings
from tqdm import tqdm
import itertools
from sklearn.manifold import TSNE
import seaborn as sns
import warnings
import umap
warnings.filterwarnings("ignore")

# Set up logging

In [4]:

# Configure the root logger once for the notebook: INFO-and-above records are
# written to 'training.log', each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='training.log'
)

# Predicting new collaborations

In [5]:
def predict_new_collaborations(model, data, nodes_df, node_index_map, num_predictions=10):
    """Score randomly sampled non-edges and return the highest-ranked ones.

    1000 node pairs that are not connected in ``data.edge_index`` are sampled
    with ``generate_negative_edges``, scored with ``model.predict_edges`` and
    sorted by descending score.

    Args:
        model: Network exposing ``model(x, edge_index)`` and
            ``predict_edges(embeddings, pairs)``.
        data: Graph object providing ``x``, ``edge_index`` and ``num_nodes``.
        nodes_df: Frame with ``spotify_id`` and ``name`` columns.
        node_index_map: Mapping from spotify id to node index.
        num_predictions: Number of top-scoring pairs to report.

    Returns:
        A list of dicts with keys ``artist1``, ``artist2`` and ``score``,
        best score first.
    """
    device = data.x.device
    model.eval()
    with torch.no_grad():
        # Candidate collaborations: pairs with no edge in either direction.
        candidate_pairs = generate_negative_edges(data.num_nodes, data.edge_index, 1000).to(device)

        node_embeddings = model(data.x, data.edge_index)
        scores = model.predict_edges(node_embeddings, candidate_pairs)

        ranking = torch.argsort(scores, descending=True)[:num_predictions]
        best_pairs = candidate_pairs[ranking].tolist()
        best_scores = scores[ranking].tolist()

        # Invert the id -> index map so node indices can be turned back into ids.
        index_to_id = {node_idx: spotify_id for spotify_id, node_idx in node_index_map.items()}

        results = []
        for (first_idx, second_idx), score in zip(best_pairs, best_scores):
            first_id = index_to_id[first_idx]
            second_id = index_to_id[second_idx]
            # The first matching row supplies the display name.
            first_name = nodes_df[nodes_df['spotify_id'] == first_id]['name'].iloc[0]
            second_name = nodes_df[nodes_df['spotify_id'] == second_id]['name'].iloc[0]
            results.append({
                'artist1': first_name,
                'artist2': second_name,
                'score': score
            })

        return results

# visualizing TSNE

In [6]:
def visualize_tsne_embeddings(embeddings, nodes_df, genre_columns, filename="tsne_embeddings.png"):
    """Project node embeddings to 2D with t-SNE and save a genre-coloured scatter plot.

    Args:
        embeddings: Node embeddings; a ``torch.Tensor`` is detached and copied
            to a NumPy array first, anything else is passed to t-SNE as is.
        nodes_df: Frame holding one column per genre.
        genre_columns: Genre column names; each point is coloured by the column
            returned by ``idxmax`` over these columns for its row.
        filename: Path of the image file to write.
    """
    points = embeddings
    if isinstance(points, torch.Tensor):
        points = points.cpu().detach().numpy()

    # Fixed seed keeps the projection reproducible between runs.
    projection = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(points)

    dominant_genre = nodes_df[genre_columns].idxmax(axis=1)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        x=projection[:, 0],
        y=projection[:, 1],
        hue=dominant_genre,
        palette="tab10",
        alpha=0.7,
        ax=ax,
    )
    ax.set_title("t-SNE Visualization of Node Embeddings")
    ax.set_xlabel("t-SNE Dim 1")
    ax.set_ylabel("t-SNE Dim 2")
    ax.legend(loc="best", fontsize="small")
    fig.savefig(filename)
    plt.close(fig)

# plotting degree distribution

In [7]:
def plot_degree_distribution(edge_index, filename="degree_distribution.png"):
    """Save a log-scaled histogram of how often each node index occurs in ``edge_index``.

    Every occurrence of a node index anywhere in ``edge_index`` (either
    endpoint) counts once towards that node's degree.

    Args:
        edge_index: Integer tensor of node indices describing the edges.
        filename: Path of the image file to write.
    """
    # bincount over the flattened tensor counts appearances per node index.
    node_degrees = torch.bincount(edge_index.reshape(-1)).cpu().numpy()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(node_degrees, bins=50, log=True, alpha=0.7, color='b', edgecolor='black')
    ax.set_xlabel("Degree")
    ax.set_ylabel("Frequency (log scale)")
    ax.set_title("Node Degree Distribution")
    ax.grid(True)

    fig.savefig(filename)
    plt.close(fig)

# plotting UMAP embeddings

In [8]:
def visualize_umap_embeddings(embeddings, nodes_df, genre_columns, filename="umap_embeddings.png"):
    """Project node embeddings to 2D with UMAP and save a genre-coloured scatter plot.

    Args:
        embeddings: Node embeddings; a ``torch.Tensor`` is detached and copied
            to a NumPy array first, anything else is passed to UMAP as is.
        nodes_df: Frame holding one column per genre.
        genre_columns: Genre column names; each point is coloured by the column
            returned by ``idxmax`` over these columns for its row.
        filename: Path of the image file to write.
    """
    points = embeddings
    if isinstance(points, torch.Tensor):
        points = points.cpu().detach().numpy()

    # Fixed seed keeps the projection reproducible between runs.
    projection = umap.UMAP(n_components=2, random_state=42).fit_transform(points)

    dominant_genre = nodes_df[genre_columns].idxmax(axis=1)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        x=projection[:, 0],
        y=projection[:, 1],
        hue=dominant_genre,
        palette="tab10",
        alpha=0.7,
        ax=ax,
    )
    ax.set_title("UMAP Visualization of Node Embeddings")
    ax.set_xlabel("UMAP Dim 1")
    ax.set_ylabel("UMAP Dim 2")
    ax.legend(loc="best", fontsize="small")
    fig.savefig(filename)
    plt.close(fig)

# Generating negative edges

In [9]:
def generate_negative_edges(num_nodes, edge_index, num_samples):
    """Sample distinct node pairs that are not connected in ``edge_index``.

    A pair ``(i, j)`` qualifies when ``i != j`` and neither ``(i, j)`` nor
    ``(j, i)`` appears as a column of ``edge_index``. Pairs are drawn with
    Python's ``random`` module, so seeding ``random`` makes the result
    reproducible.

    Args:
        num_nodes: Number of nodes; indices are drawn from ``[0, num_nodes)``.
        edge_index: Integer tensor of shape ``(2, num_edges)`` with the
            existing edges.
        num_samples: Number of negative pairs to return.

    Returns:
        A CPU ``torch.long`` tensor of shape ``(num_samples, 2)``. The shape is
        ``(0, 2)`` when ``num_samples`` is not positive, so callers can still
        transpose and unpack it.

    Raises:
        ValueError: If fewer than ``num_samples`` distinct negative pairs
            exist. Rejection sampling would otherwise never terminate.
    """
    if num_samples <= 0:
        return torch.empty((0, 2), dtype=torch.long)

    # Store each edge in both orientations so one lookup rejects (i, j) and (j, i).
    blocked = set()
    for src, dst in edge_index.t().tolist():
        blocked.add((src, dst))
        blocked.add((dst, src))

    # Count the ordered pairs that can actually be drawn; self-pairs are never
    # candidates, so blocked self-loops must not be subtracted again.
    blocked_candidates = sum(
        1 for src, dst in blocked
        if src != dst and 0 <= src < num_nodes and 0 <= dst < num_nodes
    )
    available = num_nodes * (num_nodes - 1) - blocked_candidates
    if num_samples > available:
        raise ValueError(
            f"Requested {num_samples} negative edges but only {max(available, 0)} "
            f"distinct non-edges exist for {num_nodes} nodes"
        )

    negative_edges = set()
    while len(negative_edges) < num_samples:
        i, j = random.randint(0, num_nodes - 1), random.randint(0, num_nodes - 1)
        if i != j and (i, j) not in blocked:
            negative_edges.add((i, j))

    return torch.tensor(list(negative_edges), dtype=torch.long)

# analyzing false positives

In [10]:
def analyze_false_positives(predictions, nodes_df, genre_columns):
    """Attach genre context to each predicted artist pair.

    Artists are looked up in ``nodes_df`` by exact ``name`` match and the first
    matching row is used. A genre counts as present when its column value is
    greater than zero. Predictions naming an artist that cannot be found are
    reported with a printed warning and left out of the result.

    Args:
        predictions: Iterable of dicts with ``artist1``, ``artist2`` and
            ``score`` keys.
        nodes_df: Frame with a ``name`` column and one column per genre.
        genre_columns: Names of the genre columns to inspect.

    Returns:
        A list of dicts with the pair, its score, the genres both artists have
        and each artist's own genre list.
    """
    report = []
    for entry in predictions:
        first_name, second_name = entry['artist1'], entry['artist2']

        first_rows = nodes_df[nodes_df['name'] == first_name]
        second_rows = nodes_df[nodes_df['name'] == second_name]

        if first_rows.empty or second_rows.empty:
            missing_name = first_name if first_rows.empty else second_name
            print(f"Warning: Could not find data for {missing_name}")
            continue

        # Boolean flag per genre column, in genre_columns order.
        first_flags = first_rows[genre_columns].iloc[0] > 0
        second_flags = second_rows[genre_columns].iloc[0] > 0

        first_genres, second_genres, common_genres = [], [], []
        for genre, in_first, in_second in zip(genre_columns, first_flags, second_flags):
            if in_first:
                first_genres.append(genre)
            if in_second:
                second_genres.append(genre)
            if in_first and in_second:
                common_genres.append(genre)

        report.append({
            'artist1': first_name,
            'artist2': second_name,
            'score': entry['score'],
            'shared_genres': common_genres,
            'artist1_genres': first_genres,
            'artist2_genres': second_genres
        })

    return report

# plotting training metrics

In [11]:
def plot_training_metrics(loss_history, val_loss_history, roc_auc_history):
    """Save loss and ROC-AUC curves to 'training_metrics.png'.

    The figure uses a 1x3 grid: panel 1 shows training and validation loss per
    epoch, panel 2 shows ROC-AUC, and the third cell is left empty.

    Args:
        loss_history: Training loss, one value per epoch.
        val_loss_history: Validation loss, one value per epoch.
        roc_auc_history: ROC-AUC values plotted at epochs 10, 20, ...; its
            length must match that spacing for ``len(loss_history)`` epochs.
    """
    epoch_count = len(loss_history)
    loss_axis = range(1, epoch_count + 1)
    # ROC-AUC is plotted at every tenth epoch.
    roc_axis = range(10, epoch_count + 1, 10)

    fig = plt.figure(figsize=(15, 5))

    loss_ax = fig.add_subplot(1, 3, 1)
    loss_ax.plot(loss_axis, loss_history, 'b-', label='Training Loss')
    loss_ax.plot(loss_axis, val_loss_history, 'r-', label='Validation Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    roc_ax = fig.add_subplot(1, 3, 2)
    roc_ax.plot(roc_axis, roc_auc_history, 'g-', label='ROC-AUC')
    roc_ax.set_title('ROC-AUC Score Evolution')
    roc_ax.set_xlabel('Epoch')
    roc_ax.set_ylabel('ROC-AUC')
    roc_ax.legend()

    fig.tight_layout()
    fig.savefig('training_metrics.png')
    plt.close(fig)

# plotting genre performance

In [12]:
def plot_genre_performance(genre_performance):
    """Save a grouped bar chart of per-genre ROC-AUC and F1 to 'genre_performance.png'.

    Args:
        genre_performance: Mapping from genre name to a dict with ``roc_auc``
            and ``f1`` entries.
    """
    genre_names = list(genre_performance)
    metrics = list(genre_performance.values())
    roc_values = [entry['roc_auc'] for entry in metrics]
    f1_values = [entry['f1'] for entry in metrics]

    bar_width = 0.35
    half = bar_width / 2
    positions = range(len(genre_names))

    fig, ax = plt.subplots(figsize=(12, 6))
    # The two series sit side by side around each integer tick.
    ax.bar([pos - half for pos in positions], roc_values, bar_width, label='ROC-AUC')
    ax.bar([pos + half for pos in positions], f1_values, bar_width, label='F1-Score')

    ax.set_xlabel('Genres')
    ax.set_ylabel('Score')
    ax.set_title('Model Performance by Genre')
    ax.set_xticks(positions)
    ax.set_xticklabels(genre_names, rotation=45, ha='right')
    ax.legend()
    fig.tight_layout()
    fig.savefig('genre_performance.png')
    plt.close(fig)

# Validate and analyze input data.

In [13]:
class DataValidator:
    """Validate and analyze input data."""

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: List[str]) -> bool:
        """Check that ``df`` has every required column.

        Missing columns are logged as an error and make the check fail. Null
        values are only logged as a warning and do not fail the check.

        Args:
            df: Frame to inspect.
            required_columns: Column names that must be present.

        Returns:
            ``True`` when all required columns exist, ``False`` otherwise.
        """
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            logging.error(f"Missing required columns: {missing_cols}")
            return False

        null_counts = df.isnull().sum()
        if null_counts.any():
            logging.warning(f"Null values found:\n{null_counts[null_counts > 0]}")

        return True

    @staticmethod
    def generate_data_report(nodes_df: pd.DataFrame, edges_df: pd.DataFrame, genre_columns: List[str]) -> Dict:
        """Build summary statistics for the node and edge tables.

        ``followers`` and ``popularity`` are coerced to numbers on a copy, with
        unparseable values becoming NaN, so the caller's ``nodes_df`` is left
        untouched. ``missing_data`` counts nulls after that coercion.

        Args:
            nodes_df: Node table with ``followers``, ``popularity`` and the
                genre columns.
            edges_df: Edge table with ``id_0`` and ``id_1`` columns.
            genre_columns: Genre column names to total.

        Returns:
            A dict with ``nodes_stats`` and ``edges_stats`` sub-dicts.
            ``avg_collaborations_per_artist`` is ``0.0`` when ``nodes_df`` has
            no rows.
        """
        # assign() returns a new frame, so the coercion does not leak to the caller.
        numeric_nodes = nodes_df.assign(
            followers=pd.to_numeric(nodes_df['followers'], errors='coerce'),
            popularity=pd.to_numeric(nodes_df['popularity'], errors='coerce'),
        )

        total_nodes = len(numeric_nodes)
        total_edges = len(edges_df)

        report = {
            'nodes_stats': {
                'total_nodes': total_nodes,
                'avg_followers': numeric_nodes['followers'].mean(),
                'avg_popularity': numeric_nodes['popularity'].mean(),
                'genre_distribution': {},
                'missing_data': numeric_nodes.isnull().sum().to_dict()
            },
            'edges_stats': {
                'total_edges': total_edges,
                'unique_artists': len(set(edges_df['id_0'].unique()) | set(edges_df['id_1'].unique())),
                # Guard against an empty node table instead of raising ZeroDivisionError.
                'avg_collaborations_per_artist': total_edges / total_nodes if total_nodes else 0.0
            }
        }

        for genre in genre_columns:
            # Native int keeps the report JSON-serializable.
            report['nodes_stats']['genre_distribution'][genre] = int(numeric_nodes[genre].sum())

        return report

# Analyzing Genre distribution

In [14]:
def analyze_genre_distribution(nodes_df: pd.DataFrame, genre_columns: List[str]) -> None:
    """Save a bar chart of per-genre column totals to 'genre_distribution.png'.

    Args:
        nodes_df: Frame holding one column per genre.
        genre_columns: Genre column names; each bar is the sum of one column.
    """
    totals = {}
    for genre in genre_columns:
        totals[genre] = nodes_df[genre].sum()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(totals.keys(), totals.values())
    # Tilt the genre names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title('Genre Distribution')
    fig.tight_layout()
    fig.savefig('genre_distribution.png')
    plt.close(fig)

# GAT MODEL implementation

In [15]:
class GAT(torch.nn.Module):
    """Graph Attention Network for link prediction.

    Two ``GATConv`` layers produce node embeddings; a small MLP ending in a
    sigmoid scores a node pair from the concatenation of its two embeddings.
    """

    def __init__(self, in_channels, hidden_channels, heads=8):
        """Build the encoder and the edge scorer.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Per-head width of the first layer and the width
                of the final node embedding.
            heads: Number of attention heads in the first layer.
        """
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        # The second layer consumes hidden_channels * heads features from conv1.
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, concat=False, dropout=0.6)
        self.link_predictor = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.6),
            torch.nn.Linear(hidden_channels, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` on graph ``edge_index``."""
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    def predict_edges(self, x, edge_index):
        """Score node pairs.

        Args:
            x: Node embeddings as returned by ``forward``.
            edge_index: Tensor with one node pair per row (two columns); this
                is the transpose of the layout passed to ``forward``.

        Returns:
            A 1-D tensor with one score in ``[0, 1]`` per pair.
        """
        row, col = edge_index.t()
        edge_rep = torch.cat([x[row], x[col]], dim=1)
        # Drop only the trailing feature axis: a bare squeeze() would turn a
        # single-pair batch into a 0-dim tensor, which callers cannot
        # concatenate or take .size(0) of.
        return self.link_predictor(edge_rep).squeeze(-1)

In [27]:
# enhanced data loading with validation
def load_data(validate: bool = True) -> Tuple:
    """Load the artist and collaboration CSVs and turn them into graph tensors.

    Reads 'nodes_cleaned.csv' and 'edges_cleaned.csv' from the working
    directory, cleans the node table, writes 'data_report.json', calls
    ``analyze_genre_distribution`` (which saves a plot), min-max scales
    ``followers`` and ``popularity``, and maps edges onto node indices.

    Args:
        validate: When True, check the node table for the required columns
            before any preprocessing.

    Returns:
        A tuple ``(nodes_df, edge_index, node_features, node_index_map,
        genre_columns)``: the cleaned node frame, a ``torch.long`` tensor with
        two rows (source indices, target indices), a float feature tensor
        (followers, popularity, then the genre columns), the
        spotify_id -> row-position mapping, and the genre column names.

    Raises:
        ValueError: If ``validate`` is True and a required column is missing.
    """
    # spotify_id is read as str; low_memory=False avoids pandas' mixed-dtype warning.
    nodes_df = pd.read_csv("nodes_cleaned.csv",
                          dtype={'spotify_id': str},
                          low_memory=False)
    edges_df = pd.read_csv("edges_cleaned.csv")

    if validate:
      print("  • Validating data structure...")
      required_columns = ['spotify_id', 'name', 'followers', 'popularity']
      if not DataValidator.validate_dataframe(nodes_df, required_columns):
          raise ValueError("Data validation failed")

    # Coerce the numeric columns first so unparseable values become NaN and are
    # removed by the dropna below.
    nodes_df['followers'] = pd.to_numeric(nodes_df['followers'], errors='coerce')
    nodes_df['popularity'] = pd.to_numeric(nodes_df['popularity'], errors='coerce')

    # drop rows with null values in critical columns
    critical_columns = ['spotify_id', 'name', 'followers', 'popularity']
    nodes_df = nodes_df.dropna(subset=critical_columns)

    # Keep one row per artist, then keep only rows whose 'unknown' flag is 0.
    # NOTE(review): an 'unknown' column is required here but is not part of the
    # validated column list, so a file without it fails with KeyError.
    nodes_df = nodes_df.drop_duplicates(subset=['spotify_id']).reset_index(drop=True)
    nodes_df = nodes_df[nodes_df['unknown'] == 0].reset_index(drop=True)

    # drop Unnamed columns if they exist
    unnamed_cols = [col for col in nodes_df.columns if 'Unnamed:' in col]
    if unnamed_cols:
        nodes_df = nodes_df.drop(columns=unnamed_cols)

    # The combined genre columns are cast to str so the .str accessor below works.
    for col in ['alternative Indie', 'folk world']:
        if col in nodes_df.columns:
            nodes_df[col] = nodes_df[col].astype(str)

    # Split each combined column into its two genre indicator columns; a genre
    # token that never occurs gets an all-zero column.
    for col_pair, combined_col in [(['alternative', 'Indie'], 'alternative Indie'),
                                   (['folk', 'world'], 'folk world')]:
        if combined_col in nodes_df.columns:
            dummy_df = nodes_df[combined_col].str.get_dummies(sep=' ')
            for col in col_pair:
                if col not in dummy_df.columns:
                    dummy_df[col] = 0
            nodes_df[col_pair] = dummy_df[col_pair].fillna(0)
            nodes_df = nodes_df.drop(columns=[combined_col])

    genre_columns = [
        'alternative', 'Indie', 'classical_orchestral', 'electronic',
        'folk', 'world', 'jazz', 'hip_hop', 'latin', 'metal', 'pop',
        'randb_Soul', 'reggae_dancehall', 'rock', 'soundtrack'
    ]

    # Generate data report
    report = DataValidator.generate_data_report(nodes_df, edges_df, genre_columns)
    with open('data_report.json', 'w') as f:
        json.dump(report, f, indent=4)

    # Visualize genre distribution
    analyze_genre_distribution(nodes_df, genre_columns)

    # Min-max scale followers and popularity to [0, 1].
    # NOTE(review): a constant column makes max - min zero, so the column becomes
    # NaN in nodes_df (node_features below replaces NaN with 0).
    for col in ['followers', 'popularity']:
        # Convert the column to numeric, coercing errors to NaN
        nodes_df[col] = pd.to_numeric(nodes_df[col], errors='coerce')
        # Fill NaN values with 0
        nodes_df[col] = nodes_df[col].fillna(0)  # NaN is treated as 0 before scaling
        # Now normalize
        nodes_df[col] = (nodes_df[col] - nodes_df[col].min()) / (nodes_df[col].max() - nodes_df[col].min())

    # Node index = row position in the cleaned frame.
    node_index_map = {spotify_id: idx for idx, spotify_id in enumerate(nodes_df['spotify_id'])}

    # Keep only edges whose two endpoints both survived the node cleaning.
    valid_edges = edges_df[
        edges_df['id_0'].isin(node_index_map) &
        edges_df['id_1'].isin(node_index_map)
    ]

    # Row 0 holds the id_0 endpoints, row 1 the id_1 endpoints.
    edge_index = torch.tensor([
        [node_index_map[id_] for id_ in valid_edges['id_0']],
        [node_index_map[id_] for id_ in valid_edges['id_1']]
    ], dtype=torch.long)

    # Feature matrix: followers, popularity, then one column per genre.
    node_features_df = nodes_df[['followers', 'popularity'] + genre_columns]
    node_features_df = node_features_df.apply(pd.to_numeric, errors='coerce').fillna(0)
    node_features = torch.tensor(node_features_df.values, dtype=torch.float)
    # Summary of how many artists carry at least one genre flag.
    print(f"Total artists: {len(nodes_df)}")
    print(f"Artists with genre info: {len(nodes_df[nodes_df[genre_columns].sum(axis=1) > 0])}")

    return nodes_df, edge_index, node_features, node_index_map, genre_columns

# Alternative GNN architectures

In [28]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional encoder with an MLP edge scorer.

    Two ``GCNConv`` layers produce node embeddings; a small MLP ending in a
    sigmoid scores a node pair from the concatenation of its two embeddings.
    """

    def __init__(self, in_channels, hidden_channels):
        """Build the encoder and the edge scorer.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Width of both convolution layers and of the
                final node embedding.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.link_predictor = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.6),
            torch.nn.Linear(hidden_channels, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` on graph ``edge_index``."""
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    def predict_edges(self, x, edge_index):
        """Score node pairs.

        Args:
            x: Node embeddings as returned by ``forward``.
            edge_index: Tensor with one node pair per row (two columns); this
                is the transpose of the layout passed to ``forward``.

        Returns:
            A 1-D tensor with one score in ``[0, 1]`` per pair.
        """
        row, col = edge_index.t()
        edge_rep = torch.cat([x[row], x[col]], dim=1)
        # Drop only the trailing feature axis: a bare squeeze() would turn a
        # single-pair batch into a 0-dim tensor, which callers cannot
        # concatenate or take .size(0) of.
        return self.link_predictor(edge_rep).squeeze(-1)

# GraphSAGE model implementation

In [29]:
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE encoder with an MLP edge scorer.

    Two ``SAGEConv`` layers produce node embeddings; a small MLP ending in a
    sigmoid scores a node pair from the concatenation of its two embeddings.
    """

    def __init__(self, in_channels, hidden_channels):
        """Build the encoder and the edge scorer.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Width of both convolution layers and of the
                final node embedding.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.link_predictor = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.6),
            torch.nn.Linear(hidden_channels, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` on graph ``edge_index``."""
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    def predict_edges(self, x, edge_index):
        """Score node pairs.

        Args:
            x: Node embeddings as returned by ``forward``.
            edge_index: Tensor with one node pair per row (two columns); this
                is the transpose of the layout passed to ``forward``.

        Returns:
            A 1-D tensor with one score in ``[0, 1]`` per pair.
        """
        row, col = edge_index.t()
        edge_rep = torch.cat([x[row], x[col]], dim=1)
        # Drop only the trailing feature axis: a bare squeeze() would turn a
        # single-pair batch into a 0-dim tensor, which callers cannot
        # concatenate or take .size(0) of.
        return self.link_predictor(edge_rep).squeeze(-1)

# Validate model architecture and analyze complexity.

In [30]:
class ModelValidator:
    """Inspect a model's size: total and per-child trainable parameter counts."""

    @staticmethod
    def count_parameters(model: torch.nn.Module) -> int:
        """Return the number of scalar parameters that require gradients."""
        trainable = 0
        for param in model.parameters():
            if param.requires_grad:
                trainable += param.numel()
        return trainable

    @staticmethod
    def analyze_model_complexity(model: torch.nn.Module) -> Dict:
        """Summarise trainable parameters for the model and each direct child.

        Returns:
            A dict with ``total_params`` and ``layers``, the latter being a
            list of ``{'name', 'params'}`` dicts in ``named_children`` order.
        """
        total = ModelValidator.count_parameters(model)
        per_layer = []
        for child_name, child in model.named_children():
            per_layer.append({
                'name': child_name,
                'params': ModelValidator.count_parameters(child)
            })
        return {'total_params': total, 'layers': per_layer}

# Tune model hyperparameters.

In [31]:
class HyperparameterTuner:
    """Tune model hyperparameters."""

    @staticmethod
    def grid_search(model_class, data, param_grid: Dict, n_splits: int = 5) -> Dict:
        """Evaluate every combination in ``param_grid`` and return the best one.

        Each combination is scored on ``n_splits`` K-fold splits of the node
        indices with ``_validate_model``; the combination with the highest
        mean score wins.

        Args:
            model_class: Model constructor accepting ``in_channels`` and
                ``hidden_channels`` keywords, and ``heads`` when the grid
                contains it.
            data: Graph object providing ``x``, ``edge_index`` and
                ``num_nodes``.
            param_grid: Mapping from parameter name to the list of values to
                try. Must contain ``in_channels`` and ``hidden_channels``;
                ``heads`` is optional so models without attention heads can
                be tuned too.
            n_splits: Number of K-fold splits.

        Returns:
            The best parameter combination. If no combination scores above
            zero, the first value of every grid entry is returned.
        """
        device = data.x.device
        # Fallback: the first value of every grid key, so the result always has
        # the same keys as a real combination.
        best_params = {name: values[0] for name, values in param_grid.items()}
        best_score = 0

        # The seeded split does not depend on the parameters, so build it once.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        folds = list(kf.split(range(data.num_nodes)))

        for params in tqdm(HyperparameterTuner._parameter_combinations(param_grid)):
            model_kwargs = {
                'in_channels': params['in_channels'],
                'hidden_channels': params['hidden_channels']
            }
            # Only attention models take `heads`; omit it when the grid has none.
            if 'heads' in params:
                model_kwargs['heads'] = params['heads']

            scores = []
            for train_idx, val_idx in folds:
                model = model_class(**model_kwargs).to(device)
                scores.append(HyperparameterTuner._validate_model(model, data, train_idx, val_idx))

            avg_score = np.mean(scores)
            if avg_score > best_score:
                best_score = avg_score
                best_params = params

        return best_params

    @staticmethod
    def _validate_model(model, data, train_idx, val_idx) -> float:
        """Score a model on the validation nodes.

        The score is the L2 norm of the embeddings of the ``val_idx`` nodes
        from a single forward pass; no training is performed and ``train_idx``
        is currently unused.

        Args:
            model: Model to score; it is switched to eval mode.
            data: Graph object providing ``x`` and ``edge_index``.
            train_idx: Training node indices (unused).
            val_idx: Validation node indices.

        Returns:
            The norm as a Python float.
        """
        device = data.x.device
        val_idx = torch.tensor(val_idx, device=device)

        # Evaluate with dropout disabled so the score is deterministic for a
        # given set of weights.
        model.eval()

        with torch.no_grad():
            embeddings = model(data.x, data.edge_index)
            score = torch.norm(embeddings[val_idx]).item()

        return score

    @staticmethod
    def _parameter_combinations(param_grid: Dict) -> List[Dict]:
        """Return every combination of the grid values as a list of dicts.

        A real list (rather than a generator) matches the annotation and lets
        ``tqdm`` show the total number of combinations.
        """
        keys = list(param_grid.keys())
        return [
            dict(zip(keys, combination))
            for combination in itertools.product(*param_grid.values())
        ]

In [32]:
def train_with_validation(model, data, train_edges, val_edges, optimizer, scheduler):
    """Run one optimisation step and one validation pass.

    For both phases an equal number of negative pairs is sampled with
    ``generate_negative_edges`` and the binary cross-entropy between the
    predicted scores and the 1/0 labels is computed. The scheduler is stepped
    with the validation loss.

    Args:
        model: Network exposing ``model(x, edge_index)`` and ``predict_edges``.
        data: Graph object providing ``x``, ``edge_index`` and ``num_nodes``.
        train_edges: Positive training pairs, one per row.
        val_edges: Positive validation pairs, one per row.
        optimizer: Optimizer over the model's parameters.
        scheduler: Scheduler whose ``step`` takes the validation loss.

    Returns:
        ``(training_loss, validation_loss)`` as Python floats.
    """
    device = next(model.parameters()).device

    def _edge_loss(node_embeddings, positive_edges, negative_edges, label_device=None):
        # Positives are scored before negatives, then labelled 1 and 0.
        positive_scores = model.predict_edges(node_embeddings, positive_edges)
        negative_scores = model.predict_edges(node_embeddings, negative_edges)
        scores = torch.cat([positive_scores, negative_scores])
        if label_device is None:
            label_device = scores.device
        targets = torch.cat([
            torch.ones(positive_scores.size(0), device=label_device),
            torch.zeros(negative_scores.size(0), device=label_device)
        ])
        return F.binary_cross_entropy(scores, targets.float())

    # --- training step ---
    model.train()
    optimizer.zero_grad()
    train_negatives = generate_negative_edges(data.num_nodes, data.edge_index, train_edges.size(0))
    train_embeddings = model(data.x, data.edge_index)
    loss = _edge_loss(train_embeddings, train_edges, train_negatives, label_device=device)
    loss.backward()
    optimizer.step()

    # --- validation pass ---
    model.eval()
    with torch.no_grad():
        val_negatives = generate_negative_edges(data.num_nodes, data.edge_index, val_edges.size(0))
        val_embeddings = model(data.x, data.edge_index)
        val_loss = _edge_loss(val_embeddings, val_edges, val_negatives)

    # The plateau scheduler reacts to the validation loss.
    scheduler.step(val_loss)

    return loss.item(), val_loss.item()

In [33]:
def evaluate_model_performance(model, data, test_edges):
    """Evaluate link prediction on ``test_edges`` against sampled negatives.

    As many negative pairs as there are test edges are sampled with
    ``generate_negative_edges``. ROC-AUC, average precision and a confusion
    matrix at a 0.5 threshold are computed, and the confusion matrix and
    precision-recall curve are saved to 'confusion_matrix.png' and
    'precision_recall_curve.png'.

    Args:
        model: Network exposing ``model(x, edge_index)`` and ``predict_edges``.
        data: Graph object providing ``x``, ``edge_index`` and ``num_nodes``.
        test_edges: Positive pairs to evaluate, one per row.

    Returns:
        A dict with ``roc_auc``, ``average_precision`` and ``confusion_matrix``.
    """
    model.eval()
    device = next(model.parameters()).device  # not used below; kept from the original

    with torch.no_grad():
        sampled_negatives = generate_negative_edges(data.num_nodes, data.edge_index, test_edges.size(0))
        node_embeddings = model(data.x, data.edge_index)

        positive_scores = model.predict_edges(node_embeddings, test_edges).cpu()
        negative_scores = model.predict_edges(node_embeddings, sampled_negatives).cpu()

        scores = torch.cat([positive_scores, negative_scores])
        targets = torch.cat([
            torch.ones(positive_scores.size(0)),
            torch.zeros(negative_scores.size(0))
        ])

        roc_auc = roc_auc_score(targets, scores)
        precision, recall, _ = precision_recall_curve(targets, scores)
        avg_precision = average_precision_score(targets, scores)
        # Scores above 0.5 count as predicted collaborations.
        conf_matrix = confusion_matrix(targets, (scores > 0.5).float())

        cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
        cm_ax.set_title('Confusion Matrix')
        cm_fig.savefig('confusion_matrix.png')
        plt.close(cm_fig)

        pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
        pr_ax.plot(recall, precision, label=f'AP={avg_precision:.2f}')
        pr_ax.set_xlabel('Recall')
        pr_ax.set_ylabel('Precision')
        pr_ax.set_title('Precision-Recall Curve')
        pr_ax.legend()
        pr_fig.savefig('precision_recall_curve.png')
        plt.close(pr_fig)

        return {
            'roc_auc': roc_auc,
            'average_precision': avg_precision,
            'confusion_matrix': conf_matrix
        }

In [34]:
def ensure_device(tensor, device):
    """Return ``tensor`` on ``device``, moving it only when it lives elsewhere."""
    already_there = tensor.device == device
    return tensor if already_there else tensor.to(device)

def analyze_genre_specific_performance(model, data, test_edges, nodes_df, genre_columns):
    """Compute ROC-AUC and F1 on the test edges that lie within each genre.

    For every genre, the test edges whose two endpoints both belong to artists
    flagged with that genre (column value ``1``) are scored against an equal
    number of negatives from ``generate_negative_edges``. Genres without such
    edges are omitted from the result.

    Args:
        model: Network exposing ``model(x, edge_index)`` and ``predict_edges``.
        data: Graph object providing ``x``, ``edge_index`` and ``num_nodes``.
        test_edges: Positive pairs, one per row.
        nodes_df: Frame with ``spotify_id`` and the genre columns.
            NOTE(review): index labels are used as node indices, which assumes
            the frame was reset to a 0..n-1 index in node order - confirm
            against the caller.
        genre_columns: Genre column names to analyse.

    Returns:
        A dict mapping genre name to ``{'roc_auc': ..., 'f1': ...}``.
    """
    model.eval()
    genre_performance = {}

    # Resolve each spotify_id to the index label of its first row once, instead
    # of scanning the whole frame again for every artist of every genre.
    first_row_by_id = {}
    for row_label, spotify_id in zip(nodes_df.index, nodes_df['spotify_id']):
        first_row_by_id.setdefault(spotify_id, row_label)

    with torch.no_grad():
        embeddings = model(data.x, data.edge_index)

        for genre in genre_columns:
            genre_artists = nodes_df.loc[nodes_df[genre] == 1, 'spotify_id']
            genre_indices = [first_row_by_id[aid] for aid in genre_artists]
            genre_nodes = torch.tensor(genre_indices, dtype=torch.long, device=test_edges.device)

            # Membership test per endpoint; avoids materialising an
            # (edges x artists) comparison matrix.
            in_genre = (
                torch.isin(test_edges[:, 0], genre_nodes) &
                torch.isin(test_edges[:, 1], genre_nodes)
            )
            genre_edges = test_edges[in_genre]

            if len(genre_edges) > 0:
                neg_edges = generate_negative_edges(data.num_nodes, data.edge_index, genre_edges.size(0))

                # reshape(-1) keeps a single-edge prediction 1-D so cat/size work.
                pos_pred = model.predict_edges(embeddings, genre_edges).cpu().reshape(-1)
                neg_pred = model.predict_edges(embeddings, neg_edges).cpu().reshape(-1)

                pred = torch.cat([pos_pred, neg_pred])
                labels = torch.cat([
                    torch.ones(pos_pred.size(0)),
                    torch.zeros(neg_pred.size(0))
                ])

                genre_performance[genre] = {
                    'roc_auc': roc_auc_score(labels, pred),
                    'f1': f1_score(labels, (pred > 0.5).float())
                }

    return genre_performance

In [35]:
def train_with_cross_validation(model_class, data, k_folds=5):
    """Train a fresh model per fold of a K-fold split over the edges.

    Each fold trains for 200 epochs with ``train_with_validation``. Every tenth
    epoch the model is evaluated on the fold's validation edges, and the
    weights with the best ROC-AUC so far are saved to
    'best_model_fold_{fold}.pth'.

    Args:
        model_class: Constructor accepting ``in_channels`` and
            ``hidden_channels`` keywords.
        data: Graph object providing ``x``, ``edge_index`` and
            ``num_node_features``.
        k_folds: Number of folds.

    Returns:
        ``(mean, std)`` of the best validation ROC-AUC across folds.
    """
    # The model must live on the same device as the graph tensors; otherwise
    # the first forward pass fails when the data is on a GPU.
    device = data.x.device
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    all_edges = data.edge_index.t().contiguous()
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(range(len(all_edges)))):
        logging.info(f"Starting fold {fold + 1}/{k_folds}")

        train_edges = all_edges[train_idx]
        val_edges = all_edges[val_idx]

        model = model_class(in_channels=data.num_node_features, hidden_channels=64).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

        best_val_score = 0
        for epoch in range(200):
            loss, val_loss = train_with_validation(model, data, train_edges, val_edges, optimizer, scheduler)

            # Full evaluation is comparatively expensive, so run it every 10 epochs.
            if epoch % 10 == 0:
                val_metrics = evaluate_model_performance(model, data, val_edges)
                val_score = val_metrics['roc_auc']

                if val_score > best_val_score:
                    best_val_score = val_score
                    torch.save(model.state_dict(), f'best_model_fold_{fold}.pth')

                logging.info(f"Fold {fold + 1}, Epoch {epoch}: Loss = {loss:.4f}, Val Loss = {val_loss:.4f}, Val ROC-AUC = {val_score:.4f}")

        cv_scores.append(best_val_score)
        logging.info(f"Fold {fold + 1} completed. Best validation ROC-AUC: {best_val_score:.4f}")

    return np.mean(cv_scores), np.std(cv_scores)

In [36]:
def save_model(model, path):
    """Write the model's ``state_dict`` (weights only, not the module) to ``path``."""
    weights = model.state_dict()
    torch.save(weights, path)
def load_model(model, path, device):
    """Load a saved ``state_dict`` from ``path`` into ``model`` and return the model.

    Tensors are mapped onto ``device`` while loading.
    """
    weights = torch.load(path, map_location=device)
    model.load_state_dict(weights)
    return model

def compare_model_architectures(data, train_edges, val_edges, test_edges):
    """Train GAT, GCN and GraphSAGE with the same settings and compare them on the test edges.

    Each model trains for 200 epochs. Every tenth epoch it is evaluated on the
    validation edges and the best-scoring weights are saved to
    'best_{name}_model.pth'; those weights are reloaded for the final test
    evaluation.

    Args:
        data: Graph object providing ``x``, ``edge_index`` and
            ``num_node_features``.
        train_edges: Positive training pairs, one per row.
        val_edges: Positive validation pairs, one per row.
        test_edges: Positive test pairs, one per row.

    Returns:
        A dict mapping model name to the metrics dict returned by
        ``evaluate_model_performance`` on the test edges.
    """
    device = data.x.device
    # All three models are built up front, in this order, before any training.
    candidates = {
        'GAT': GAT(in_channels=data.num_node_features, hidden_channels=64).to(device),
        'GCN': GCN(in_channels=data.num_node_features, hidden_channels=64).to(device),
        'GraphSAGE': GraphSAGE(in_channels=data.num_node_features, hidden_channels=64).to(device)
    }

    results = {}
    for name, model in candidates.items():
        logging.info(f"Training {name} model...")
        model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        checkpoint_path = f'best_{name}_model.pth'

        best_val_score = 0
        for epoch in range(200):
            loss, val_loss = train_with_validation(model, data, train_edges, val_edges, optimizer, scheduler)

            # Only every tenth epoch is evaluated and considered for checkpointing.
            if epoch % 10 != 0:
                continue

            val_score = evaluate_model_performance(model, data, val_edges)['roc_auc']
            if val_score > best_val_score:
                best_val_score = val_score
                save_model(model, checkpoint_path)

        # Score the best checkpoint rather than the last-epoch weights.
        model = load_model(model, checkpoint_path, device)
        results[name] = evaluate_model_performance(model, data, test_edges)

    return results

# Main function without TSNE visualization:

In [37]:
def main():
    """Run the end-to-end pipeline: load, tune, train, evaluate, predict.

    Steps, in order:
      1. Pick the device and load/validate the dataset via ``load_data``.
      2. Split the edge list 70/15/15 into train/validation/test edges.
      3. Grid-search GAT hyperparameters, then train a GAT for 200 epochs,
         checkpointing to ``best_model_final.pth`` whenever the validation
         ROC-AUC (measured every 10th epoch) improves.
      4. Restore the best checkpoint written by this run, report test and
         per-genre metrics, and print predicted new collaborations.

    Progress is printed to stdout; selected metrics are also sent to ``logging``.
    """
    print("\n=== Starting Project Requirements Tracking ===")
    # Set up device first
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    print(f"\n✓ Requirement 1.1: Device Setup Complete - Using {device}")

    # Load and validate data
    print("\n--- Starting Data Loading and Validation ---")
    nodes_df, edge_index, node_features, node_index_map, genre_columns = load_data(validate=True)
    print("✓ Requirement 1.2: Data Loading and Validation Complete")
    print(f"  • Loaded {len(nodes_df)} artists")
    print(f"  • Processed {edge_index.size(1)} collaborations")
    print(f"  • Extracted {len(genre_columns)} genre features")

    # Move data to device immediately after creation
    # NOTE(review): `data.edge_index` holds every edge, including the ones
    # held out below for validation/testing. If the training/evaluation
    # helpers run message passing over `data.edge_index`, held-out edges are
    # visible to the model — confirm whether that is intended.
    data = Data(
        x=node_features.to(device),
        edge_index=edge_index.to(device)
    )
    print("✓ Requirement 1.3: Data Movement to Device Complete")

    # Split data: one row per edge, 70% train, then the rest halved into val/test
    print("\n--- Starting Data Splitting ---")
    all_edges = edge_index.t().contiguous()

    train_edges, temp_edges = train_test_split(all_edges, test_size=0.3, random_state=42)
    val_edges, test_edges = train_test_split(temp_edges, test_size=0.5, random_state=42)

    # Move all edges to device
    train_edges = train_edges.to(device)
    val_edges = val_edges.to(device)
    test_edges = test_edges.to(device)
    print("✓ Requirement 2.1: Data Splitting Complete")
    print(f"  • Training edges: {len(train_edges)}")
    print(f"  • Validation edges: {len(val_edges)}")
    print(f"  • Test edges: {len(test_edges)}")

    # Hyperparameter tuning
    print("\n--- Starting Hyperparameter Tuning ---")
    param_grid = {
        'in_channels': [data.num_node_features],
        'hidden_channels': [32, 64, 128],
        'heads': [4, 8, 16]
    }
    best_params = HyperparameterTuner.grid_search(GAT, data, param_grid)
    print("✓ Requirement 2.2: Hyperparameter Tuning Complete")
    print(f"  • Best parameters found: {best_params}")
    logging.info(f"Best hyperparameters: {best_params}")

    # Initialize model with best parameters
    print("\n--- Starting Model Training ---")
    model = GAT(
        in_channels=data.num_node_features,
        hidden_channels=best_params['hidden_channels'],
        heads=best_params['heads']
    ).to(device)
    print("✓ Requirement 3.1: Model Initialization Complete")

    # Optimizer and LR scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    print("✓ Requirement 3.2: Optimizer and Scheduler Setup Complete")

    # Initialize lists to store metrics
    print("\n--- Starting Training Loop ---")
    loss_history = []
    val_loss_history = []
    roc_auc_history = []

    best_val_score = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale
    # best_model_final.pth from an earlier run (possibly with a different
    # architecture) is never loaded by mistake.
    checkpoint_saved = False
    for epoch in range(1, 201):
        loss, val_loss = train_with_validation(model, data, train_edges, val_edges, optimizer, scheduler)
        loss_history.append(loss)
        val_loss_history.append(val_loss)

        if epoch % 10 == 0:
            val_metrics = evaluate_model_performance(model, data, val_edges)
            val_score = val_metrics['roc_auc']
            roc_auc_history.append(val_score)

            if val_score > best_val_score:
                best_val_score = val_score
                save_model(model, 'best_model_final.pth')
                checkpoint_saved = True

            logging.info(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val ROC-AUC: {val_score:.4f}")
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val ROC-AUC: {val_score:.4f}")

    print("✓ Requirement 3.3: Model Training Complete")

    # Plot training metrics
    print("\n--- Starting Visualization Generation ---")
    plot_training_metrics(loss_history, val_loss_history, roc_auc_history)
    print("✓ Requirement 4.1: Training Metrics Visualization Complete")

    # Load best model and evaluate
    print("\n--- Starting Final Model Evaluation ---")
    if checkpoint_saved:
        model = load_model(model, 'best_model_final.pth', device)
    else:
        # No evaluation beat the initial threshold (e.g. the score was NaN or
        # 0), so nothing was saved: keep the final-epoch weights rather than
        # failing on a missing file or restoring an unrelated one.
        logging.warning("No checkpoint was saved during training; evaluating final-epoch weights.")
    test_metrics = evaluate_model_performance(model, data, test_edges)
    print("✓ Requirement 4.2: Model Evaluation Complete")
    logging.info(f"Final test metrics: {test_metrics}")
    print(f"Final test metrics: {test_metrics}")

    # Genre-specific performance analysis
    print("\n--- Starting Genre Analysis ---")
    genre_performance = analyze_genre_specific_performance(model, data, test_edges, nodes_df, genre_columns)
    logging.info(f"Genre-specific performance: {genre_performance}")

    # Plot genre performance
    plot_genre_performance(genre_performance)
    print("✓ Requirement 4.3: Genre Analysis Complete")

    print("\nDetailed Performance Metrics:")
    print(f"ROC-AUC Score: {test_metrics['roc_auc']:.4f}")
    print(f"Average Precision: {test_metrics['average_precision']:.4f}")
    print("\nConfusion Matrix:")
    print(test_metrics['confusion_matrix'])

    # Add genre performance summary
    print("\nGenre-wise Performance:")
    for genre, metrics in genre_performance.items():
        print(f"{genre:15} - ROC-AUC: {metrics['roc_auc']:.4f}, F1: {metrics['f1']:.4f}")

    # Predict new collaborations
    print("\n--- Starting Collaboration Prediction ---")
    print("\nTop predicted collaborations:")
    predictions = predict_new_collaborations(model, data, nodes_df, node_index_map)
    for pred in predictions:
        print(f"Artist 1: {pred['artist1']}, Artist 2: {pred['artist2']}, Score: {pred['score']:.4f}")

    print("\nTraining complete. Check the following files for visualizations:")
    print("1. training_metrics.png - Training progress visualization")
    print("2. genre_distribution.png - Distribution of genres in the dataset")
    print("3. genre_performance.png - Model performance across different genres")
    print("4. confusion_matrix.png - Confusion matrix for test predictions")
    print("5. precision_recall_curve.png - Precision-Recall curve")

# Entry point: runs the pipeline defined by `main` above when this cell is
# executed as the top-level module (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()


=== Starting Project Requirements Tracking ===
Using device: cpu

✓ Requirement 1.1: Device Setup Complete - Using cpu

--- Starting Data Loading and Validation ---
  • Validating data structure...
Total artists: 120353
Artists with genre info: 25576
✓ Requirement 1.2: Data Loading and Validation Complete
  • Loaded 120353 artists
  • Processed 111799 collaborations
  • Extracted 15 genre features
✓ Requirement 1.3: Data Movement to Device Complete

--- Starting Data Splitting ---
✓ Requirement 2.1: Data Splitting Complete
  • Training edges: 78259
  • Validation edges: 16770
  • Test edges: 16770

--- Starting Hyperparameter Tuning ---


9it [00:56,  6.24s/it]


✓ Requirement 2.2: Hyperparameter Tuning Complete
  • Best parameters found: {'in_channels': 17, 'hidden_channels': 128, 'heads': 4}

--- Starting Model Training ---
✓ Requirement 3.1: Model Initialization Complete
✓ Requirement 3.2: Optimizer and Scheduler Setup Complete

--- Starting Training Loop ---
Epoch: 010, Loss: 0.5745, Val Loss: 0.5586, Val ROC-AUC: 0.8711
Epoch: 020, Loss: 0.5624, Val Loss: 0.5557, Val ROC-AUC: 0.8601
Epoch: 030, Loss: 0.5573, Val Loss: 0.5460, Val ROC-AUC: 0.8533
Epoch: 040, Loss: 0.5552, Val Loss: 0.5499, Val ROC-AUC: 0.8496
Epoch: 050, Loss: 0.5498, Val Loss: 0.5503, Val ROC-AUC: 0.8500
Epoch: 060, Loss: 0.5486, Val Loss: 0.5471, Val ROC-AUC: 0.8520
Epoch: 070, Loss: 0.5487, Val Loss: 0.5511, Val ROC-AUC: 0.8494
Epoch: 080, Loss: 0.5507, Val Loss: 0.5491, Val ROC-AUC: 0.8523
Epoch: 090, Loss: 0.5506, Val Loss: 0.5490, Val ROC-AUC: 0.8515
Epoch: 100, Loss: 0.5501, Val Loss: 0.5486, Val ROC-AUC: 0.8517
Epoch: 110, Loss: 0.5478, Val Loss: 0.5488, Val ROC-AUC

# Main function with t-SNE and UMAP visualization (optional):

In [38]:
def main():
    """Run the training pipeline with degree, t-SNE and UMAP visualizations.

    Steps, in order:
      1. Pick the device, load/validate the dataset via ``load_data`` and plot
         the degree distribution.
      2. Split the edge list 70/15/15 into train/validation/test edges.
      3. Build a GAT (64 hidden channels, 8 heads) and plot t-SNE/UMAP views of
         its untrained embeddings.
      4. Train for 200 epochs, checkpointing to ``best_model_final.pth``
         whenever the validation ROC-AUC (measured every 10th epoch) improves.
      5. Restore the best checkpoint written by this run, evaluate on the test
         edges, plot t-SNE/UMAP of the trained embeddings, analyze per-genre
         performance, and print predicted new collaborations.
    """

    def _snapshot_embeddings(net, graph):
        """Return node embeddings computed in eval mode, then restore the prior mode.

        Without switching to eval mode, a freshly built module is still in
        training mode, so any train-only layers (presumably dropout — confirm
        against the model definition) would add noise to the visualized
        embeddings. The previous mode is restored so later training or
        evaluation code is unaffected.
        """
        was_training = net.training
        net.eval()
        try:
            with torch.no_grad():
                return net(graph.x, graph.edge_index)
        finally:
            net.train(was_training)

    print("\n=== Starting Project Requirements Tracking ===")

    # Set up device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load and validate data
    print("\n--- Loading and Validating Data ---")
    nodes_df, edge_index, node_features, node_index_map, genre_columns = load_data(validate=True)

    # Move data to device
    # NOTE(review): `data.edge_index` holds every edge, including the ones
    # held out below for validation/testing. If the training/evaluation
    # helpers run message passing over `data.edge_index`, held-out edges are
    # visible to the model — confirm whether that is intended.
    data = Data(
        x=node_features.to(device),
        edge_index=edge_index.to(device)
    )

    # Plot Degree Distribution
    print("\n--- Visualizing Graph Degree Distribution ---")
    plot_degree_distribution(edge_index, filename="degree_distribution.png")
    print("✓ Degree Distribution Plot Saved.")

    # Split Data (Train, Validation, Test): 70% train, the rest halved
    print("\n--- Splitting Data ---")
    all_edges = edge_index.t().contiguous()

    train_edges, temp_edges = train_test_split(all_edges, test_size=0.3, random_state=42)
    val_edges, test_edges = train_test_split(temp_edges, test_size=0.5, random_state=42)

    # Move edges to device
    train_edges = train_edges.to(device)
    val_edges = val_edges.to(device)
    test_edges = test_edges.to(device)

    # Initialize Model
    print("\n--- Initializing Model ---")
    model = GAT(
        in_channels=data.num_node_features,
        hidden_channels=64,
        heads=8
    ).to(device)

    # Store Initial Embeddings for Visualization
    print("\n--- Generating Initial Embedding Visualization ---")
    initial_embeddings = _snapshot_embeddings(model, data)

    visualize_tsne_embeddings(initial_embeddings, nodes_df, genre_columns, filename="tsne_initial.png")
    visualize_umap_embeddings(initial_embeddings, nodes_df, genre_columns, filename="umap_initial.png")
    print("✓ Initial t-SNE and UMAP Visualizations Saved.")

    # Set up optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    # Train Model
    print("\n--- Training Model ---")
    loss_history = []
    val_loss_history = []
    roc_auc_history = []

    best_val_score = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale
    # best_model_final.pth from an earlier run (possibly with a different
    # architecture) is never loaded by mistake.
    checkpoint_saved = False
    for epoch in range(1, 201):
        loss, val_loss = train_with_validation(model, data, train_edges, val_edges, optimizer, scheduler)
        loss_history.append(loss)
        val_loss_history.append(val_loss)

        if epoch % 10 == 0:
            val_metrics = evaluate_model_performance(model, data, val_edges)
            val_score = val_metrics['roc_auc']
            roc_auc_history.append(val_score)

            if val_score > best_val_score:
                best_val_score = val_score
                save_model(model, 'best_model_final.pth')
                checkpoint_saved = True

            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val ROC-AUC: {val_score:.4f}")

    print("✓ Training Completed.")

    # Plot Training Metrics
    print("\n--- Generating Training Metrics Visualization ---")
    plot_training_metrics(loss_history, val_loss_history, roc_auc_history)
    print("✓ Training Metrics Visualization Saved.")

    # Load Best Model & Evaluate
    print("\n--- Evaluating Final Model ---")
    if checkpoint_saved:
        model = load_model(model, 'best_model_final.pth', device)
    else:
        # No evaluation beat the initial threshold (e.g. the score was NaN or
        # 0), so nothing was saved: keep the final-epoch weights rather than
        # failing on a missing file or restoring an unrelated one.
        logging.warning("No checkpoint was saved during training; evaluating final-epoch weights.")
    test_metrics = evaluate_model_performance(model, data, test_edges)
    print("✓ Model Evaluation Completed.")

    # Generate Final Embeddings & Visualizations
    print("\n--- Generating Final Embedding Visualization ---")
    trained_embeddings = _snapshot_embeddings(model, data)

    visualize_tsne_embeddings(trained_embeddings, nodes_df, genre_columns, filename="tsne_trained.png")
    visualize_umap_embeddings(trained_embeddings, nodes_df, genre_columns, filename="umap_trained.png")
    print("✓ Final t-SNE and UMAP Visualizations Saved.")

    # Analyze Genre-Specific Performance
    print("\n--- Analyzing Genre-Specific Performance ---")
    genre_performance = analyze_genre_specific_performance(model, data, test_edges, nodes_df, genre_columns)
    plot_genre_performance(genre_performance)
    print("✓ Genre Analysis Completed.")

    # Predict New Collaborations
    print("\n--- Predicting New Collaborations ---")
    predictions = predict_new_collaborations(model, data, nodes_df, node_index_map)

    for pred in predictions[:10]:  # Show only top 10
        print(f"Artist 1: {pred['artist1']}, Artist 2: {pred['artist2']}, Score: {pred['score']:.4f}")

    print("\nTraining complete. Check the following files for visualizations:")
    print("1. training_metrics.png - Training progress visualization")
    print("2. degree_distribution.png - Node degree distribution")
    print("3. genre_distribution.png - Distribution of genres in the dataset")
    print("4. genre_performance.png - Model performance across different genres")
    print("5. confusion_matrix.png - Confusion matrix for test predictions")
    print("6. precision_recall_curve.png - Precision-Recall curve")
    print("7. tsne_initial.png - Initial t-SNE of node embeddings")
    print("8. tsne_trained.png - Final t-SNE of node embeddings")
    print("9. umap_initial.png - Initial UMAP of node embeddings")
    print("10. umap_trained.png - Final UMAP of node embeddings")

# Entry point: runs the `main` defined in this cell, which rebinds the name
# used by the earlier pipeline cell, when executed as the top-level module.
if __name__ == "__main__":
    main()


=== Starting Project Requirements Tracking ===
Using device: cpu

--- Loading and Validating Data ---
  • Validating data structure...
Total artists: 120353
Artists with genre info: 25576

--- Visualizing Graph Degree Distribution ---
✓ Degree Distribution Plot Saved.

--- Splitting Data ---

--- Initializing Model ---

--- Generating Initial Embedding Visualization ---
✓ Initial t-SNE and UMAP Visualizations Saved.

--- Training Model ---
Epoch: 010, Loss: 0.5923, Val Loss: 0.5426, Val ROC-AUC: 0.8692
Epoch: 020, Loss: 0.5741, Val Loss: 0.5408, Val ROC-AUC: 0.8809
Epoch: 030, Loss: 0.5643, Val Loss: 0.5516, Val ROC-AUC: 0.8684
Epoch: 040, Loss: 0.5568, Val Loss: 0.5405, Val ROC-AUC: 0.8669
Epoch: 050, Loss: 0.5635, Val Loss: 0.5422, Val ROC-AUC: 0.8653
Epoch: 060, Loss: 0.5589, Val Loss: 0.5438, Val ROC-AUC: 0.8622
Epoch: 070, Loss: 0.5581, Val Loss: 0.5436, Val ROC-AUC: 0.8613
Epoch: 080, Loss: 0.5632, Val Loss: 0.5431, Val ROC-AUC: 0.8654
Epoch: 090, Loss: 0.5628, Val Loss: 0.5432,