In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
import random

#  Spotify ID, Popularity, Followers, Genre columns
nodes_df = pd.read_csv("nodes_cleaned.csv") 
#  Spotify ID 1, Spotify ID 2
edges_df = pd.read_csv("edges_cleaned.csv") 


In [3]:
#We are just resetting the index and will raise an value error if we found duplicate ids
if nodes_df['spotify_id'].duplicated().any():
    nodes_df = nodes_df.drop_duplicates(subset=['spotify_id']).reset_index(drop=True)
if nodes_df['spotify_id'].duplicated().any():
    raise ValueError("Duplicate Spotify IDs found in nodes_df.")

In [None]:

# Defining genre columns Total 14 attributes including unknown
genre_columns = [
    'alternative Indie', 'classical_orchestral', 'electronic', 'folk world',
    'hazz', 'hip_hop', 'latin', 'metal', 'pop', 'randb_Soul',
    'reggae_dancehall', 'rock', 'soundtrack', 'unknown'
]
# Mapping Spotify IDs to indices for graph construction
node_index_map = {spotify_id: idx for idx, spotify_id in enumerate(nodes_df['spotify_id'])}
edges_df['Source'] = edges_df['id_0'].map(node_index_map)
edges_df['Target'] = edges_df['id_1'].map(node_index_map)


# Creating edge index (two rows: source and target nodes)
edge_index = torch.tensor(edges_df[['Source', 'Target']].to_numpy().T, dtype=torch.long)

# Creating node features
node_features = torch.tensor(
    nodes_df[['popularity', 'followers'] + genre_columns].to_numpy(),
    dtype=torch.float
)



In [None]:

# function to generate negative edges
def generate_negative_edges(num_nodes, existing_edges, num_samples):
    """
    Generate negative edges by sampling random node pairs that are not connected.
    """
    existing_edges_set = set(map(tuple, existing_edges.T.tolist()))  # Convert to set for fast lookup
    negative_edges = set()

    while len(negative_edges) < num_samples:
        i, j = np.random.randint(0, num_nodes, size=2)
        if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
            negative_edges.add((i, j))

    return torch.tensor(list(negative_edges), dtype=torch.long)

# Generating negative edges
num_negative_samples = len(edges_df)
negative_edges = generate_negative_edges(len(nodes_df), edge_index, num_negative_samples)



In [None]:
# Combining positive and negative edges for training
positive_edges = edge_index.T
all_edges = torch.cat([positive_edges, negative_edges], dim=0)
labels = torch.cat([torch.ones(len(positive_edges)), torch.zeros(len(negative_edges))])

# Train-test split for edges
train_edges, test_edges, train_labels, test_labels = train_test_split(
    all_edges, labels, test_size=0.2, random_state=42
)

# Converting to PyTorch Geometric Data
data = Data(x=node_features, edge_index=edge_index)



In [None]:
# Defining GraphSAGE Model for Link Prediction
class GraphSAGE(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        # Combining two embeddings
        self.edge_predictor = torch.nn.Linear(hidden_dim * 2, 1) 

    def forward(self, data):
        x = F.relu(self.conv1(data.x, data.edge_index))
        x = F.relu(self.conv2(x, data.edge_index))
        return x

    def predict_edges(self, x, edges):
        # Combine node embeddings for edge prediction
        edge_embeds = torch.cat([x[edges[:, 0]], x[edges[:, 1]]], dim=1)
        return torch.sigmoid(self.edge_predictor(edge_embeds)).squeeze()

# Initializing model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(data.num_node_features, hidden_dim=32).to(device)
data = data.to(device)
train_edges, train_labels = train_edges.to(device), train_labels.to(device)
test_edges, test_labels = test_edges.to(device), test_labels.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    embeddings = model(data)
    pred = model.predict_edges(embeddings, train_edges)
    loss = F.binary_cross_entropy(pred, train_labels.float())
    loss.backward()
    optimizer.step()
    return loss.item()



In [8]:
def test():
    model.eval()
    with torch.no_grad():
        embeddings = model(data)
        pred = model.predict_edges(embeddings, test_edges)
        pred_labels = (pred > 0.5).float()
        accuracy = (pred_labels == test_labels.float()).sum() / len(test_labels)
    return accuracy.item()



In [9]:
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        acc = test()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')


Epoch 010, Loss: 0.4056, Test Accuracy: 0.8508
Epoch 020, Loss: 0.2928, Test Accuracy: 0.8919
Epoch 030, Loss: 0.2444, Test Accuracy: 0.9144
Epoch 040, Loss: 0.2219, Test Accuracy: 0.9221
Epoch 050, Loss: 0.2124, Test Accuracy: 0.9263
Epoch 060, Loss: 0.2068, Test Accuracy: 0.9281
Epoch 070, Loss: 0.2030, Test Accuracy: 0.9303
Epoch 080, Loss: 0.2002, Test Accuracy: 0.9322
Epoch 090, Loss: 0.1978, Test Accuracy: 0.9334
Epoch 100, Loss: 0.1956, Test Accuracy: 0.9342
Epoch 110, Loss: 0.1986, Test Accuracy: 0.9353
Epoch 120, Loss: 0.1924, Test Accuracy: 0.9364
Epoch 130, Loss: 0.1918, Test Accuracy: 0.9366
Epoch 140, Loss: 0.1903, Test Accuracy: 0.9363
Epoch 150, Loss: 0.1891, Test Accuracy: 0.9368
Epoch 160, Loss: 0.1881, Test Accuracy: 0.9377
Epoch 170, Loss: 0.1872, Test Accuracy: 0.9382
Epoch 180, Loss: 0.1863, Test Accuracy: 0.9386
Epoch 190, Loss: 0.1855, Test Accuracy: 0.9390
Epoch 200, Loss: 0.1848, Test Accuracy: 0.9396


In [None]:
import random

# Converting existing edges to a set for fast lookup
existing_edges_set = set(map(tuple, edge_index.T.tolist())) 

# Number of unseen pairs to sample
num_samples = 100
unseen_pairs = []

# Randomly sampling node pairs and check if they're not connected
while len(unseen_pairs) < num_samples:
    i, j = random.sample(range(len(nodes_df)), 2)
    
    # Ensuring the pair is not already an existing edge
    if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
        unseen_pairs.append((i, j))

# Converting to tensor
unseen_pairs = torch.tensor(unseen_pairs, dtype=torch.long).to(device)


In [None]:
# Lets evaluate the model
model.eval()
with torch.no_grad():
    embeddings = model(data)
    # predicting for unseen pairs
    predictions = model.predict_edges(embeddings, unseen_pairs) 


In [None]:
# Ranking unseen pairs by predicted score
sorted_indices = torch.argsort(predictions, descending=True)
top_predictions = unseen_pairs[sorted_indices]
top_scores = predictions[sorted_indices]

# Converting back to artist IDs for better interpretability
top_collaborations = [
    (nodes_df.iloc[i]['spotify_id'], nodes_df.iloc[j]['spotify_id'], score.item())
    for (i, j), score in zip(top_predictions.tolist(), top_scores)
]

# Displaying top 10 predicted collaborations with artist names for now
for pair in top_collaborations[:10]:
    spotify_id_1, spotify_id_2 = pair[0], pair[1]
    
    # Searching for the Spotify ID in the nodes_df and print the respective artist names
    artist_1_name = nodes_df[nodes_df['spotify_id'] == spotify_id_1]['name'].values[0]
    artist_2_name = nodes_df[nodes_df['spotify_id'] == spotify_id_2]['name'].values[0]
    
    print(f"Artist 1: {artist_1_name}, Artist 2: {artist_2_name}, Predicted Score: {pair[2]:.4f}")

Artist 1: Arsh Sandhu, Artist 2: Petit Biscuit, Predicted Score: 0.9785
Artist 1: prodbypengg, Artist 2: SHAED, Predicted Score: 0.9457
Artist 1: Jheo Chavoso, Artist 2: PATAY, Predicted Score: 0.9337
Artist 1: Bendik HK, Artist 2: Mnogoznaal, Predicted Score: 0.7793
Artist 1: Start, Artist 2: Wolf, Predicted Score: 0.7744
Artist 1: 5ALVO, Artist 2: Tobee, Predicted Score: 0.7499
Artist 1: Alfred Hui, Artist 2: Billian LLD, Predicted Score: 0.7434
Artist 1: A Si, Artist 2: LO'99, Predicted Score: 0.7151
Artist 1: 031CHOPPA, Artist 2: Diamante 0.1, Predicted Score: 0.6917
Artist 1: Masta Killa, Artist 2: Dawn McCarthy, Predicted Score: 0.6311
