In [1]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
import random

# Including columns: Spotify ID, Popularity, Followers, Genre columns
nodes_df = pd.read_csv("nodes_cleaned.csv")
# Including columns: Spotify ID 1, Spotify ID 2  
edges_df = pd.read_csv("edges_cleaned.csv")


  nodes_df = pd.read_csv("nodes_cleaned.csv")


In [2]:
nodes_df.dtypes


spotify_id               object
name                     object
followers                object
popularity              float64
alternative Indie       float64
classical_orchestral      int64
electronic                int64
folk world                int64
jazz                      int64
hip_hop                   int64
latin                     int64
metal                     int64
pop                       int64
randb_Soul                int64
reggae_dancehall          int64
rock                      int64
soundtrack                int64
unknown                   int64
Unnamed: 18             float64
dtype: object

In [3]:
edges_df.dtypes


id_0        object
id_1        object
artist_0    object
artist_1    object
dtype: object

In [4]:
# Converting 'followers' to numeric, setting invalid entries to NaN
nodes_df['followers'] = pd.to_numeric(nodes_df['followers'], errors='coerce')

# Droping unnecessary column
if 'Unnamed: 18' in nodes_df.columns:
    nodes_df.drop(columns=['Unnamed: 18'], inplace=True)

# Verify the types
print(nodes_df.dtypes)


spotify_id               object
name                     object
followers               float64
popularity              float64
alternative Indie       float64
classical_orchestral      int64
electronic                int64
folk world                int64
jazz                      int64
hip_hop                   int64
latin                     int64
metal                     int64
pop                       int64
randb_Soul                int64
reggae_dancehall          int64
rock                      int64
soundtrack                int64
unknown                   int64
dtype: object


In [5]:
# Lets drop the dupliates and also raise an exception if found.
if nodes_df['spotify_id'].duplicated().any():
    nodes_df = nodes_df.drop_duplicates(subset=['spotify_id']).reset_index(drop=True)
if nodes_df['spotify_id'].duplicated().any():
    raise ValueError("Duplicate Spotify IDs found in nodes_df.")



In [6]:

# Defining genre columns
genre_columns = [
    'alternative Indie', 'classical_orchestral', 'electronic', 'folk world',
    'jazz', 'hip_hop', 'latin', 'metal', 'pop', 'randb_Soul',
    'reggae_dancehall', 'rock', 'soundtrack', 'unknown'
]
# Mapping Spotify IDs to indices for graph construction
node_index_map = {spotify_id: idx for idx, spotify_id in enumerate(nodes_df['spotify_id'])}
edges_df['Source'] = edges_df['id_0'].map(node_index_map)
edges_df['Target'] = edges_df['id_1'].map(node_index_map)


# Creating edge index (two rows: source and target nodes)
edge_index = torch.tensor(edges_df[['Source', 'Target']].to_numpy().T, dtype=torch.long)

# Creating node features
node_features = torch.tensor(
    nodes_df[['popularity', 'followers'] + genre_columns].to_numpy(),
    dtype=torch.float
)



In [13]:

# function to generate negative edges
def generate_negative_edges(num_nodes, existing_edges, num_samples):
    """
    Generate negative edges by sampling random node pairs that are not connected.
    """
    existing_edges_set = set(map(tuple, existing_edges.T.tolist())) 
    negative_edges = set()

    while len(negative_edges) < num_samples:
        i, j = np.random.randint(0, num_nodes, size=2)
        # Avoid self-loops and duplicates
        if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
            negative_edges.add((i, j))
     # Transpose for edge_index format
    return torch.tensor(list(negative_edges), dtype=torch.long).T 



In [8]:
# Combining positive and negative edges for training
positive_edges = edge_index

# Combine edges
all_edges = torch.cat([positive_edges, negative_edges], dim=1)  # Combine along columns
labels = torch.cat([torch.ones(positive_edges.size(1)), torch.zeros(negative_edges.size(1))])

# Train-test split for edges
train_edges, test_edges, train_labels, test_labels = train_test_split(
    all_edges, labels, test_size=0.2, random_state=42
)

# Converting to PyTorch Geometric Data
data = Data(x=node_features, edge_index=edge_index)



In [9]:
# Defining GraphSAGE Model for Link Prediction
class GraphSAGE(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        # Combining two embeddings for predictor
        self.edge_predictor = torch.nn.Linear(hidden_dim * 2, 1) 

    def forward(self, data):
        x = F.relu(self.conv1(data.x, data.edge_index))
        x = F.relu(self.conv2(x, data.edge_index))
        return x

    def predict_edges(self, x, edges):
        # Combine node embeddings for edge prediction
        edge_embeds = torch.cat([x[edges[:, 0]], x[edges[:, 1]]], dim=1)
        return torch.sigmoid(self.edge_predictor(edge_embeds)).squeeze()

# Initializing model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(data.num_node_features, hidden_dim=32).to(device)
data = data.to(device)
train_edges, train_labels = train_edges.to(device), train_labels.to(device)
test_edges, test_labels = test_edges.to(device), test_labels.to(device)
# LEarning rate is set to 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    embeddings = model(data)
    pred = model.predict_edges(embeddings, train_edges)
    loss = F.binary_cross_entropy(pred, train_labels.float())
    loss.backward()
    optimizer.step()
    return loss.item()



In [12]:
print(edge_index)
print(f"Min index: {edge_index.min()}, Max index: {edge_index.max()}")


tensor([[ 46798,   8067, 129901,  ..., 106538,   8513,  11159],
        [  1753,  17604,   5533,  ...,   2789,  30938,  24254]])
Min index: -9223372036854775808, Max index: 143759


In [10]:
def test():
    model.eval()
    with torch.no_grad():
        embeddings = model(data)
        pred = model.predict_edges(embeddings, test_edges)
        pred_labels = (pred > 0.5).float()
        accuracy = (pred_labels == test_labels.float()).sum() / len(test_labels)
    return accuracy.item()



In [11]:
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        acc = test()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')


IndexError: Found negative indices in 'edge_index' (got -9223372036854775808). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 143760) in your node feature matrix and try again.

In [None]:
import random

# Converting existing edges to a set for fast lookup
existing_edges_set = set(map(tuple, edge_index.T.tolist())) 

# Number of unseen pairs of 100 for samples
num_samples = 100
unseen_pairs = []

# Randomly sampling node pairs and check if they're not connected
while len(unseen_pairs) < num_samples:
    # Randomly sample two nodes
    i, j = random.sample(range(len(nodes_df)), 2) 
    
    # Ensuring the pair is not already an existing edge
    if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
        unseen_pairs.append((i, j))

# Converting to tensor
unseen_pairs = torch.tensor(unseen_pairs, dtype=torch.long).to(device)


In [None]:
# lets do the model evaluation
model.eval()  
with torch.no_grad():
    #useing the  data
    embeddings = model(data)
    predictions = model.predict_edges(embeddings, unseen_pairs)

In [None]:
# Ranking unseen pairs by predicted score
sorted_indices = torch.argsort(predictions, descending=True)
top_predictions = unseen_pairs[sorted_indices]
top_scores = predictions[sorted_indices]

# Converting back to artist IDs for better interpretability
top_collaborations = [
    (nodes_df.iloc[i]['spotify_id'], nodes_df.iloc[j]['spotify_id'], score.item())
    for (i, j), score in zip(top_predictions.tolist(), top_scores)
]

# Displaying top 10 predicted collaborations with artist names

for pair in top_collaborations[:10]:
    spotify_id_1, spotify_id_2 = pair[0], pair[1]
    
    # Searching for the Spotify ID in the nodes_df and print the respective artist names
    artist_1_name = nodes_df[nodes_df['spotify_id'] == spotify_id_1]['name'].values[0]
    artist_2_name = nodes_df[nodes_df['spotify_id'] == spotify_id_2]['name'].values[0]
    
    print(f"Artist 1: {artist_1_name}, Artist 2: {artist_2_name}, Predicted Score: {pair[2]:.4f}")