In [9]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
import random

# Loading cleaned nodes and edges data
#columns: Spotify ID, Popularity, Followers, Genre columns
nodes_df = pd.read_csv("nodes_cleaned.csv") 
# Columns: Spotify ID 1, Spotify ID 2
edges_df = pd.read_csv("edges_cleaned.csv")  


  nodes_df = pd.read_csv("nodes_cleaned.csv")


In [10]:
nodes_df.dtypes


spotify_id               object
name                     object
followers                object
popularity              float64
alternative Indie       float64
classical_orchestral      int64
electronic                int64
folk world                int64
jazz                      int64
hip_hop                   int64
latin                     int64
metal                     int64
pop                       int64
randb_Soul                int64
reggae_dancehall          int64
rock                      int64
soundtrack                int64
unknown                   int64
Unnamed: 18             float64
dtype: object

In [11]:
# Convert 'followers' column to integer
nodes_df['followers'] = pd.to_numeric(nodes_df['followers'], errors='coerce').fillna(0).astype(int)
nodes_df['alternative Indie'] = pd.to_numeric(nodes_df['alternative Indie'], errors='coerce').fillna(0).astype(int)

# Drop the 'Unnamed: 18' column
nodes_df = nodes_df.drop(columns=['Unnamed: 18'], errors='ignore')

print(nodes_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143848 entries, 0 to 143847
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   spotify_id            143848 non-null  object 
 1   name                  143848 non-null  object 
 2   followers             143848 non-null  int64  
 3   popularity            143848 non-null  float64
 4   alternative Indie     143848 non-null  int64  
 5   classical_orchestral  143848 non-null  int64  
 6   electronic            143848 non-null  int64  
 7   folk world            143848 non-null  int64  
 8   jazz                  143848 non-null  int64  
 9   hip_hop               143848 non-null  int64  
 10  latin                 143848 non-null  int64  
 11  metal                 143848 non-null  int64  
 12  pop                   143848 non-null  int64  
 13  randb_Soul            143848 non-null  int64  
 14  reggae_dancehall      143848 non-null  int64  
 15  

In [12]:
if nodes_df['spotify_id'].duplicated().any():
    nodes_df = nodes_df.drop_duplicates(subset=['spotify_id']).reset_index(drop=True)
if nodes_df['spotify_id'].duplicated().any():
    raise ValueError("Duplicate Spotify IDs found in nodes_df.")



In [16]:

# Defining genre columns
genre_columns = [
    'alternative Indie', 'classical_orchestral', 'electronic', 'folk world',
    'jazz', 'hip_hop', 'latin', 'metal', 'pop', 'randb_Soul',
    'reggae_dancehall', 'rock', 'soundtrack', 'unknown'
]
# Mapping Spotify IDs to indices for graph construction
node_index_map = {spotify_id: idx for idx, spotify_id in enumerate(nodes_df['spotify_id'])}
edges_df['Source'] = edges_df['id_0'].map(node_index_map)
edges_df['Target'] = edges_df['id_1'].map(node_index_map)


# Creating edge index (two rows: source and target nodes)
edge_index = torch.tensor(edges_df[['Source', 'Target']].to_numpy().T, dtype=torch.long)

# Creating node features
node_features = torch.tensor(
    nodes_df[['popularity', 'followers'] + genre_columns].to_numpy(),
    dtype=torch.float
)



In [23]:
# Check if any invalid indices exist
def check_invalid_indices(edge_index, num_nodes):
    invalid_indices = (edge_index < 0) | (edge_index >= num_nodes)
    if invalid_indices.any():
        print(f"Invalid indices found: {edge_index[invalid_indices]}")
    else:
        print("No invalid indices found.")

# print for invalid indices in positive and negative edges
check_invalid_indices(positive_edges, len(nodes_df))

# perform  positive edges
positive_edges = edge_index
print(f"Positive edges min: {positive_edges.min()} | max: {positive_edges.max()}")



def generate_negative_edges(num_nodes, existing_edges, num_samples):
    """
    Generate negative edges by sampling random node pairs that are not connected.
    """
    # Convert existing edges to a set for fast lookup
    existing_edges_set = set(map(tuple, existing_edges.T.cpu().numpy())) 
    negative_edges = set()

    while len(negative_edges) < num_samples:
        i, j = np.random.randint(0, num_nodes, size=2)
        # Ensure the edge is valid and not in the existing set
        if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
            if 0 <= i < num_nodes and 0 <= j < num_nodes:
                negative_edges.add((i, j))

    negative_edges_tensor = torch.tensor(list(negative_edges), dtype=torch.long).T
    check_invalid_indices(negative_edges_tensor, num_nodes)

    return negative_edges_tensor

# Generating negative edges
num_negative_samples = len(edges_df)
negative_edges = generate_negative_edges(len(nodes_df), edge_index, num_negative_samples)

check_invalid_indices(negative_edges, len(nodes_df))

# perform negative edges
negative_edges = generate_negative_edges(len(nodes_df), edge_index, num_negative_samples)
print(f"Negative edges min: {negative_edges.min()} | max: {negative_edges.max()}")



Invalid indices found: tensor([-9223372036854775808, -9223372036854775808, -9223372036854775808,
        -9223372036854775808, -9223372036854775808, -9223372036854775808])
Positive edges min: -9223372036854775808 | max: 143759
No invalid indices found.
No invalid indices found.
No invalid indices found.
Negative edges min: 0 | max: 143759


In [24]:
print(f"Positive edges min is {edge_index.min()} and max is {edge_index.max()}") 
print(f"Negative edges min is {negative_edges.min()} and maximum is {negative_edges.max()}")

Positive edges min is -9223372036854775808 and max is 143759
Negative edges min is 0 and maximum is 143759


In [25]:
# Check the initial edge_index for invalid indices
def check_edge_index_validity(edge_index, num_nodes):
    invalid_indices = (edge_index < 0) | (edge_index >= num_nodes)
    if invalid_indices.any():
        print(f"Invalid indices found in edge_index: {edge_index[invalid_indices]}")
        edge_index = edge_index[:, ~invalid_indices.any(dim=0)]  # Remove invalid edges
        print(f"Edge index after cleaning: {edge_index}")
    else:
        print("Edge index is valid.")
    return edge_index

# Run this check on your original edge_index
edge_index = check_edge_index_validity(edge_index, len(nodes_df))


Invalid indices found in edge_index: tensor([-9223372036854775808, -9223372036854775808, -9223372036854775808,
        -9223372036854775808, -9223372036854775808, -9223372036854775808])
Edge index after cleaning: tensor([[ 46798,   8067, 129901,  ..., 106538,   8513,  11159],
        [  1753,  17604,   5533,  ...,   2789,  30938,  24254]])


In [26]:
# Combining positive and negative edges for training
positive_edges = edge_index  # No need to transpose
# Concatenate along columns (dim=1)
all_edges = torch.cat([positive_edges, negative_edges], dim=1) 
labels = torch.cat([torch.ones(positive_edges.shape[1]), torch.zeros(negative_edges.shape[1])])

# Train-test split for edges
train_edges, test_edges, train_labels, test_labels = train_test_split(
    all_edges.T.numpy(), labels.numpy(), test_size=0.2, random_state=42
)

# Convert train and test edges to PyTorch tensors
train_edges = torch.tensor(train_edges, dtype=torch.long).T
test_edges = torch.tensor(test_edges, dtype=torch.long).T

# Convert train and test labels to PyTorch tensors
train_labels = torch.tensor(train_labels, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)

# Move the data to the correct device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_edges = train_edges.to(device)
train_labels = train_labels.to(device)
test_edges = test_edges.to(device)
test_labels = test_labels.to(device)

# Convert to PyTorch Geometric Data
data = Data(x=node_features, edge_index=all_edges.to(device))

# Use test_data for testing (Data object with test edges)
test_data = Data(x=node_features, edge_index=test_edges).to(device)

# Check the shapes of train and test edges
print("Train edges shape:", train_edges.shape)
print("Test edges shape:", test_edges.shape)
print("Train labels shape:", train_labels.shape)
print("Test labels shape:", test_labels.shape)


Train edges shape: torch.Size([2, 425668])
Test edges shape: torch.Size([2, 106418])
Train labels shape: torch.Size([425668])
Test labels shape: torch.Size([106418])


In [27]:
# Defining GraphSAGE Model for Link Prediction
class GraphSAGE(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.edge_predictor = torch.nn.Linear(hidden_dim * 2, 1)

    def forward(self, data):
        x = F.relu(self.conv1(data.x, data.edge_index))
        x = F.relu(self.conv2(x, data.edge_index))
        return x

    def predict_edges(self, x, edges):
        # Combine node embeddings for edge prediction
        edge_embeds = torch.cat([x[edges[0]], x[edges[1]]], dim=1)
        # Ensure the output has the shape [num_edges] and flatten to match num_edges
        return torch.sigmoid(self.edge_predictor(edge_embeds)).view(-1)  


# Initializing model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(data.num_node_features, hidden_dim=32).to(device)
data = data.to(device)
train_edges, train_labels = train_edges.to(device), train_labels.to(device)
test_edges, test_labels = test_edges.to(device), test_labels.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    embeddings = model(data)
    # validating train_edges is a tensor
    print(f"train_edges shape: {train_edges.shape}")
    # Predictions for edges
    pred = model.predict_edges(embeddings, train_edges)
    print(f"pred shape: {pred.shape}") 
    # Compute binary cross entropy loss
    loss = F.binary_cross_entropy(pred, train_labels.float())
    loss.backward()
    optimizer.step()
    return loss.item()


In [28]:
print(f"train_labels shape before training: {train_labels.shape}")

train_labels shape before training: torch.Size([425668])


In [29]:
print("Positive edges shape:", positive_edges.shape)
print("Negative edges shape:", negative_edges.shape)
print("All edges shape (before split):", all_edges.shape)
print("Train edges shape:", train_edges.shape)
print("Test edges shape:", test_edges.shape)
print("Data edge_index shape:", data.edge_index.shape)


Positive edges shape: torch.Size([2, 266040])
Negative edges shape: torch.Size([2, 266046])
All edges shape (before split): torch.Size([2, 532086])
Train edges shape: torch.Size([2, 425668])
Test edges shape: torch.Size([2, 106418])
Data edge_index shape: torch.Size([2, 532086])


In [30]:
print("All edge indices:", all_edges.min())
print("Positive edge indices:", positive_edges.min())
print("Negative edge indices:", negative_edges.min())


All edge indices: tensor(0)
Positive edge indices: tensor(0)
Negative edge indices: tensor(0)


In [31]:
def test():
    model.eval()
    with torch.no_grad():
        embeddings = model(data)
        pred = model.predict_edges(embeddings, test_edges)
        pred_labels = (pred > 0.5).float()
        accuracy = (pred_labels == test_labels.float()).sum() / len(test_labels)
    return accuracy.item()



In [32]:
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        acc = test()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')


train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
Epoch 010, Loss: 0.4622, Test Accuracy: 0.7906
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668])
pred shape: torch.Size([425668])
train_edges shape: torch.Size([2, 425668]

In [51]:
import random

# Converting existing edges to a set for fast lookup
existing_edges_set = set(map(tuple, edge_index.T.tolist()))  # Set of existing edges for quick lookup

# Number of unseen pairs to sample
num_samples = 1000
unseen_pairs = []

# Randomly sampling node pairs and check if they're not connected
# Randomly sample two nodes
while len(unseen_pairs) < num_samples:
    i, j = random.sample(range(len(nodes_df)), 2)  
    
    # Ensuring the pair is not already an existing edge
    if i != j and (i, j) not in existing_edges_set and (j, i) not in existing_edges_set:
        unseen_pairs.append((i, j))

# Converting to tensor
unseen_pairs = torch.tensor(unseen_pairs, dtype=torch.long).to(device)


In [52]:
# Seting model to evaluation mode
model.eval()  
with torch.no_grad():
    embeddings = model(data)
    predictions = model.predict_edges(embeddings, unseen_pairs)


In [64]:
# Ranking unseen pairs by predicted score
sorted_indices = torch.argsort(predictions, descending=True)
top_predictions = unseen_pairs[sorted_indices]
top_scores = predictions[sorted_indices]

# Converting back to artist IDs for better interpretability
top_collaborations = [
    (nodes_df.loc[i, 'spotify_id'], nodes_df.loc[j, 'spotify_id'], score.item())
    for (i, j), score in zip(top_predictions.tolist(), top_scores)
]

# Displaying top 5 predicted collaborations with artist names

for pair in top_collaborations[:5]:
    spotify_id_1, spotify_id_2 = pair[0], pair[1]
    
    # Searching for the Spotify ID in the nodes_df and print the respective artist names
    artist_1_name = nodes_df[nodes_df['spotify_id'] == spotify_id_1]['name'].values[0]
    artist_2_name = nodes_df[nodes_df['spotify_id'] == spotify_id_2]['name'].values[0]
    
    print(f" Artist 1: {artist_1_name} and  Artist 2: {artist_2_name}, Colloberation Score: {pair[2]:.4f}")

 Artist 1: Atlaxsys and  Artist 2: M-A, colloberation Score: 0.0635
 Artist 1: Boy Warrior and  Artist 2: Meyer, colloberation Score: 0.0204


In [54]:
print(top_scores)

tensor([0.0635, 0.0204])


In [None]:
# Robestness check.
node to vec embedding graph as mentioned in the assignment 3.
-------------------------------------
Existing colloberations prediction.
Also the false positive colloberation.
------------------------------------
validate the prediction in spotify.