# Combining Learnings

Through testing with strucutral-based link prediction and community-based link prediction, we've seen that an optimal combination of both of these techniques should yield a more optimal model. Here, we will implement this:

For this model testing we will only work with 1000 artists from era 5 with successful API calls. 


In [3]:
# Note: Imports

import pandas as pd
import networkx as nx
import numpy as np

artists = pd.read_parquet('../data/final_data_processed/discogs_artists.parquet')
edges = pd.read_parquet('../data/final_data_processed/discogs_edges.parquet') # Note: These will be our nodes

edges.shape, artists.shape

((5673764, 3), (1045947, 1))

In [5]:
# Note: First we will sort the edges chronologically
edges_sorted = edges.sort_values(by='release_year')
edges_sorted[:100]

era5_edges = edges_sorted[(edges_sorted['release_year'] >= 2000) & (edges_sorted['release_year'] <= 2025)]

era5_artists = pd.DataFrame({'discogs_artist_id': pd.unique(pd.concat([era5_edges.source_id, era5_edges.target_id]))})
print('Era 5')
print(len(era5_edges), len(era5_artists))
print(era5_artists.head())


Era 5
2338167 620672
  discogs_artist_id
0            115466
1            833554
2              1768
3             65718
4            882018


In [9]:
# sample for 1000 nodes that have API Call Hits 

# ========== Get Artist Names ==========
import xml.etree.ElementTree as ET

df_artistsID = pd.read_parquet("../data/final_data_processed/discogs_artists.parquet")

# print(df.head())

# read the artists xml file so we can use the artist IDs from the parquet file to get artist names
rows = []
for event, elem in ET.iterparse("../data_raw/discogs_20251101_artists.xml", events=("end",)):
    if elem.tag == "artist":   # repeating entry tag for Discogs artists
        row = {child.tag: child.text for child in elem}

        rows.append(row)

        # free memory
        elem.clear()
        parent = elem.getparent() if hasattr(elem, "getparent") else None
        if parent is not None:
            while parent.getprevious() is not None:
                del parent[0]

df_artistsNames = pd.DataFrame(rows)
print(df_artistsNames.head(-5))



# ========== SPOTIFY API FUNCTIONS ==========
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

load_dotenv()  # loads .env into environment

client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

sp = spotipy.Spotify(auth_manager=auth_manager)

# testing to see if the API call works
artist = sp.artist("1uNFoZAHBGtllmzznpCI3s")  # Justin Bieber ID
print(artist["name"])
# method 
def get_artist(artist, market):
    results = sp.search(q=f"artist:{artist}", type="artist", limit=1, market=market)
    items = results.get("artists", {}).get("items", [])

    
    return items[0] if items else None


def get_artist_data(artist_name, market):
    artist = get_artist(artist_name, market)
    if artist:
        return artist.get("genres", [])
    else:
        return None
    



# ========== GET 1000 ARTISTS WITH SUCCESSFUL SPOTIFY API CALLS ==========
artist_genres = {}

for index, row in era5_artists.iterrows():
    artist_id = row['discogs_artist_id']
    artist_name_row = df_artistsNames[df_artistsNames['id'] == str(artist_id)]
    if not artist_name_row.empty:
        artist_name = artist_name_row.iloc[0]['name']
        genres = get_artist_data(artist_name, market="US")
        if genres is not None:
            artist_genres[artist_id] = {
                'name': artist_name,
                'genres': genres
            }
    if len(artist_genres) >= 1000:
        break

print(f"Successfully retrieved genres for {len(artist_genres)} artists.")
print(list(artist_genres.items())[:5])  # Print first 5 entries




               id                    name         realname  \
0               1           The Persuader  Jesper Dahlbäck   
1               2  Mr. James Barth & A.D.              NaN   
2               3               Josh Wink   Josh Winkelman   
3               4           Johannes Heil    Johannes Heil   
4               5              Heiko Laux       Heiko Laux   
...           ...                     ...              ...   
9798342  16856335             Izumi Kohki              NaN   
9798343  16856338                  Kate08        Kate Webb   
9798344  16856341   The Evil B-Side Twins              NaN   
9798345  16856347             Carol Lundy              NaN   
9798346  16856350            문선 (Moonsun)              NaN   

                                                   profile  \
0        Electronic artist working out of Stockholm, ac...   
1                                                      NaN   
2        Electronic music DJ, label owner, producer, an...   
3      

In [25]:
# Get all edges where both source and target artists are in artist_genres
filtered_edges = era5_edges[
    era5_edges['source_id'].isin(artist_genres.keys()) &
    era5_edges['target_id'].isin(artist_genres.keys())
]

print(f"Filtered edges count: {len(filtered_edges)}")
print(filtered_edges.head())

# Create the graph with the filtered edges and nodes
# add nodes
G = nx.Graph()

for artist_id in artist_genres.keys():
    G.add_node(artist_id, name=artist_genres[artist_id]['name'], genres=artist_genres[artist_id]['genres']) 

# add edges
for _, row in filtered_edges.iterrows():
    G.add_edge(row['source_id'], row['target_id'], release_year=row['release_year'])

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")



Filtered edges count: 45596
        source_id target_id  release_year
723768     115466   1165499          2000
3519660    833554    454293          2000
3519661    882018    859036          2000
4283352    967916   1674540          2000
1304982    396036   2816584          2000
Graph has 1000 nodes and 4208 edges.


### Prediction Design

We want to create a hyprid, weighted model:     Score(u, v) = w1 * CN + w2 * JC + w3 * GO + w4 * LC

In [28]:
# # ========== CALCULATE CN FOR TEST EDGES ==========
# # Create graph from train edges
# # --- IGNORE ---
# # Calculate Common Neighbors for each edge in the test set
# cn_scores = []
# for _, row in test_edges.iterrows():
#     u = row['source_id']
#     v = row['target_id']

#     if u not in G or v not in G:
#         cn_scores.append(0)     # or np.nan
#         continue

#     cn = len(list(nx.common_neighbors(G, u, v)))
#     cn_scores.append(cn)

# test_edges['common_neighbors'] = cn_scores
# # print(test_edges[['source_id', 'target_id', 'common_neighbors']].head())

# # ========== CALCULATE JACARD FOR TEST EDGES ==========
# # Calculate Jaccard Coefficient for each edge in the test set
# jaccard_scores = []
# for _, row in test_edges.iterrows():
#     u = row['source_id']
#     v = row['target_id']

#     if u not in G or v not in G:
#         jaccard_scores.append(0)     # or np.nan
#         continue

#     preds = nx.jaccard_coefficient(G, [(u, v)])
#     for _, _, p in preds:
#         jaccard_scores.append(p)
# test_edges['jaccard_coefficient'] = jaccard_scores
# # print(test_edges[['source_id', 'target_id', 'jaccard_coefficient']].head())

# # ========== CALCULATE GENRE OVERLAP FOR TEST EDGES ==========
# # Calculate Genre Overlap for each edge in the test set
# genre_overlap_scores = []
# for _, row in test_edges.iterrows():
#     u = row['source_id']
#     v = row['target_id']

#     genres_u = set(artist_genres[u]['genres'])
#     genres_v = set(artist_genres[v]['genres'])

#     if not genres_u or not genres_v:
#         genre_overlap_scores.append(0)
#         continue

#     intersection = genres_u.intersection(genres_v)
#     union = genres_u.union(genres_v)

#     overlap_score = len(intersection) / len(union)
#     genre_overlap_scores.append(overlap_score)

# test_edges['genre_overlap'] = genre_overlap_scores
# # print(test_edges[['source_id', 'target_id', 'genre_overlap']].head())

# # ========== CALCULATE COMMUNITY OVERLAP FOR TEST EDGES ==========
# # First, detect communities in the training graph using the Louvain method
# # !pip install python-louvain
# import community.community_louvain as community_louvain
# partition = community_louvain.best_partition(G)   
# # Calculate Community Overlap for each edge in the test set
# community_overlap_scores = []
# for _, row in test_edges.iterrows():    
#     u = row['source_id']
#     v = row['target_id']

#     community_u = partition.get(u, -1)
#     community_v = partition.get(v, -1)

#     if community_u == -1 or community_v == -1:
#         community_overlap_scores.append(0)
#         continue

#     overlap_score = 1 if community_u == community_v else 0
#     community_overlap_scores.append(overlap_score)
# test_edges['community_overlap'] = community_overlap_scores
# # print(test_edges[['source_id', 'target_id', 'community_overlap']].head())

# # ========== PRINT FINAL TEST EDGES WITH ALL SCORES ==========
# # print in table format
# print(test_edges[['source_id', 'target_id', 'common_neighbors', 'jaccard_coefficient', 'genre_overlap', 'community_overlap']].head(10))


import itertools
import pandas as pd
import networkx as nx

# --- 1. Get all node pairs ---
all_nodes = list(G.nodes())
all_pairs = list(itertools.combinations(all_nodes, 2))  # undirected, no self-loops

# --- 2. Precompute clustering/community info if needed ---
partition = community_louvain.best_partition(G)
clustering = nx.clustering(G)  # optional if you want local clustering later

rows = []
for u, v in all_pairs:
    # Common Neighbors
    cn = len(list(nx.common_neighbors(G, u, v)))

    # Jaccard
    jaccard = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]

    # Genre Overlap
    genres_u = set(artist_genres.get(u, {}).get('genres', []))
    genres_v = set(artist_genres.get(v, {}).get('genres', []))
    if genres_u and genres_v:
        go = len(genres_u & genres_v) / len(genres_u | genres_v)
    else:
        go = 0

    # Community Overlap
    cu = partition.get(u, -1)
    cv = partition.get(v, -1)
    co = 1 if cu != -1 and cu == cv else 0

    # Edge label (1 if edge exists, 0 otherwise)
    label = 1 if G.has_edge(u, v) else 0

    rows.append((u, v, cn, jaccard, go, co, label))

# --- 3. Make DataFrame ---
link_pred_df = pd.DataFrame(rows, columns=[
    'source_id', 'target_id', 'common_neighbors', 
    'jaccard_coefficient', 'genre_overlap', 'community_overlap', 'edge_exists'
])


#normalize common neighbors
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
link_pred_df['common_neighbors'] = scaler.fit_transform(link_pred_df[['common_neighbors']])

print(link_pred_df.head())





  source_id target_id  common_neighbors  jaccard_coefficient  genre_overlap  \
0    115466    833554          0.157025             0.226190       0.333333   
1    115466      1768          0.000000             0.000000       0.000000   
2    115466    882018          0.107438             0.166667       0.000000   
3    115466    967916          0.008264             0.015152       0.000000   
4    115466    265660          0.008264             0.015152       0.000000   

   community_overlap  edge_exists  
0                  0            0  
1                  0            0  
2                  0            0  
3                  0            0  
4                  0            0  


In [29]:
# ========== BACKTESTING THE FULL LINK PREDICTION MODEL ==========
def predict(common_neighbors, jaccard_coefficient, genre_overlap, community_overlap, CN_WEIGHT, JACCARD_WEIGHT, GENRE_WEIGHT, COMMUNITY_WEIGHT):
    # Simple weighted sum model
    score = (CN_WEIGHT * common_neighbors +
             JACCARD_WEIGHT * jaccard_coefficient +
             GENRE_WEIGHT * genre_overlap +
             COMMUNITY_WEIGHT * community_overlap)
    
    return score

# backtest each parameter betweei n 0 and 1 with step size 0.2
best_accuracy = 0
best_params = None

for CN_WEIGHT in [0, 0.2, 0.4, 0.6, 0.8, 1]:
    for JACCARD_WEIGHT in [0, 0.2, 0.4, 0.6, 0.8, 1]:
        for GENRE_WEIGHT in [0, 0.2, 0.4, 0.6, 0.8, 1]:
            for COMMUNITY_WEIGHT in [0, 0.2, 0.4, 0.6, 0.8, 1]:
                # Normalize weights
                total_weight = CN_WEIGHT + JACCARD_WEIGHT + GENRE_WEIGHT + COMMUNITY_WEIGHT
                if total_weight == 0:
                    continue
                CN_W = CN_WEIGHT / total_weight
                JACCARD_W = JACCARD_WEIGHT / total_weight
                GENRE_W = GENRE_WEIGHT / total_weight
                COMMUNITY_W = COMMUNITY_WEIGHT / total_weight
                
                # Make predictions
                link_pred_df['predicted_score'] = link_pred_df.apply(
                    lambda row: predict(
                        row['common_neighbors'], 
                        row['jaccard_coefficient'], 
                        row['genre_overlap'], 
                        row['community_overlap'],
                        CN_W,
                        JACCARD_W,
                        GENRE_W,
                        COMMUNITY_W
                    ), axis=1)
                
                # Classify based on threshold 0.5
                link_pred_df['predicted_label'] = (link_pred_df['predicted_score'] >= 0.5).astype(int)
                
                # Calculate accuracy
                accuracy = (link_pred_df['predicted_label'] == link_pred_df['edge_exists']).mean()
                
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = (CN_W, JACCARD_W, GENRE_W, COMMUNITY_W)
                    
print(f"Best Accuracy: {best_accuracy}")
print(f"Best Parameters: CN_WEIGHT={best_params[0]}, JACCARD_WEIGHT={best_params[1]}, GENRE_WEIGHT={best_params[2]}, COMMUNITY_WEIGHT={best_params[3]}")




Best Accuracy: 0.9933253253253254
Best Parameters: CN_WEIGHT=0.6, JACCARD_WEIGHT=0.0, GENRE_WEIGHT=0.0, COMMUNITY_WEIGHT=0.4


In [30]:
# ========== AUC-ROC EVALUATION ==========
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
print(f"AUC-ROC: {auc}")

# ========== Precision-Recall EVALUATION ==========
from sklearn.metrics import precision_recall_curve, auc as auc_metric
precision, recall, _ = precision_recall_curve(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
pr_auc = auc_metric(recall, precision)
print(f"Precision-Recall AUC: {pr_auc}")

# ========== Hits@K EVALUATION ==========   
def hits_at_k(df, k):
    # Sort by predicted score descending
    df_sorted = df.sort_values(by='predicted_score', ascending=False)
    
    # Get top K predictions
    top_k = df_sorted.head(k)
    
    # Calculate Hits@K
    hits = top_k['edge_exists'].sum()
    total_positives = df['edge_exists'].sum()
    
    return hits / total_positives if total_positives > 0 else 0 
hits_k = 100
hits_at_100 = hits_at_k(link_pred_df, hits_k)
print(f"Hits@{hits_k}: {hits_at_100}")



AUC-ROC: 0.9854267184587799
Precision-Recall AUC: 0.456117109078475
Hits@100: 0.023727137913989126
