# Combining Learnings

Through testing with strucutral-based link prediction and community-based link prediction, we've seen that an optimal combination of both of these techniques should yield a more optimal model. Here, we will implement this:

For this model testing we will only work with 1000 artists from era 5 with successful API calls. 


In [9]:
# Note: Imports

import pandas as pd
import networkx as nx
import numpy as np

artists = pd.read_parquet('../data/final_data_processed/discogs_artists.parquet')
edges = pd.read_parquet('../data/final_data_processed/discogs_edges.parquet') # Note: These will be our nodes

edges.shape, artists.shape

((5673764, 3), (1045947, 1))

In [10]:
# Note: First we will sort the edges chronologically
edges_sorted = edges.sort_values(by='release_year')
edges_sorted[:100]

era5_edges = edges_sorted[(edges_sorted['release_year'] >= 2000) & (edges_sorted['release_year'] <= 2025)]

era5_artists = pd.DataFrame({'discogs_artist_id': pd.unique(pd.concat([era5_edges.source_id, era5_edges.target_id]))})
print('Era 5')
print(len(era5_edges), len(era5_artists))
print(era5_artists.head())


Era 5
2338167 620672
  discogs_artist_id
0            115466
1            833554
2              1768
3             65718
4            882018


In [11]:
# sample for 1000 nodes that have API Call Hits 

# ========== Get Artist Names ==========
import xml.etree.ElementTree as ET

df_artistsID = pd.read_parquet("../data/final_data_processed/discogs_artists.parquet")

# print(df.head())

# read the artists xml file so we can use the artist IDs from the parquet file to get artist names
rows = []
for event, elem in ET.iterparse("../data_raw/discogs_20251101_artists.xml", events=("end",)):
    if elem.tag == "artist":   # repeating entry tag for Discogs artists
        row = {child.tag: child.text for child in elem}

        rows.append(row)

        # free memory
        elem.clear()
        parent = elem.getparent() if hasattr(elem, "getparent") else None
        if parent is not None:
            while parent.getprevious() is not None:
                del parent[0]

df_artistsNames = pd.DataFrame(rows)
print(df_artistsNames.head(-5))



# ========== SPOTIFY API FUNCTIONS ==========
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

load_dotenv()  # loads .env into environment

client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

sp = spotipy.Spotify(auth_manager=auth_manager)

# testing to see if the API call works
artist = sp.artist("1uNFoZAHBGtllmzznpCI3s")  # Justin Bieber ID
print(artist["name"])
# method 
def get_artist(artist, market):
    results = sp.search(q=f"artist:{artist}", type="artist", limit=1, market=market)
    items = results.get("artists", {}).get("items", [])

    
    return items[0] if items else None


def get_artist_data(artist_name, market):
    artist = get_artist(artist_name, market)
    if artist:
        return artist.get("genres", [])
    else:
        return None
    



# ========== GET 1000 ARTISTS WITH SUCCESSFUL SPOTIFY API CALLS ==========
artist_genres = {}

for index, row in era5_artists.iterrows():
    artist_id = row['discogs_artist_id']
    artist_name_row = df_artistsNames[df_artistsNames['id'] == str(artist_id)]
    if not artist_name_row.empty:
        artist_name = artist_name_row.iloc[0]['name']
        genres = get_artist_data(artist_name, market="US")
        if genres is not None:
            artist_genres[artist_id] = {
                'name': artist_name,
                'genres': genres
            }
    if len(artist_genres) >= 1000:
        break

print(f"Successfully retrieved genres for {len(artist_genres)} artists.")
print(list(artist_genres.items())[:5])  # Print first 5 entries




               id                    name         realname  \
0               1           The Persuader  Jesper Dahlbäck   
1               2  Mr. James Barth & A.D.              NaN   
2               3               Josh Wink   Josh Winkelman   
3               4           Johannes Heil    Johannes Heil   
4               5              Heiko Laux       Heiko Laux   
...           ...                     ...              ...   
9798342  16856335             Izumi Kohki              NaN   
9798343  16856338                  Kate08        Kate Webb   
9798344  16856341   The Evil B-Side Twins              NaN   
9798345  16856347             Carol Lundy              NaN   
9798346  16856350            문선 (Moonsun)              NaN   

                                                   profile  \
0        Electronic artist working out of Stockholm, ac...   
1                                                      NaN   
2        Electronic music DJ, label owner, producer, an...   
3      

In [12]:
# Get all edges where both source and target artists are in artist_genres
filtered_edges = era5_edges[
    era5_edges['source_id'].isin(artist_genres.keys()) &
    era5_edges['target_id'].isin(artist_genres.keys())
]

print(f"Filtered edges count: {len(filtered_edges)}")
print(filtered_edges.head())

# Create the graph with the filtered edges and nodes
# add nodes
G = nx.Graph()

for artist_id in artist_genres.keys():
    G.add_node(artist_id, name=artist_genres[artist_id]['name'], genres=artist_genres[artist_id]['genres']) 

# add edges
for _, row in filtered_edges.iterrows():
    G.add_edge(row['source_id'], row['target_id'], release_year=row['release_year'])

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")



Filtered edges count: 45596
        source_id target_id  release_year
723768     115466   1165499          2000
3519660    833554    454293          2000
3519661    882018    859036          2000
4283352    967916   1674540          2000
1304982    396036   2816584          2000
Graph has 1000 nodes and 4208 edges.


### Prediction Design

We want to create a hybrid, weighted model combining structural and genre-based features:

**Model:** Score(u, v) = w1×CN + w2×JC + w3×GenreJaccard + w4×GenreCount + w5×SameGenre + w6×CommOverlap

Where:
- **CN** = Common Neighbors (structural)
- **JC** = Jaccard Coefficient (structural)
- **GenreJaccard** = Genre similarity (Jaccard on genre sets)
- **GenreCount** = Raw count of shared genres
- **SameGenre** = Binary flag (1 if any shared genre)
- **CommOverlap** = Community co-membership (binary)

In [13]:
import itertools
import pandas as pd
import networkx as nx
import community as community_louvain

# --- 1. Get all node pairs ---
all_nodes = list(G.nodes())
all_pairs = list(itertools.combinations(all_nodes, 2))  # undirected, no self-loops

# --- 2. Precompute clustering/community info if needed ---
partition = community_louvain.best_partition(G)
clustering = nx.clustering(G)  # optional if you want local clustering later

rows = []
for u, v in all_pairs:
    # Common Neighbors
    cn = len(list(nx.common_neighbors(G, u, v)))

    # Jaccard
    jaccard = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]

    # Genre Features (3 types)
    genres_u = set(artist_genres.get(u, {}).get('genres', []))
    genres_v = set(artist_genres.get(v, {}).get('genres', []))
    
    intersection = genres_u & genres_v
    union = genres_u | genres_v
    
    # Genre Jaccard (normalized overlap)
    if union:
        genre_jaccard = len(intersection) / len(union)
    else:
        genre_jaccard = 0
    
    # Genre Overlap (raw count)
    genre_overlap_count = len(intersection)
    
    # Same Genre Flag (binary)
    same_genre = 1 if len(intersection) > 0 else 0

    # Community Overlap
    cu = partition.get(u, -1)
    cv = partition.get(v, -1)
    co = 1 if cu != -1 and cu == cv else 0

    # Edge label (1 if edge exists, 0 otherwise)
    label = 1 if G.has_edge(u, v) else 0

    rows.append((u, v, cn, jaccard, genre_jaccard, genre_overlap_count, same_genre, co, label))

# --- 3. Make DataFrame ---
link_pred_df = pd.DataFrame(rows, columns=[
    'source_id', 'target_id', 'common_neighbors', 
    'jaccard_coefficient', 'genre_jaccard', 'genre_overlap_count', 'same_genre',
    'community_overlap', 'edge_exists'
])

# Normalize common neighbors
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
link_pred_df['common_neighbors'] = scaler.fit_transform(link_pred_df[['common_neighbors']])

print(link_pred_df.head())
print(link_pred_df.shape)
print(f"\nFeatures: {[col for col in link_pred_df.columns if col not in ['source_id', 'target_id', 'edge_exists']]}")

  source_id target_id  common_neighbors  jaccard_coefficient  genre_jaccard  \
0    115466    833554          0.157025             0.226190       0.333333   
1    115466      1768          0.000000             0.000000       0.000000   
2    115466    882018          0.107438             0.166667       0.000000   
3    115466    967916          0.008264             0.015152       0.000000   
4    115466    265660          0.008264             0.015152       0.000000   

   genre_overlap_count  same_genre  community_overlap  edge_exists  
0                    1           1                  1            0  
1                    0           0                  0            0  
2                    0           0                  1            0  
3                    0           0                  0            0  
4                    0           0                  0            0  
(499500, 9)

Features: ['common_neighbors', 'jaccard_coefficient', 'genre_jaccard', 'genre_overlap_count', 'sam

In [14]:
# ========== BACKTESTING THE FULL LINK PREDICTION MODEL ==========
def predict(common_neighbors, jaccard_coefficient, genre_jaccard, genre_overlap_count, same_genre, community_overlap, 
            CN_WEIGHT, JACCARD_WEIGHT, GENRE_JACCARD_WEIGHT, GENRE_COUNT_WEIGHT, SAME_GENRE_WEIGHT, COMMUNITY_WEIGHT):
    # Weighted sum model with all features
    score = (CN_WEIGHT * common_neighbors +
             JACCARD_WEIGHT * jaccard_coefficient +
             GENRE_JACCARD_WEIGHT * genre_jaccard +
             GENRE_COUNT_WEIGHT * genre_overlap_count +
             SAME_GENRE_WEIGHT * same_genre +
             COMMUNITY_WEIGHT * community_overlap)
    
    return score

# Normalize genre_overlap_count for fair comparison
scaler_genre = MinMaxScaler()
link_pred_df['genre_overlap_count_norm'] = scaler_genre.fit_transform(link_pred_df[['genre_overlap_count']])

# assume linear relationship (for ease of testing and efficiency of code)
# iterate each weight from 0 to 1 with step size 0.1 holding fixed othes at 0.5

optimal_weights = {}

for iterating_val in ['CN_WEIGHT', 'JACCARD_WEIGHT', 'GENRE_JACCARD_WEIGHT', 'GENRE_COUNT_WEIGHT', 'SAME_GENRE_WEIGHT', 'COMMUNITY_WEIGHT']:

    # store best hit value
    best_hits = 0
    optimal_weights[iterating_val] = 0

    for weight in np.arange(0, 1.1, 0.1):
        weights = {
            'CN_WEIGHT': 0.5,
            'JACCARD_WEIGHT': 0.5,
            'GENRE_JACCARD_WEIGHT': 0.5,
            'GENRE_COUNT_WEIGHT': 0.5,
            'SAME_GENRE_WEIGHT': 0.5,
            'COMMUNITY_WEIGHT': 0.5
        }
        weights[iterating_val] = weight

        link_pred_df['predicted_score'] = link_pred_df.apply(
            lambda row: predict(
                row['common_neighbors'],
                row['jaccard_coefficient'],
                row['genre_jaccard'],
                row['genre_overlap_count_norm'],
                row['same_genre'],
                row['community_overlap'],
                weights['CN_WEIGHT'],
                weights['JACCARD_WEIGHT'],
                weights['GENRE_JACCARD_WEIGHT'],
                weights['GENRE_COUNT_WEIGHT'],
                weights['SAME_GENRE_WEIGHT'],
                weights['COMMUNITY_WEIGHT']
            ), axis=1
        )

        print(f"\nEvaluating with {iterating_val} = {weight:.1f}")
        
        #calculate hits@k
        K = 10000
        top_k = link_pred_df.nlargest(K, 'predicted_score')
        hits = top_k['edge_exists'].sum()
        hits_at_k = hits / len(edges)
        print(f"Hits@{K}: {hits_at_k:.8f}")

        if hits_at_k > best_hits:
            best_hits = hits_at_k
            optimal_weights[iterating_val] = weight

print("Optimal Weights:")
for key, value in optimal_weights.items():
    print(f"{key}: {value}")

# print(link_pred_df.head())


Evaluating with CN_WEIGHT = 0.0
Hits@10000: 0.00045367

Evaluating with CN_WEIGHT = 0.1
Hits@10000: 0.00045754

Evaluating with CN_WEIGHT = 0.2
Hits@10000: 0.00046054

Evaluating with CN_WEIGHT = 0.3
Hits@10000: 0.00046777

Evaluating with CN_WEIGHT = 0.4
Hits@10000: 0.00047358

Evaluating with CN_WEIGHT = 0.5
Hits@10000: 0.00047975

Evaluating with CN_WEIGHT = 0.6
Hits@10000: 0.00048645

Evaluating with CN_WEIGHT = 0.7
Hits@10000: 0.00049227

Evaluating with CN_WEIGHT = 0.8
Hits@10000: 0.00049667

Evaluating with CN_WEIGHT = 0.9
Hits@10000: 0.00050214

Evaluating with CN_WEIGHT = 1.0
Hits@10000: 0.00050425

Evaluating with JACCARD_WEIGHT = 0.0
Hits@10000: 0.00044785

Evaluating with JACCARD_WEIGHT = 0.1
Hits@10000: 0.00045032

Evaluating with JACCARD_WEIGHT = 0.2
Hits@10000: 0.00045279

Evaluating with JACCARD_WEIGHT = 0.3
Hits@10000: 0.00045825

Evaluating with JACCARD_WEIGHT = 0.4
Hits@10000: 0.00046830

Evaluating with JACCARD_WEIGHT = 0.5
Hits@10000: 0.00047975

Evaluating with J

In [15]:
# define a evaluation function
from sklearn.metrics import roc_auc_score, precision_score, recall_score

def evaluate(df, graphMetric):
    # ----- AUC -----
    auc = roc_auc_score(df['label'], df[graphMetric])
    print(f"AUC for {graphMetric}: {auc}")

    # ----- Precision -----
    y_pred = [1 if s >= 0.5 else 0 for s in df[graphMetric]]
    precision = precision_score(df['label'], y_pred)
    print(f"Precision: {precision}")    

    # ----- Recall -----
    recall = recall_score(df['label'], y_pred)
    print(f"Recall: {recall}")  

    # ----- Hits@K -----
    K = 1000
    edges = list(zip(df['u'], df['v']))
    predicted_edges = pd.DataFrame({
        'edge': edges,
        'score': df[graphMetric]
    })

    # Sort by score descending
    predicted_edges_sorted = predicted_edges.sort_values('score', ascending=False)

    # Take top K edges
    top_k_edges = set(predicted_edges_sorted['edge'].iloc[:K])

    # Count how many true edges are in top K
    hits = sum(1 for edge in edges if edge in top_k_edges)

    # Compute Hits@K
    hits_at_k = hits / len(edges)
    print(f"Hits@{K}: {hits_at_k:.4f}")

    # ----- Score Distribution -----
    plt.hist(df[graphMetric], bins=50)
    plt.title("Score Distribution")
    plt.xlabel("Score")
    plt.ylabel("Frequency")
    plt.show()

    return

In [16]:
# ========== AUC-ROC EVALUATION ==========
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
print(f"AUC-ROC: {auc}")

# ========== Precision-Recall EVALUATION ==========
from sklearn.metrics import precision_recall_curve, auc as auc_metric
precision, recall, _ = precision_recall_curve(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
pr_auc = auc_metric(recall, precision)
print(f"Precision-Recall AUC: {pr_auc}")

# ========== Hits@K EVALUATION ==========   
def hits_at_k(df, k):
    # Sort by predicted score descending
    df_sorted = df.sort_values(by='predicted_score', ascending=False)
    
    # Get top K predictions
    top_k = df_sorted.head(k)
    
    # Calculate Hits@K
    hits = top_k['edge_exists'].sum()
    total_positives = df['edge_exists'].sum()
    
    return hits / total_positives if total_positives > 0 else 0 
hits_k = 100
hits_at_100 = hits_at_k(link_pred_df, hits_k)
print(f"Hits@{hits_k}: {hits_at_100}")



AUC-ROC: 0.9856104509915472
Precision-Recall AUC: 0.4401496062625018
Hits@100: 0.022985664854176965


In [24]:
# ========== FINAL EVAL WITH OPTIMAL WEIGHTS ==========

# create mapping of optimal weights
optimal_weights = {
    'CN_WEIGHT': 1.0,
    'JACCARD_WEIGHT': 1.0,
    'GENRE_JACCARD_WEIGHT': 0.0,
    'GENRE_COUNT_WEIGHT': 0.0,
    'SAME_GENRE_WEIGHT': 0.2,
    'COMMUNITY_WEIGHT': 0.8
}

# run predict with optimal weights, store results in new column of dataframe
link_pred_df['final_predicted_score'] = link_pred_df.apply(
    lambda row: predict(
        row['common_neighbors'],
        row['jaccard_coefficient'],
        row['genre_jaccard'],
        row['genre_overlap_count_norm'],
        row['same_genre'],
        row['community_overlap'],
        optimal_weights['CN_WEIGHT'],
        optimal_weights['JACCARD_WEIGHT'],
        optimal_weights['GENRE_JACCARD_WEIGHT'],
        optimal_weights['GENRE_COUNT_WEIGHT'],
        optimal_weights['SAME_GENRE_WEIGHT'],
        optimal_weights['COMMUNITY_WEIGHT']
    ), axis=1
)

# evaulation
from sklearn.metrics import roc_auc_score, precision_score, recall_score, precision_recall_curve, auc as auc_metric
import matplotlib.pyplot as plt

# ========== AUC-ROC EVALUATION ==========
auc_roc = roc_auc_score(link_pred_df['edge_exists'], link_pred_df['final_predicted_score'])
print(f"Final AUC-ROC: {auc_roc}")

# ========== Precision-Recall EVALUATION ==========
precision, recall, _ = precision_recall_curve(link_pred_df['edge_exists'], link_pred_df['final_predicted_score'])
pr_auc = auc_metric(recall, precision)
print(f"Final PR-AUC:  {pr_auc}")

# --- Precision & Recall at Threshold 0.5 (Normalized Score) ---
# Normalize final score to [0,1] for thresholding
min_score = link_pred_df['final_predicted_score'].min()
max_score = link_pred_df['final_predicted_score'].max()
link_pred_df['final_score_norm'] = (link_pred_df['final_predicted_score'] - min_score) / (max_score - min_score)

threshold = 0.5
y_pred = (link_pred_df['final_score_norm'] >= threshold).astype(int)
prec_at_thresh = precision_score(link_pred_df['edge_exists'], y_pred)
rec_at_thresh = recall_score(link_pred_df['edge_exists'], y_pred)

print(f"Precision @ {threshold}: {prec_at_thresh}")
print(f"Recall @ {threshold}:    {rec_at_thresh}")

# ========== Hits@K EVALUATION ==========   
def hits_at_k(df, k, score_col='final_predicted_score', label_col='edge_exists'):
    # Sort by score descending
    top_k = df.nlargest(k, score_col)
    # Count true positives in top K
    hits = top_k[label_col].sum()

    return hits, hits / k
K_values = [10, 100, 1000, 10000, 100000]
print("\nHits@K Analysis:")
for k in K_values:
    hits, prec_at_k = hits_at_k(link_pred_df, k)
    print(f"  Hits@{k}: {hits} (Precision@{k}: {prec_at_k:.4f})")


Final AUC-ROC: 0.986569192336466
Final PR-AUC:  0.5158836687151468
Precision @ 0.5: 0.6678571428571428
Recall @ 0.5:    0.3235294117647059

Hits@K Analysis:
  Hits@10: 10 (Precision@10: 1.0000)
  Hits@100: 100 (Precision@100: 1.0000)
  Hits@1000: 798 (Precision@1000: 0.7980)
  Hits@10000: 2857 (Precision@10000: 0.2857)
  Hits@100000: 4031 (Precision@100000: 0.0403)
