# Combining Learnings

Through testing with strucutral-based link prediction and community-based link prediction, we've seen that an optimal combination of both of these techniques should yield a more optimal model. Here, we will implement this:

For this model testing we will only work with 1000 artists from era 5 with successful API calls. 


In [1]:
# Note: Imports

import pandas as pd
import networkx as nx
import numpy as np

artists = pd.read_parquet('../data/final_data_processed/discogs_artists.parquet')
edges = pd.read_parquet('../data/final_data_processed/discogs_edges.parquet') # Note: These will be our nodes

edges.shape, artists.shape

((5673764, 3), (1045947, 1))

In [2]:
# Note: First we will sort the edges chronologically
edges_sorted = edges.sort_values(by='release_year')
edges_sorted[:100]

era5_edges = edges_sorted[(edges_sorted['release_year'] >= 2000) & (edges_sorted['release_year'] <= 2025)]

era5_artists = pd.DataFrame({'discogs_artist_id': pd.unique(pd.concat([era5_edges.source_id, era5_edges.target_id]))})
print('Era 5')
print(len(era5_edges), len(era5_artists))
print(era5_artists.head())


Era 5
2338167 620672
  discogs_artist_id
0            115466
1            833554
2              1768
3             65718
4            882018


In [3]:
# sample for 1000 nodes that have API Call Hits 

# ========== Get Artist Names ==========
import xml.etree.ElementTree as ET

df_artistsID = pd.read_parquet("../data/final_data_processed/discogs_artists.parquet")

# print(df.head())

# read the artists xml file so we can use the artist IDs from the parquet file to get artist names
rows = []
for event, elem in ET.iterparse("../data_raw/discogs_20251101_artists.xml", events=("end",)):
    if elem.tag == "artist":   # repeating entry tag for Discogs artists
        row = {child.tag: child.text for child in elem}

        rows.append(row)

        # free memory
        elem.clear()
        parent = elem.getparent() if hasattr(elem, "getparent") else None
        if parent is not None:
            while parent.getprevious() is not None:
                del parent[0]

df_artistsNames = pd.DataFrame(rows)
print(df_artistsNames.head(-5))



# ========== SPOTIFY API FUNCTIONS ==========
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

load_dotenv()  # loads .env into environment

client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

sp = spotipy.Spotify(auth_manager=auth_manager)

# testing to see if the API call works
artist = sp.artist("1uNFoZAHBGtllmzznpCI3s")  # Justin Bieber ID
print(artist["name"])
# method 
def get_artist(artist, market):
    results = sp.search(q=f"artist:{artist}", type="artist", limit=1, market=market)
    items = results.get("artists", {}).get("items", [])

    
    return items[0] if items else None


def get_artist_data(artist_name, market):
    artist = get_artist(artist_name, market)
    if artist:
        return artist.get("genres", [])
    else:
        return None
    



# ========== GET 1000 ARTISTS WITH SUCCESSFUL SPOTIFY API CALLS ==========
artist_genres = {}

for index, row in era5_artists.iterrows():
    artist_id = row['discogs_artist_id']
    artist_name_row = df_artistsNames[df_artistsNames['id'] == str(artist_id)]
    if not artist_name_row.empty:
        artist_name = artist_name_row.iloc[0]['name']
        genres = get_artist_data(artist_name, market="US")
        if genres is not None:
            artist_genres[artist_id] = {
                'name': artist_name,
                'genres': genres
            }
    if len(artist_genres) >= 1000:
        break

print(f"Successfully retrieved genres for {len(artist_genres)} artists.")
print(list(artist_genres.items())[:5])  # Print first 5 entries




FileNotFoundError: [Errno 2] No such file or directory: '../data_raw/discogs_20251101_artists.xml'

In [None]:
# Get all edges where both source and target artists are in artist_genres
filtered_edges = era5_edges[
    era5_edges['source_id'].isin(artist_genres.keys()) &
    era5_edges['target_id'].isin(artist_genres.keys())
]

print(f"Filtered edges count: {len(filtered_edges)}")
print(filtered_edges.head())

# Create the graph with the filtered edges and nodes
# add nodes
G = nx.Graph()

for artist_id in artist_genres.keys():
    G.add_node(artist_id, name=artist_genres[artist_id]['name'], genres=artist_genres[artist_id]['genres']) 

# add edges
for _, row in filtered_edges.iterrows():
    G.add_edge(row['source_id'], row['target_id'], release_year=row['release_year'])

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")



Filtered edges count: 45596
        source_id target_id  release_year
723768     115466   1165499          2000
3519660    833554    454293          2000
3519661    882018    859036          2000
4283352    967916   1674540          2000
1304982    396036   2816584          2000
Graph has 1000 nodes and 4208 edges.


### Prediction Design

We want to create a hybrid, weighted model combining structural and genre-based features:

**Model:** Score(u, v) = w1×CN + w2×JC + w3×GenreJaccard + w4×GenreCount + w5×SameGenre + w6×CommOverlap

Where:
- **CN** = Common Neighbors (structural)
- **JC** = Jaccard Coefficient (structural)
- **GenreJaccard** = Genre similarity (Jaccard on genre sets)
- **GenreCount** = Raw count of shared genres
- **SameGenre** = Binary flag (1 if any shared genre)
- **CommOverlap** = Community co-membership (binary)

In [None]:
import itertools
import pandas as pd
import networkx as nx

# --- 1. Get all node pairs ---
all_nodes = list(G.nodes())
all_pairs = list(itertools.combinations(all_nodes, 2))  # undirected, no self-loops

# --- 2. Precompute clustering/community info if needed ---
partition = community_louvain.best_partition(G)
clustering = nx.clustering(G)  # optional if you want local clustering later

rows = []
for u, v in all_pairs:
    # Common Neighbors
    cn = len(list(nx.common_neighbors(G, u, v)))

    # Jaccard
    jaccard = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]

    # Genre Features (3 types)
    genres_u = set(artist_genres.get(u, {}).get('genres', []))
    genres_v = set(artist_genres.get(v, {}).get('genres', []))
    
    intersection = genres_u & genres_v
    union = genres_u | genres_v
    
    # Genre Jaccard (normalized overlap)
    if union:
        genre_jaccard = len(intersection) / len(union)
    else:
        genre_jaccard = 0
    
    # Genre Overlap (raw count)
    genre_overlap_count = len(intersection)
    
    # Same Genre Flag (binary)
    same_genre = 1 if len(intersection) > 0 else 0

    # Community Overlap
    cu = partition.get(u, -1)
    cv = partition.get(v, -1)
    co = 1 if cu != -1 and cu == cv else 0

    # Edge label (1 if edge exists, 0 otherwise)
    label = 1 if G.has_edge(u, v) else 0

    rows.append((u, v, cn, jaccard, genre_jaccard, genre_overlap_count, same_genre, co, label))

# --- 3. Make DataFrame ---
link_pred_df = pd.DataFrame(rows, columns=[
    'source_id', 'target_id', 'common_neighbors', 
    'jaccard_coefficient', 'genre_jaccard', 'genre_overlap_count', 'same_genre',
    'community_overlap', 'edge_exists'
])

# Normalize common neighbors
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
link_pred_df['common_neighbors'] = scaler.fit_transform(link_pred_df[['common_neighbors']])

print(link_pred_df.head())
print(f"\nFeatures: {[col for col in link_pred_df.columns if col not in ['source_id', 'target_id', 'edge_exists']]}")

  source_id target_id  common_neighbors  jaccard_coefficient  genre_overlap  \
0    115466    833554          0.157025             0.226190       0.333333   
1    115466      1768          0.000000             0.000000       0.000000   
2    115466    882018          0.107438             0.166667       0.000000   
3    115466    967916          0.008264             0.015152       0.000000   
4    115466    265660          0.008264             0.015152       0.000000   

   community_overlap  edge_exists  
0                  0            0  
1                  0            0  
2                  0            0  
3                  0            0  
4                  0            0  


In [None]:
# ========== BACKTESTING THE FULL LINK PREDICTION MODEL ==========
def predict(common_neighbors, jaccard_coefficient, genre_jaccard, genre_overlap_count, same_genre, community_overlap, 
            CN_WEIGHT, JACCARD_WEIGHT, GENRE_JACCARD_WEIGHT, GENRE_COUNT_WEIGHT, SAME_GENRE_WEIGHT, COMMUNITY_WEIGHT):
    # Weighted sum model with all features
    score = (CN_WEIGHT * common_neighbors +
             JACCARD_WEIGHT * jaccard_coefficient +
             GENRE_JACCARD_WEIGHT * genre_jaccard +
             GENRE_COUNT_WEIGHT * genre_overlap_count +
             SAME_GENRE_WEIGHT * same_genre +
             COMMUNITY_WEIGHT * community_overlap)
    
    return score

# Normalize genre_overlap_count for fair comparison
scaler_genre = MinMaxScaler()
link_pred_df['genre_overlap_count_norm'] = scaler_genre.fit_transform(link_pred_df[['genre_overlap_count']])

# Backtest each parameter between 0 and 1 with step size 0.2
best_accuracy = 0
best_params = None

print("Starting hyperparameter search...")
param_range = [0, 0.2, 0.4, 0.6, 0.8, 1]

for CN_WEIGHT in param_range:
    for JACCARD_WEIGHT in param_range:
        for GENRE_JACCARD_WEIGHT in param_range:
            for GENRE_COUNT_WEIGHT in param_range:
                for SAME_GENRE_WEIGHT in param_range:
                    for COMMUNITY_WEIGHT in param_range:
                        # Normalize weights
                        total_weight = (CN_WEIGHT + JACCARD_WEIGHT + GENRE_JACCARD_WEIGHT + 
                                      GENRE_COUNT_WEIGHT + SAME_GENRE_WEIGHT + COMMUNITY_WEIGHT)
                        if total_weight == 0:
                            continue
                        
                        CN_W = CN_WEIGHT / total_weight
                        JACCARD_W = JACCARD_WEIGHT / total_weight
                        GENRE_JACCARD_W = GENRE_JACCARD_WEIGHT / total_weight
                        GENRE_COUNT_W = GENRE_COUNT_WEIGHT / total_weight
                        SAME_GENRE_W = SAME_GENRE_WEIGHT / total_weight
                        COMMUNITY_W = COMMUNITY_WEIGHT / total_weight
                        
                        # Make predictions
                        link_pred_df['predicted_score'] = link_pred_df.apply(
                            lambda row: predict(
                                row['common_neighbors'], 
                                row['jaccard_coefficient'], 
                                row['genre_jaccard'],
                                row['genre_overlap_count_norm'],
                                row['same_genre'],
                                row['community_overlap'],
                                CN_W, JACCARD_W, GENRE_JACCARD_W, GENRE_COUNT_W, SAME_GENRE_W, COMMUNITY_W
                            ), axis=1)
                        
                        # Classify based on threshold 0.5
                        link_pred_df['predicted_label'] = (link_pred_df['predicted_score'] >= 0.5).astype(int)
                        
                        # Calculate accuracy
                        accuracy = (link_pred_df['predicted_label'] == link_pred_df['edge_exists']).mean()
                        
                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_params = (CN_W, JACCARD_W, GENRE_JACCARD_W, GENRE_COUNT_W, SAME_GENRE_W, COMMUNITY_W)
                    
print(f"\nBest Accuracy: {best_accuracy:.4f}")
print(f"Best Parameters:")
print(f"  CN_WEIGHT: {best_params[0]:.3f}")
print(f"  JACCARD_WEIGHT: {best_params[1]:.3f}")
print(f"  GENRE_JACCARD_WEIGHT: {best_params[2]:.3f}")
print(f"  GENRE_COUNT_WEIGHT: {best_params[3]:.3f}")
print(f"  SAME_GENRE_WEIGHT: {best_params[4]:.3f}")
print(f"  COMMUNITY_WEIGHT: {best_params[5]:.3f}")

Best Accuracy: 0.9933253253253254
Best Parameters: CN_WEIGHT=0.6, JACCARD_WEIGHT=0.0, GENRE_WEIGHT=0.0, COMMUNITY_WEIGHT=0.4


In [None]:
# ========== AUC-ROC EVALUATION ==========
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
print(f"AUC-ROC: {auc}")

# ========== Precision-Recall EVALUATION ==========
from sklearn.metrics import precision_recall_curve, auc as auc_metric
precision, recall, _ = precision_recall_curve(link_pred_df['edge_exists'], link_pred_df['predicted_score'])
pr_auc = auc_metric(recall, precision)
print(f"Precision-Recall AUC: {pr_auc}")

# ========== Hits@K EVALUATION ==========   
def hits_at_k(df, k):
    # Sort by predicted score descending
    df_sorted = df.sort_values(by='predicted_score', ascending=False)
    
    # Get top K predictions
    top_k = df_sorted.head(k)
    
    # Calculate Hits@K
    hits = top_k['edge_exists'].sum()
    total_positives = df['edge_exists'].sum()
    
    return hits / total_positives if total_positives > 0 else 0 
hits_k = 100
hits_at_100 = hits_at_k(link_pred_df, hits_k)
print(f"Hits@{hits_k}: {hits_at_100}")



AUC-ROC: 0.9854267184587799
Precision-Recall AUC: 0.456117109078475
Hits@100: 0.023727137913989126
