# Library Imports

In [3]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np

# Model Selection and Tuning
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, StandardScaler

# Regression Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Model Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load Datasets

In [5]:
df_liked = pd.read_csv("data/liked.csv") #Liked playlist on Spotify
df_fav_albums = pd.read_csv("data/liked_albums.csv") #Albums I've Liked in Recent Years
df_not_liked = pd.read_csv("data/did_not_like.csv") #Albums I've not liked in Recent Years
df_nmf = pd.read_csv("data/nmf.csv") #The most recent New Music Friday Playlist
df_liked_similar = pd.read_csv("data/liked_artists_only_similar.csv") #Lastfm pull of similar artists my recently played artists
df_nmf_similar = pd.read_csv("data/nmf_artist_adjacent.csv") #Lastfm pull of similar artists to this weeks NMF artists

In [6]:
df_liked.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,21dfq9YrLKrCJA4I5OduxJ,Broadcast,Michigander,Michigander,2025-02-07,210348,1,,2025-02-09T02:12:28Z,,...,9,-9.951,1,0.0298,0.916,0.346,0.0909,0.573,116.002,4
1,5FiITXz0a7G5yhEonDZjnE,Stay Home,11:11,Biig Piig,2025-02-07,164000,5,,2025-02-07T15:49:04Z,,...,0,-7.713,1,0.0273,0.383,3.6e-05,0.71,0.592,154.054,4
2,3d34BjXrlp7rOPdGosajUB,Warplane,Glutton For Punishment,Heartworms,2025-02-07,331673,2,,2025-02-07T13:17:04Z,post-punk,...,9,-12.469,0,0.0558,0.00344,0.625,0.0915,0.0681,139.998,4
3,27a1HBHlfXd1o1yEzJd8iy,Mad Catch,Glutton For Punishment,Heartworms,2025-02-07,190576,4,,2025-02-07T13:08:50Z,post-punk,...,0,-9.766,1,0.0765,0.0157,0.000789,0.136,0.744,123.086,4
4,2262bWmqomIaJXwCRHr13j,Sailor Song,Sailor Song,Gigi Perez,2024-07-26,211978,93,,2025-02-04T01:45:03Z,,...,11,-10.432,1,0.0254,0.682,6.7e-05,0.193,0.273,94.938,4


In [7]:
# Liked Albums in Recent Years
df_fav_albums.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,0UOeq7bSskoJa4cJaJOmFS,Ticking,Letter to Self,SPRINTS,2024-01-05,186949,31,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,11.0,-6.49,1.0,0.344,0.025,0.0765,0.0934,0.291,175.574,4.0
1,02bA26OEe0nNFyE3YcNx4K,Heavy,Letter to Self,SPRINTS,2024-01-05,207409,46,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,11.0,-5.925,1.0,0.0591,0.00435,0.000738,0.0877,0.189,88.581,4.0
2,7IPDhCIQlpvxVxtC1Q7Jq4,Cathedral,Letter to Self,SPRINTS,2024-01-05,179694,30,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,7.0,-6.231,1.0,0.0473,0.00978,0.0027,0.0887,0.397,119.056,4.0
3,65fPteG9ctHt2rrJxlbMr8,Shaking Their Hands,Letter to Self,SPRINTS,2024-01-05,222489,28,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,4.0,-5.658,0.0,0.0533,0.199,0.108,0.133,0.551,89.485,4.0
4,4UgkFdXpJD0fhw06BMk0bz,Adore Adore Adore,Letter to Self,SPRINTS,2024-01-05,157766,36,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,4.0,-4.401,0.0,0.257,0.0107,0.000107,0.101,0.402,176.054,4.0


In [8]:
# Albums Not Liked in Recent Years
df_not_liked.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,54KEm0VI9i3ic7VHHKHKRx,¿Cómo Así?,ORQUÍDEAS,Kali Uchis,2024-01-12,169654,57,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,6.0,-7.662,0.0,0.0892,0.0417,0.346,0.154,0.379,135.985,4.0
1,5mVkTPlTPxlQOn7kEvuM3j,Me Pongo Loca,ORQUÍDEAS,Kali Uchis,2024-01-12,177815,54,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,7.0,-8.68,0.0,0.0426,0.0371,0.152,0.106,0.407,114.999,4.0
2,6XaJfhwof7qIgbbXO5tIQI,Igual Que Un Ángel (with Peso Pluma),ORQUÍDEAS,"Kali Uchis,Peso Pluma",2024-01-12,260370,76,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,"corrido,corridos tumbados,corridos bélicos,mús...",...,5.0,-5.34,0.0,0.032,0.00449,0.000663,0.185,0.482,108.001,4.0
3,52x8HIGuk1gGTlvO8CuLNS,Pensamientos Intrusivos,ORQUÍDEAS,Kali Uchis,2024-01-12,192027,60,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,9.0,-8.333,0.0,0.0394,0.575,0.0129,0.11,0.511,119.994,4.0
4,3RleMgz4iO0BNezGdSxDnY,Diosa,ORQUÍDEAS,Kali Uchis,2024-01-12,156037,59,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,5.0,-5.518,0.0,0.0668,0.0675,0.000101,0.078,0.698,107.994,4.0


In [9]:
# New Music Friday Playlist
df_nmf.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,56D5iLqv8WVcQTvyL0N0ay,Live Forever,Sharon Van Etten & The Attachment Theory,Sharon Van Etten,2025-02-07,339867,15,mmr4r23xnc6oh1c77lysfbqg4,2025-02-07T11:39:45Z,,...,0,-7.448,0,0.0367,0.217,0.128,0.107,0.124,147.096,4
1,1DO4aHyMKVW66VWnA4pNeV,Afterlife,Sharon Van Etten & The Attachment Theory,Sharon Van Etten,2025-02-07,248215,13,mmr4r23xnc6oh1c77lysfbqg4,2025-02-07T11:39:45Z,,...,2,-7.523,1,0.0284,0.33,0.0128,0.11,0.205,97.981,4
2,33Wi8Etr6ljOsBViBOKWQ5,Idiot Box,Sharon Van Etten & The Attachment Theory,Sharon Van Etten,2025-02-07,250409,22,mmr4r23xnc6oh1c77lysfbqg4,2025-02-07T11:39:45Z,,...,7,-5.178,1,0.0366,0.278,0.000358,0.108,0.612,144.828,4
3,56nsBD9bmnuW7V5wHlyVCx,Trouble,Sharon Van Etten & The Attachment Theory,Sharon Van Etten,2025-02-07,300045,12,mmr4r23xnc6oh1c77lysfbqg4,2025-02-07T11:39:45Z,,...,11,-9.927,1,0.034,0.488,0.175,0.129,0.346,110.081,4
4,1aUzvglR5E3Jgi0DdyD7nF,Indio,Sharon Van Etten & The Attachment Theory,Sharon Van Etten,2025-02-07,167383,11,mmr4r23xnc6oh1c77lysfbqg4,2025-02-07T11:39:45Z,,...,1,-5.415,1,0.0302,0.00835,3.6e-05,0.29,0.72,88.029,4


In [10]:
# Similar Artists to Recently Played Artists (Last.fm)
df_liked_similar.head()

Unnamed: 0,Artist,Similar Artists
0,Adrian Lyles,"Saylor Bell, Sofia Wylie, Julia Lester, Franki..."
1,Michigander,"Wilderado, Joe P, Krooked Kings, GRMLN, Your N..."
2,Biig Piig,"Greentea Peng, Lava La Rue, Arlo Parks, Hope T..."
3,Gigi Perez,"Them & I, searows, Bon Iver & St. Vincent, Gra..."
4,Heartworms,"Nightbus, Deadletter, Dog Race, Chalk, Gurriers"


In [11]:
# Similar Artists to NMF Artists (Last.fm)
df_nmf_similar.head()

Unnamed: 0,Artist,Similar Artists
0,Rats On Rafts,"The Homesick, Nouveau Velo, Bombay Show Pig, N..."
1,Adwaith,"Buzzard Buzzard Buzzard, Melin Melyn, Gorky's ..."
2,"""Swept Away Original Broadway Cast,The Avett B...",
3,Krept & Konan,"Cadet, Giggs, Abra Cadabra, Yxng Bane, Kojo Funds"
4,Shannon Wright,"Elysian Fields, Scout Niblett, Kristin Hersh, ..."


> A quick reminder of the standard columns of a spotify export.

In [13]:
df_liked.columns

Index(['Track ID', 'Track Name', 'Album Name', 'Artist Name(s)',
       'Release Date', 'Duration (ms)', 'Popularity', 'Added By', 'Added At',
       'Genres', 'Record Label', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature'],
      dtype='object')

In [14]:
df_liked_similar.columns

Index(['Artist', 'Similar Artists'], dtype='object')

### Limit to One Song Per Album

In [16]:
# Check the original shape sizes before filtering
print(f"Original df_liked shape: {df_liked.shape}")
print(f"Original df_fav_albums shape: {df_fav_albums.shape}")
print(f"Original df_not_liked shape: {df_not_liked.shape}")

# Now apply the filtering for the most popular song per album
df_liked = df_liked.loc[df_liked.groupby('Album Name')['Popularity'].idxmax()]
df_fav_albums = df_fav_albums.loc[df_fav_albums.groupby('Album Name')['Popularity'].idxmax()]
df_not_liked = df_not_liked.loc[df_not_liked.groupby('Album Name')['Popularity'].idxmax()]

# Check the shape again after filtering
print(f"Filtered df_liked shape: {df_liked.shape}")
print(f"Filtered df_fav_albums shape: {df_fav_albums.shape}")
print(f"Filtered df_not_liked shape: {df_not_liked.shape}")

Original df_liked shape: (5546, 23)
Original df_fav_albums shape: (6124, 23)
Original df_not_liked shape: (831, 23)
Filtered df_liked shape: (3580, 23)
Filtered df_fav_albums shape: (524, 23)
Filtered df_not_liked shape: (74, 23)


### Add Target Labels for Training Feature

In [18]:
# Assign liked scores before combining
df_liked['liked'] = 100
df_fav_albums['liked'] = 65
df_not_liked['liked'] = 30
df_nmf['liked'] = 0

# Add playlist_origin column before combining
df_liked['playlist_origin'] = 'df_liked'
df_fav_albums['playlist_origin'] = 'df_fav_albums'
df_not_liked['playlist_origin'] = 'df_not_liked'
df_nmf['playlist_origin'] = 'df_nmf'
df_liked_similar['source'] = 'liked_similar'
df_nmf_similar['source'] = 'nmf_similar'

### Check application of the target encoding

In [20]:
df_liked[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
2315,100,df_liked
3192,100,df_liked
314,100,df_liked
3552,100,df_liked
3188,100,df_liked


In [21]:
df_fav_albums[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
620,65,df_fav_albums
5230,65,df_fav_albums
5720,65,df_fav_albums
1640,65,df_fav_albums
2405,65,df_fav_albums


In [22]:
df_not_liked[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
745,30,df_not_liked
232,30,df_not_liked
262,30,df_not_liked
545,30,df_not_liked
730,30,df_not_liked


In [23]:
df_nmf[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
0,0,df_nmf
1,0,df_nmf
2,0,df_nmf
3,0,df_nmf
4,0,df_nmf


In [24]:
df_liked_similar[['Artist', 'Similar Artists', 'source']].head()

Unnamed: 0,Artist,Similar Artists,source
0,Adrian Lyles,"Saylor Bell, Sofia Wylie, Julia Lester, Franki...",liked_similar
1,Michigander,"Wilderado, Joe P, Krooked Kings, GRMLN, Your N...",liked_similar
2,Biig Piig,"Greentea Peng, Lava La Rue, Arlo Parks, Hope T...",liked_similar
3,Gigi Perez,"Them & I, searows, Bon Iver & St. Vincent, Gra...",liked_similar
4,Heartworms,"Nightbus, Deadletter, Dog Race, Chalk, Gurriers",liked_similar


In [25]:
df_nmf_similar[['Artist', 'Similar Artists', 'source']].head()

Unnamed: 0,Artist,Similar Artists,source
0,Rats On Rafts,"The Homesick, Nouveau Velo, Bombay Show Pig, N...",nmf_similar
1,Adwaith,"Buzzard Buzzard Buzzard, Melin Melyn, Gorky's ...",nmf_similar
2,"""Swept Away Original Broadway Cast,The Avett B...",,nmf_similar
3,Krept & Konan,"Cadet, Giggs, Abra Cadabra, Yxng Bane, Kojo Funds",nmf_similar
4,Shannon Wright,"Elysian Fields, Scout Niblett, Kristin Hersh, ...",nmf_similar


## Merge The Datasets

In [27]:
df = pd.concat([df_liked, df_fav_albums, df_not_liked, df_nmf], ignore_index=True)

In [28]:
#How Large is the Dataset, Now?
df.shape

(4684, 25)

In [29]:
# Remove duplicates: Keep the highest 'liked' score (100 > 65)
df = df.sort_values(by='liked', ascending=False)  # Ensures 100-rated songs come first
df = df.drop_duplicates(subset=['Track Name', 'Artist Name(s)'], keep='first')
df.shape

(4058, 25)

In [30]:
df.columns #Checking to remind myself what is all available to drop, keep seperate as metadata, etc.

Index(['Track ID', 'Track Name', 'Album Name', 'Artist Name(s)',
       'Release Date', 'Duration (ms)', 'Popularity', 'Added By', 'Added At',
       'Genres', 'Record Label', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature', 'liked', 'playlist_origin'],
      dtype='object')

#### Drop columns that won't help the model (Track ID, Added By, Added At, Time Signature)

In [32]:
df.drop(columns=['Track ID', 'Added By', 'Added At', 'Time Signature'], inplace=True)

#### Handle missing values (if any)

In [34]:
 df.isna().sum()

Track Name             0
Album Name             0
Artist Name(s)         0
Release Date           0
Duration (ms)          0
Popularity             0
Genres              2026
Record Label           1
Danceability           0
Energy                 0
Key                    0
Loudness               0
Mode                   0
Speechiness            0
Acousticness           0
Instrumentalness       0
Liveness               0
Valence                0
Tempo                  0
liked                  0
playlist_origin        0
dtype: int64

In [35]:
#Drop all Missing Genres
df = df.dropna(subset=['Genres']).reset_index(drop=True)
df.shape

(2032, 21)

In [36]:
df['Record Label'] = df['Record Label'].fillna('Unknown')

## Target Encoding Record Labels

In [38]:
def target_encode(df, column, target, smoothing=1):
    # Separate out df_nmf to ensure it's never used in encoding
    df_train = df[df['playlist_origin'] != 'df_nmf'].copy()

    mean_target = df_train[target].mean()
    label_means = df_train.groupby(column)[target].mean()
    label_counts = df_train[column].value_counts()

    smoothed_values = (label_means * label_counts + mean_target * smoothing) / (label_counts + smoothing)

    # Map with a fallback to the overall mean
    df[column + '_encoded'] = df[column].map(smoothed_values).fillna(mean_target)

    return df

# Target encode only on the training data (excludes df_nmf)
df = target_encode(df, 'Record Label', 'liked', smoothing=10)
df[['Record Label', 'Record Label_encoded', 'liked']].head()


Unnamed: 0,Record Label,Record Label_encoded,liked
0,Stax,95.180723,100
1,Columbia,94.59885,100
2,"Chromeo Recordings, Inc.",94.304491,100
3,BMG Rights Management (US) LLC,93.018389,100
4,Rebel Group,94.304491,100


#### Rechecking for Nulls

In [40]:
 df.isna().sum()

Track Name              0
Album Name              0
Artist Name(s)          0
Release Date            0
Duration (ms)           0
Popularity              0
Genres                  0
Record Label            0
Danceability            0
Energy                  0
Key                     0
Loudness                0
Mode                    0
Speechiness             0
Acousticness            0
Instrumentalness        0
Liveness                0
Valence                 0
Tempo                   0
liked                   0
playlist_origin         0
Record Label_encoded    0
dtype: int64

In [41]:
df.columns

Index(['Track Name', 'Album Name', 'Artist Name(s)', 'Release Date',
       'Duration (ms)', 'Popularity', 'Genres', 'Record Label', 'Danceability',
       'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'liked',
       'playlist_origin', 'Record Label_encoded'],
      dtype='object')

### Genre's there are a LOT of them...

In [43]:
# Filter out 'df_nmf' from training data (so it doesn't influence target encoding)
df_train = df[df['playlist_origin'] != 'df_nmf'].copy()

# Calculate the overall mean of the target variable
global_mean = df_train['liked'].mean()

# Calculate the mean target value for each genre (using only non-'df_nmf' records)
genre_means = df_train.groupby('Genres')['liked'].mean()

# Count occurrences of each genre
genre_counts = df_train['Genres'].value_counts()

# Apply smoothing to avoid overfitting, especially for rare genres
smoothing_factor = 10
smoothed_values = (genre_means * genre_counts + global_mean * smoothing_factor) / (genre_counts + smoothing_factor)

# Apply encoding with fallback to global mean
df['Genre_Encoded'] = df['Genres'].map(smoothed_values).fillna(global_mean)

# Optional: Drop the original 'Genres' column to reduce dimensionality
df.drop(columns=['Genres'], inplace=True)

# Check the top rows of the updated dataframe
print(df[['Track Name', 'Album Name', 'Artist Name(s)', 'Genre_Encoded', 'liked', 'playlist_origin']].head(20))

                                  Track Name  \
0                                   Thirteen   
1                        Put Your Records On   
2                                6 Feet Away   
3   ooh la la (feat. Greg Nice & DJ Premier)   
4                               Can You Tell   
5                                       Dove   
6                                Radio Cloud   
7                                 Put You On   
8                                Fire Escape   
9                             Love Is a Rose   
10                                  Pristine   
11                               Ride or Die   
12               Let's Dance to Joy Division   
13     Purple Rain Freestyle (Game, Blouses)   
14             Psychic City - Classixx Remix   
15                      Everybody's Birthday   
16                        Gangsta's Paradise   
17                                Born Again   
18                         Time to Walk Away   
19                                  Stan

In [44]:
df.columns

Index(['Track Name', 'Album Name', 'Artist Name(s)', 'Release Date',
       'Duration (ms)', 'Popularity', 'Record Label', 'Danceability', 'Energy',
       'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'liked',
       'playlist_origin', 'Record Label_encoded', 'Genre_Encoded'],
      dtype='object')

## Having fun with Artist Names

In [46]:
# Clean artist names in main dataframe
df['Artist Name(s)'] = df['Artist Name(s)'].str.replace(r',(\S)', r', \1', regex=True)

# Clean liked_similar dataframe
df_liked_similar['Artist'] = df_liked_similar['Artist'].str.strip().str.lower()
df_liked_similar['Similar Artists'] = df_liked_similar['Similar Artists'].str.strip().str.lower()

# Clean nmf_similar dataframe
df_nmf_similar['Artist'] = df_nmf_similar['Artist'].str.strip().str.lower()
df_nmf_similar['Similar Artists'] = df_nmf_similar['Similar Artists'].str.strip().str.lower()


In [47]:
import networkx as nx
# Create a graph for liked artists and NMF artists
G_artists = nx.Graph()
# Add edges based on similar artists (from both df_liked_similar and df_nmf_similar)
for _, row in df_liked_similar.iterrows():
    G_artists.add_edge(row['Artist'], row['Similar Artists'])
for _, row in df_nmf_similar.iterrows():
    G_artists.add_edge(row['Artist'], row['Similar Artists'])  

In [97]:
# Get list of artists you've liked (exclude df_nmf playlist_origin)
liked_artists = df[df['playlist_origin'] != 'df_nmf']['Artist Name(s)'].str.strip().str.lower().unique()

# Let's verify our liked artists
print(f"Number of liked artists: {len(liked_artists)}")
print("Sample of liked artists:", liked_artists[:5])  # Show first 5 to verify

# NEW UPDATED FUNCTION that handles multiple artists
def artist_degree_detailed(artist, liked_artists, graph):
    artist_list = [a.strip().lower() for a in artist.split(',')]
    
    # Check if any artists are in the graph
    artists_in_graph = [a for a in artist_list if a in graph]
    if not artists_in_graph:
        return 'not_in_graph'
    
    # For artists in graph, get their degrees to liked artists
    all_degrees = []
    for single_artist in artists_in_graph:
        degrees = [nx.shortest_path_length(graph, source=liked_artist, target=single_artist)
                  for liked_artist in liked_artists 
                  if liked_artist in graph and nx.has_path(graph, liked_artist, single_artist)]
        if degrees:
            all_degrees.extend(degrees)
    
    if not all_degrees:
        return 'in_graph_no_path'
    return min(all_degrees)

# Apply new detailed function
df_nmf['Artist_Degree_Detailed'] = df_nmf['Artist Name(s)'].apply(
    lambda x: artist_degree_detailed(x, liked_artists, G_artists)
)

# Show new distribution
print("Detailed Degree Statistics:")
print(df_nmf['Artist_Degree_Detailed'].value_counts())

# Show some examples of each category
for category in df_nmf['Artist_Degree_Detailed'].unique():
    print(f"\nExamples of {category}:")
    print(df_nmf[df_nmf['Artist_Degree_Detailed'] == category][['Artist Name(s)', 'Track Name']].head(3))

Number of liked artists: 1244
Sample of liked artists: ['big star' 'ritt momney' 'chromeo'
 'run the jewels, el-p, killer mike, greg nice, dj premier' 'ra ra riot']
Detailed Degree Statistics:
Artist_Degree_Detailed
in_graph_no_path    192
0                    58
not_in_graph         36
2                     3
Name: count, dtype: int64

Examples of in_graph_no_path:
     Artist Name(s)       Track Name
1743       Diffrent  When I'm With U
1746       Diffrent           Warper
1747       Diffrent      Originality

Examples of not_in_graph:
                           Artist Name(s)       Track Name
1744  Traxman, Bobby Skillz, Sinjin Hawke       Kill Da DJ
1745                              Traxman  Trax Da Prophet
1752                     Traxman, DJ TWAN    Where They At

Examples of 0:
     Artist Name(s)                Track Name
1756   Wilder Woods             Kind Of Magic
1757   Wilder Woods          Devil In My Eyes
1758   Wilder Woods  Where Do We Go From Here

Examples of 2:
    

In [103]:
# Check some stats about our data
print(f"Number of liked artists: {len(liked_artists)}")
print(f"Number of nodes in graph: {len(G_artists.nodes())}")
print(f"Number of edges in graph: {len(G_artists.edges())}")

# Check if a few random liked artists are in the graph
sample_artists = liked_artists[:3]  # First 3 liked artists
for artist in sample_artists:
    print(f"Is '{artist}' in graph? {artist in G_artists}")

Number of liked artists: 1244
Number of nodes in graph: 4185
Number of edges in graph: 2157
Is 'big star' in graph? True
Is 'ritt momney' in graph? True
Is 'chromeo' in graph? True


In [105]:
# Check for some stats about the degrees
print("\nDegree Statistics:")
print(df_nmf['Artist_Degree'].value_counts().sort_index())

# Look at some examples across different degrees
for degree in sorted(df_nmf['Artist_Degree'].unique()):
    print(f"\nArtists with degree {degree}:")
    print(df_nmf[df_nmf['Artist_Degree'] == degree][['Artist Name(s)', 'Track Name']].head(3))


Degree Statistics:
Artist_Degree
0    286
2      3
Name: count, dtype: int64

Artists with degree 0:
                           Artist Name(s)       Track Name
1743                             Diffrent  When I'm With U
1744  Traxman, Bobby Skillz, Sinjin Hawke       Kill Da DJ
1745                              Traxman  Trax Da Prophet

Artists with degree 2:
                               Artist Name(s)       Track Name
1791                       Oklou, underscores      harvest sky
1819  FROMTHEHEART, Chuck Sutton, underscores  BRAND NEW SOUND
1826      FROMTHEHEART, brakence, underscores     TR1CK0RTR34T


In [101]:
# Get Lucy Dacus' neighbors
neighbors_lucy_dacus = list(G_artists.neighbors(lucy_dacus))
print(f"Neighbors of Lucy Dacus: {neighbors_lucy_dacus}")


Neighbors of Lucy Dacus: ['julien baker, boygenius, phoebe bridgers, better oblivion community center, leith ross']


In [107]:
# Keep existing degree calculation
df_nmf['Artist_Degree'] = df_nmf['Artist Name(s)'].apply(lambda x: artist_degree(x, liked_artists, G_artists))

# Add new detailed analysis
df_nmf['Artist_Degree_Detailed'] = df_nmf['Artist Name(s)'].apply(
    lambda x: artist_degree_detailed(x, liked_artists, G_artists)
)

# Show both distributions
print("Original Degree Distribution:")
print(df_nmf['Artist_Degree'].value_counts().sort_index())

print("\nDetailed Degree Distribution:")
print(df_nmf['Artist_Degree_Detailed'].value_counts())

# Compare for a few specific examples
print("\nComparison for specific tracks:")
print(df_nmf[['Track Name', 'Artist Name(s)', 'Artist_Degree', 'Artist_Degree_Detailed']].head(10))

Original Degree Distribution:
Artist_Degree
0    286
2      3
Name: count, dtype: int64

Detailed Degree Distribution:
Artist_Degree_Detailed
in_graph_no_path    192
0                    58
not_in_graph         36
2                     3
Name: count, dtype: int64

Comparison for specific tracks:
                   Track Name                       Artist Name(s)  \
1743          When I'm With U                             Diffrent   
1744               Kill Da DJ  Traxman, Bobby Skillz, Sinjin Hawke   
1745          Trax Da Prophet                              Traxman   
1746                   Warper                             Diffrent   
1747              Originality                             Diffrent   
1748                 My Sound                             Diffrent   
1749       Last Cards - Outro          Krept & Konan, Michael Ward   
1750  Back To Me - Krept Solo                        Krept & Konan   
1751                     Rage                Krept & Konan, Ghetts   
175

In [None]:
print(G_artists.nodes)


In [None]:
df.columns

In [None]:
import seaborn as sns

def clean_and_add_features(df, df_liked_similar, df_nmf_similar):
    """
    Clean dataframe and add network features properly
    """
    # Remove duplicate columns from merges
    columns_to_drop = ['source_x', 'source_y', 'Similar_Artists_x', 
                      'Similar_Artists_y', 'source', 'Similar_Artists']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    
    # Create artist network
    G_artists = nx.Graph()
    
    # Add edges from both similar artists dataframes
    for df_similar in [df_liked_similar, df_nmf_similar]:
        for _, row in df_similar.iterrows():
            artist = row['Artist'].lower().strip()
            similar = row['Similar_Artists'].lower().strip()
            G_artists.add_edge(artist, similar)
    
    # Get liked artists
    liked_artists = set(df[df['liked'] == 1]['Artist Name(s)'].str.lower().str.strip())
    
    def calculate_artist_features(artist):
        artist = artist.lower().strip()
        if artist not in G_artists:
            return pd.Series({
                'artist_min_degree': 999,  # High number for artists not in network
                'artist_avg_degree': 999,
                'artist_total_connections': 0,
                'artist_liked_connections': 0
            })
        
        # Calculate degrees to liked artists
        degrees = []
        for liked_artist in liked_artists:
            if liked_artist in G_artists and nx.has_path(G_artists, artist, liked_artist):
                degrees.append(nx.shortest_path_length(G_artists, artist, liked_artist))
        
        # Get direct connections
        neighbors = set(G_artists.neighbors(artist))
        liked_neighbors = len(neighbors.intersection(liked_artists))
        
        return pd.Series({
            'artist_min_degree': min(degrees) if degrees else 999,
            'artist_avg_degree': np.mean(degrees) if degrees else 999,
            'artist_total_connections': len(neighbors),
            'artist_liked_connections': liked_neighbors
        })
    
    # Add network features
    network_features = df['Artist Name(s)'].apply(calculate_artist_features)
    df = pd.concat([df, network_features], axis=1)
    
    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Artist Network Features Distribution')
    
    # Plot distributions
    for idx, col in enumerate(['artist_min_degree', 'artist_avg_degree', 
                             'artist_total_connections', 'artist_liked_connections']):
        row = idx // 2
        col_idx = idx % 2
        sns.histplot(data=df[df[col] < 999], x=col, ax=axes[row, col_idx])
        axes[row, col_idx].set_title(col.replace('_', ' ').title())
    
    plt.tight_layout()
    
    return df, G_artists

## Standardize the numeric columns

In [None]:
# Define numeric columns explicitly (excluding 'artist_song_count')
numeric_cols = ['Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 
                'Speechiness', 'Acousticness', 'Instrumentalness', 
                'Liveness', 'Valence', 'Tempo']

# Ensure numeric columns exist in the dataframe before applying transformations
numeric_cols = [col for col in numeric_cols if col in df.columns]

# Ensure numeric columns don't contain NaNs before scaling
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df[numeric_cols] = df[numeric_cols].fillna(0)  # Replace any remaining NaNs with 0

# Standardize numerical features
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
#Viewing the numeric columns after standardization
df[numeric_cols]

### Seperate New Music Friday and Save it for Later!

# Data Dictionary with dtypes

| **Column Name**                    | **Description**                                                                                       | **dtype**    |
|-------------------------------------|-------------------------------------------------------------------------------------------------------|--------------|
| **Track Name**                      | The name of the track/song.                                                                          | object       |
| **Album Name**                      | The name of the album the track is part of.                                                          | object       |
| **Artist Name(s)**                  | The name(s) of the artist(s) performing the track.                                                   | object       |
| **Release Date**                    | The date the album or track was released.                                                            | object       |
| **Duration (ms)**                   | The duration of the track in milliseconds.                                                           | int64        |
| **Popularity**                      | A numerical value representing the track's popularity, typically based on streaming data.            | int64        |
| **Genres**                          | The primary genres associated with the track, usually a comma-separated list.                        | object       |
| **Danceability**                    | A value representing how suitable the track is for dancing, typically from 0 to 1.                    | float64      |
| **Energy**                          | A value representing the energy level of the track, typically from 0 to 1.                           | float64      |
| **Key**                             | The musical key of the track (C, D, etc.).                                                           | float64      |
| **Loudness**                        | The average loudness of the track in decibels (dB).                                                  | float64      |
| **Mode**                            | The mode of the track (Major/Minor).                                                                  | float64      |
| **Speechiness**                     | A value indicating the presence of spoken words in the track.                                        | float64      |
| **Acousticness**                    | A value indicating the amount of acoustic sound in the track.                                        | float64      |
| **Instrumentalness**                | A value indicating the likelihood that the track is instrumental.                                    | float64      |
| **Liveness**                        | A value representing the track's live performance aspect.                                            | float64      |
| **Valence**                         | A value indicating the track's musical positiveness or mood.                                         | float64      |
| **Tempo**                           | The tempo of the track in beats per minute (BPM).                                                    | float64      |
| **playlist_origin**                 | The source playlist of the track (e.g., 'df_liked', 'df_fav_albums', 'df_not_liked', 'df_nmf').       | object       |
| **liked**                           | The target variable indicating how much the user liked the track, ranging from 0 to 100.             | int64        |
| **Dead Oceans**                     | Binary indicator of whether the track is associated with the Dead Oceans label.                     | int64        |
| **Columbia**                        | Binary indicator of whether the track is associated with the Columbia label.                         | int64        |
| **Polyvinyl Records**               | Binary indicator of whether the track is associated with the Polyvinyl Records label.                | int64        |
| **Anti/Epitaph**                    | Binary indicator of whether the track is associated with the Anti/Epitaph label.                     | int64        |
| **Loma Vista Recordings**           | Binary indicator of whether the track is associated with the Loma Vista Recordings label.            | int64        |
| **Jagjaguwar**                      | Binary indicator of whether the track is associated with the Jagjaguwar label.                      | int64        |
| **Mom+Pop**                         | Binary indicator of whether the track is associated with the Mom+Pop label.                          | int64        |
| **Atlantic Records**                | Binary indicator of whether the track is associated with the Atlantic Records label.                 | int64        |
| **Sub Pop Records**                 | Binary indicator of whether the track is associated with the Sub Pop Records label.                  | int64        |
| **Warner Records**                  | Binary indicator of whether the track is associated with the Warner Records label.                   | int64        |
| **4AD**                             | Binary indicator of whether the track is associated with the 4AD label.                              | int64        |
| **Domino Recording Co**             | Binary indicator of whether the track is associated with the Domino Recording Co label.              | int64        |
| **Secretly Canadian**               | Binary indicator of whether the track is associated with the Secretly Canadian label.                | int64        |
| **Carpark Records**                 | Binary indicator of whether the track is associated with the Carpark Records label.                  | int64        |
| **BMG Rights Management (US) LLC**  | Binary indicator of whether the track is associated with the BMG Rights Management label.            | int64        |
| **ATO Records**                     | Binary indicator of whether the track is associated with the ATO Records label.                      | int64        |
| **Nettwerk Music Group**            | Binary indicator of whether the track is associated with the Nettwerk Music Group label.             | int64        |
| **New West Records**                | Binary indicator of whether the track is associated with the New West Records label.                 | int64        |
| **Double Double Whammy**            | Binary indicator of whether the track is associated with the Double Double Whammy label.             | int64        |
| **Saddle Creek**                    | Binary indicator of whether the track is associated with the Saddle Creek label.                     | int64        |
| **Genres_1**                        | A sub-genre or another classification of the track, typically the first in a set.                     | object       |
| **Genres_2**                        | A second sub-genre or classification of the track.                                                    | object       |
| **Genres_3**                        | A third sub-genre or classification of the track.                                                     | object       |
| **Bump_Genre_1**                    | A genre from the bumped (adjusted) genres list, typically the first genre.                           | bool         |
| **Bump_Genre_2**                    | A second genre from the bumped (adjusted) genres list.                                                | bool         |
| **Bump_Genre_3**                    | A third genre from the bumped (adjusted) genres list.                                                 | bool         |
| **Genre_Bump_Count**                | A count of how many times the genre has been bumped (adjusted) based on user input or algorithm.      | int64        |
| **Genre_Bump**                      | A cumulative score or adjustment for the genre based on user feedback or algorithmic bumping.         | int32        |
| **Genre_Bump_Score**                | A final score for the genre after all adjustments, typically used for recommendations or predictions.  | int64        |


## Clean UP the Columns to use and Not Use for the Machine Learning

In [None]:
# Drop unnecessary columns
df_cleaned = df.drop(columns=['Track Name', 'Album Name', 'Artist Name(s)', 
                              'Genres', 'Genres_1', 'Genres_2', 'Genres_3', 
                              'Popularity', 'Bump_Genre_2', 'Bump_Genre_3',
                              'Genre_Bump_Count', 'Genre_Bump', 'Genre_Bump_Score',
                              'liked', 'Release Date', 'playlist_origin',
                              'Duration (ms)'])

# Prepare the feature matrix and target variable'
X = df_cleaned
y = df['liked']  # target variable

# The data is now ready for regression modeling


In [None]:
# Separate the NMF data from the rest of the data
df_nmf_separated = df_cleaned[df['playlist_origin'] == 'df_nmf']
df_training_data = df_cleaned[df['playlist_origin'] != 'df_nmf']


## Linear Regression (Simple Model)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression
lr = LinearRegression()
lr.fit(X, y)

# Predictions
y_pred_lr = lr.predict(X)

# Metrics
mse_lr = mean_squared_error(y, y_pred_lr)
r2_lr = r2_score(y, y_pred_lr)

print(f"Linear Regression MSE: {mse_lr}")
print(f"Linear Regression R2: {r2_lr}")


## Ridge Regression (REgularized Linear Regression)

In [None]:
from sklearn.linear_model import Ridge

# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)

# Predictions
y_pred_ridge = ridge.predict(X)

# Metrics
mse_ridge = mean_squared_error(y, y_pred_ridge)
r2_ridge = r2_score(y, y_pred_ridge)

print(f"Ridge Regression MSE: {mse_ridge}")
print(f"Ridge Regression R2: {r2_ridge}")


## Lasso Regression (L1 Regularization)

In [None]:
from sklearn.linear_model import Lasso

# Lasso Regression
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)

# Predictions
y_pred_lasso = lasso.predict(X)

# Metrics
mse_lasso = mean_squared_error(y, y_pred_lasso)
r2_lasso = r2_score(y, y_pred_lasso)

print(f"Lasso Regression MSE: {mse_lasso}")
print(f"Lasso Regression R2: {r2_lasso}")

## Random Forest Regressor (Ensemble Model)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X, y)

# Predictions
y_pred_rf = rf.predict(X)

# Metrics
mse_rf = mean_squared_error(y, y_pred_rf)
r2_rf = r2_score(y, y_pred_rf)

print(f"Random Forest MSE: {mse_rf}")
print(f"Random Forest R2: {r2_rf}")


## Gradient Boosting Regressor (Advanced Ensemble Method)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor
gb = GradientBoostingRegressor(n_estimators=100)
gb.fit(X, y)

# Predictions
y_pred_gb = gb.predict(X)

# Metrics
mse_gb = mean_squared_error(y, y_pred_gb)
r2_gb = r2_score(y, y_pred_gb)

print(f"Gradient Boosting MSE: {mse_gb}")
print(f"Gradient Boosting R2: {r2_gb}")


## Get the Top 15 Features

In [None]:
# Get feature importances for Random Forest and Gradient Boosting
rf_feature_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
gb_feature_importances = pd.Series(gb.feature_importances_, index=X.columns).sort_values(ascending=False)

# Top 15 features for Random Forest
print("Top 15 Random Forest Features:")
print(rf_feature_importances.head(15))

# Top 15 features for Gradient Boosting
print("Top 15 Gradient Boosting Features:")
print(gb_feature_importances.head(15))


In [None]:
# Focus on the bump-related columns
bump_columns = ['Bump_Genre_1', 'Bump_Genre_2', 'Bump_Genre_3', 'Genre_Bump_Count', 'Genre_Bump', 'Genre_Bump_Score']

# Include 'liked' in the correlation check
df_bumps = df[bump_columns + ['liked']]

# Compute the correlation matrix for bump-related columns
bump_correlation = df_bumps.corr()

# Get the correlation values with 'liked'
liked_bump_correlation = bump_correlation['liked'].sort_values(ascending=False)

# Show the top correlated bump features with 'liked'
print(liked_bump_correlation)


## Testing against a completely different dataset of new music!

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Keep 'liked' only for the training data, ignore for NMF songs
df_cleaned = df.drop(columns=['Track Name', 'Album Name', 'Artist Name(s)', 
                              'Genres', 'Genres_1', 'Genres_2', 'Genres_3', 
                              'Bump_Genre_2', 'Bump_Genre_3', 'Popularity',
                               'Genre_Bump', 'Genre_Bump_Score',
                              'Release Date', 'playlist_origin'])  # Exclude non-numeric columns

# Separate the NMF data from the rest of the data
df_nmf_separated = df[df['playlist_origin'] == 'df_nmf']
df_training_data = df[df['playlist_origin'] != 'df_nmf']

# Prepare the feature matrix and target variable for training (keep only numeric columns)
X_train = df_training_data.select_dtypes(include=['number']).drop(columns=['liked'])  # Remove 'liked' from training features
y_train = df_training_data['liked']  # 'liked' is the target for training

# Train the Random Forest model on the training data
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)

# Prepare the feature matrix for prediction using NMF data (only numeric columns)
X_nmf = df_nmf_separated.select_dtypes(include=['number']).drop(columns=['liked'])  # No 'liked' column in the prediction set

# Predict the 'liked' scores for NMF data (without interfering with the 0s)
y_pred_nmf = rf.predict(X_nmf)

# Add the predicted scores to the NMF dataset
df_nmf_separated['Predicted_Liked_Score'] = y_pred_nmf

# Group by album name only and aggregate predicted scores
aggregated_album_scores = df_nmf_separated.groupby('Album Name').agg(
    AggregatePredictedAverage=('Predicted_Liked_Score', 'mean'),  # Take the mean of predicted scores
    Artists=('Artist Name(s)', 'first'),  # Get the first artist for the album (you could also choose a more robust method if needed)
    ReleaseDate=('Release Date', 'first')  # You could add other fun data like release date
).reset_index()

# Display the head of the aggregated results
print(aggregated_album_scores.head(20))

# Optionally, save the aggregated scores to a CSV
aggregated_album_scores.to_csv('recommended_albums.csv', index=False)

# If you want to evaluate the Random Forest model on the training data
y_pred_rf_train = rf.predict(X_train)
mse_rf = mean_squared_error(y_train, y_pred_rf_train)
r2_rf = r2_score(y_train, y_pred_rf_train)

print(f"Random Forest MSE (Training): {mse_rf}")
print(f"Random Forest R2 (Training): {r2_rf}")


In [None]:
import matplotlib.pyplot as plt

# Visualize the distribution of predicted 'liked' scores for NMF data
plt.figure(figsize=(10, 6))
plt.hist(df_nmf_separated['Predicted_Liked_Score'], bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Predicted Liked Scores (NMF)', fontsize=16)
plt.xlabel('Predicted Liked Score', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.grid(True)
plt.show()


## Classification Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print(classification_report(y_test, y_pred_rf))


In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print(classification_report(y_test, y_pred_lr))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbc.fit(X_train, y_train)

y_pred_gbc = gbc.predict(X_test)

print(classification_report(y_test, y_pred_gbc))


In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

print(classification_report(y_test, y_pred_knn))
