In [78]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

In [79]:
data_path = "music_data.csv"
df = pd.read_csv(data_path)


In [80]:
df.head(5)

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:0vNPJrUrBnMFdCs8b2MTNG,Fader,spotify:artist:4W48hZAnAHVOC2c8WH8pcq,The Temper Trap,spotify:album:0V59MMtgoruvEqMv18KAOH,Conditions (Tour Edition),spotify:artist:4W48hZAnAHVOC2c8WH8pcq,The Temper Trap,2009,https://i.scdn.co/image/ab67616d0000b273f86ae8...,...,0.0353,0.000101,0.69,0.0752,0.158,134.974,4.0,,Liberation Records,"C 2010 Liberation Music, P 2010 Liberation Music"
1,spotify:track:0NpvdCO506uO58D4AbKzki,Sherry,spotify:artist:6mcrZQmgzFGRWf7C0SObou,Frankie Valli & The Four Seasons,spotify:album:0NUEQILaBzavnzcMEs4buZ,The Very Best of Frankie Valli & The 4 Seasons,spotify:artist:6mcrZQmgzFGRWf7C0SObou,Frankie Valli & The Four Seasons,2003-01-14,https://i.scdn.co/image/ab67616d0000b273b96c21...,...,0.0441,0.626,0.0,0.113,0.734,117.562,4.0,,Rhino,C © 2004 Bob Gaudio & Frankie Valli d/b/a The ...
2,spotify:track:1MtUq6Wp1eQ8PC6BbPCj8P,I Took A Pill In Ibiza - Seeb Remix,"spotify:artist:2KsP6tYLJlTBvSUxnwlVWa, spotify...","Mike Posner, Seeb",spotify:album:1Tz3Ai1guEFf4hV3d9i17K,"At Night, Alone.",spotify:artist:2KsP6tYLJlTBvSUxnwlVWa,Mike Posner,2016-05-06,https://i.scdn.co/image/ab67616d0000b273a19be7...,...,0.111,0.0353,8e-06,0.0843,0.71,101.969,4.0,,"Monster Mountain, LLC / Island","C © 2016 Island Records, a division of UMG Rec..."
3,spotify:track:59lq75uFIqzUZcgZ4CbqFG,Let Go for Tonight,spotify:artist:7qRll6DYV06u2VuRPAVqug,Foxes,spotify:album:5AQ7uKRSpAv7SNUl4j24ru,Glorious (Deluxe),spotify:artist:7qRll6DYV06u2VuRPAVqug,Foxes,2014-05-12,https://i.scdn.co/image/ab67616d0000b273ae5c7d...,...,0.0632,0.0429,2e-06,0.326,0.299,140.064,4.0,,Sign Of The Times Records,P (P) 2014 Sign Of The Times Limited under exc...
4,spotify:track:7KdcZQ3GJeGdserhK61kfv,The Way I Want To Touch You,spotify:artist:7BEfMxbaqx6dOpbtlEqScm,Captain & Tennille,spotify:album:3GUxesVyOehInaxJyCTh6d,Love Will Keep Us Together,spotify:artist:7BEfMxbaqx6dOpbtlEqScm,Captain & Tennille,1975-01-01,https://i.scdn.co/image/ab67616d0000b273e21a28...,...,0.0248,0.624,0.000112,0.343,0.597,111.29,4.0,,A&M,"C © 1975 A&M Records, P This Compilation ℗ 197..."


In [81]:
# Process numeric features
numerical_features = ['Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time Signature']
for feature in numerical_features:
    df[feature] = df[feature].fillna(0)
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Process text features
text_features = ['Track Name', 'Artist Name(s)', 'Album Name', 'Album Genres']
for feature in text_features:
    df[feature] = df[feature].fillna('').str.lower()

df['combined_text'] = df['Track Name'] + " " + df['Artist Name(s)'] + " " + df['Album Name'] + " " + df['Album Genres']

In [82]:
df.head(1)

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights,combined_text
0,spotify:track:0vNPJrUrBnMFdCs8b2MTNG,fader,spotify:artist:4W48hZAnAHVOC2c8WH8pcq,the temper trap,spotify:album:0V59MMtgoruvEqMv18KAOH,conditions (tour edition),spotify:artist:4W48hZAnAHVOC2c8WH8pcq,The Temper Trap,2009,https://i.scdn.co/image/ab67616d0000b273f86ae8...,...,-0.835885,5.33559,-0.741442,-1.789252,0.512242,0.158335,,Liberation Records,"C 2010 Liberation Music, P 2010 Liberation Music",fader the temper trap conditions (tour edition)


In [83]:
def build_model(df, numerical_features):
   #Build a combined feature matrix using text and numerical features.
    # Vectorize text data
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    text_matrix = vectorizer.fit_transform(df['combined_text'])

    # Combine text and numerical features
    numeric_matrix = csr_matrix(df[numerical_features].values)
    combined_matrix = hstack([text_matrix, numeric_matrix])

    return combined_matrix

In [84]:
def evaluate_model(df, feature_matrix):
    #Evaluate the model by measuring the average similarity of recommended songs.
    scores = []
    for idx in range(feature_matrix.shape[0]):
        cosine_sim = cosine_similarity(feature_matrix[idx], feature_matrix)
        sim_scores = sorted(cosine_sim[0], reverse=True)[1:11]
        scores.append(np.mean(sim_scores))
    return np.mean(scores)

In [85]:
feature_matrix = build_model(df, numerical_features)
print("Evaluating the model...")
avg_similarity = evaluate_model(df, feature_matrix)
print(f"Average similarity score of top-10 recommendations: {avg_similarity:.2f}")

Evaluating the model...
Average similarity score of top-10 recommendations: 0.79


In [86]:
def recommend_songs(song_title, df, feature_matrix):
    #Recommend similar songs based on the given song title.
    try:
        idx = df[df['Track Name'].str.lower() == song_title.lower()].index[0]
    except IndexError:
        return f"Song titled '{song_title}' not found in the dataset."

    cosine_sim = cosine_similarity(feature_matrix[idx], feature_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    recommendations = []
    for i, (song_idx, score) in enumerate(sim_scores[1:11]):
        song_data = {
            "rank": i + 1,
            "Track Name": df.iloc[song_idx]['Track Name'],
            "Artist Name(s)": df.iloc[song_idx]['Artist Name(s)'],
            "similarity_score": round(score, 2)
        }
        recommendations.append(song_data)

    return recommendations

In [87]:
print("\nInput a song title for recommendations:")
user_input = input("Song title: ")
recommendations = recommend_songs(user_input, df, feature_matrix)

if isinstance(recommendations, str):
    print(recommendations)
else:
    print("\nTop 10 song recommendations:")
    for song in recommendations:
        print(f"Rank {song['rank']}: {song['Track Name']} by {song['Artist Name(s)']} (Score: {song['similarity_score']})")


Input a song title for recommendations:
Song title: Starboy

Top 10 song recommendations:
Rank 1: she will be loved by maroon 5 (Score: 0.88)
Rank 2: every time you cry (with human nature) - remastered by john farnham, human nature (Score: 0.88)
Rank 3: pictures of you by the last goodnight (Score: 0.88)
Rank 4: i wanna sex you up by color me badd (Score: 0.87)
Rank 5: do it like that by ricki-lee (Score: 0.87)
Rank 6: scars to your beautiful by alessia cara (Score: 0.87)
Rank 7: down (feat. gucci mane) by fifth harmony, gucci mane (Score: 0.87)
Rank 8: i wanna sex you up - single mix by color me badd (Score: 0.87)
Rank 9: missing you by john waite (Score: 0.87)
Rank 10: chained to the rhythm by katy perry, skip marley (Score: 0.87)
