# Song Recommender

**Author**: Jia Desai

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

songs_df = pd.read_csv('Spotify 2010 - 2019 Top 100.csv')

In [4]:
#Features of the dataset
songs_df.columns

Index(['title', 'artist', 'top genre', 'year released', 'added', 'bpm', 'nrgy',
       'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop', 'top year',
       'artist type'],
      dtype='object')

In [5]:
# cleaning the data
# renaming columns for better understanding
new_col_names = {
    'nrgy': "energy", 
    'dnce': 'danceability', 
    'dB': 'decibel/loudness', 
    'val': 'positivity', 
    'dur': 'duration', 
    'spch': 'spoken word', 
    'pop': 'popularity'
}
songs_df_copy = songs_df.rename(columns = new_col_names)

# dropping columns which will not be used by system
col_to_drop = ['added', 'live', 'artist type', 'top year']
songs_df_copy = songs_df_copy.drop(columns = col_to_drop)

#removing na values
songs_df_copy = songs_df_copy.dropna()

#adding a featuring column to use for recommendation
songs_df_copy['featuring'] = songs_df_copy['title'].str.split('.').str[1]
songs_df_copy['featuring'] = songs_df_copy['featuring'].str.replace(')', '')

#removing the featuring artist/s from the song title
songs_df_copy['title'] = songs_df_copy['title'].str.replace(r'\s*\(feat\..*?\)', '', regex=True)

songs_df.head()

Unnamed: 0,title,artist,top genre,year released,added,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,top year,artist type
0,STARSTRUKK (feat. Katy Perry),3OH!3,dance pop,2009.0,2022‑02‑17,140.0,81.0,61.0,-6.0,23.0,23.0,203.0,0.0,6.0,70.0,2010.0,Duo
1,My First Kiss (feat. Ke$ha),3OH!3,dance pop,2010.0,2022‑02‑17,138.0,89.0,68.0,-4.0,36.0,83.0,192.0,1.0,8.0,68.0,2010.0,Duo
2,I Need A Dollar,Aloe Blacc,pop soul,2010.0,2022‑02‑17,95.0,48.0,84.0,-7.0,9.0,96.0,243.0,20.0,3.0,72.0,2010.0,Solo
3,Airplanes (feat. Hayley Williams of Paramore),B.o.B,atl hip hop,2010.0,2022‑02‑17,93.0,87.0,66.0,-4.0,4.0,38.0,180.0,11.0,12.0,80.0,2010.0,Solo
4,Nothin' on You (feat. Bruno Mars),B.o.B,atl hip hop,2010.0,2022‑02‑17,104.0,85.0,69.0,-6.0,9.0,74.0,268.0,39.0,5.0,79.0,2010.0,Solo


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [7]:
# Text vectorization (TF-IDF) for 'title', 'artist', 'top genre'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_title = tfidf_vectorizer.fit_transform(songs_df_copy['title'])
tfidf_matrix_artist = tfidf_vectorizer.fit_transform(songs_df_copy['artist'])
tfidf_matrix_genre = tfidf_vectorizer.fit_transform(songs_df_copy['top genre'])

# Combine TF-IDF matrices with numerical features
numerical_features = ['year released', 'bpm', 'energy', 'danceability', 'decibel/loudness',
                      'positivity', 'duration', 'acous', 'spoken word', 'popularity']
features_matrix = pd.concat([songs_df_copy[numerical_features],
                             pd.DataFrame(tfidf_matrix_title.toarray()),
                             pd.DataFrame(tfidf_matrix_artist.toarray()),
                             pd.DataFrame(tfidf_matrix_genre.toarray())], axis=1)

# Compute cosine similarity
cosine_sim = cosine_similarity(features_matrix, features_matrix)

# Function to recommend songs based on similarity
def recommend_songs(song_title, cosine_sim, songs_df_copy, top_n=5):
    song_index = songs_df_copy[songs_df_copy['title'] == song_title].index[0]
    sim_scores = list(enumerate(cosine_sim[song_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_sim_scores = sim_scores[1:top_n+1]  # Exclude the song itself
    recommended_song_indices = [i[0] for i in top_sim_scores]
    return songs_df_copy.iloc[recommended_song_indices]['title']

# Example: Recommend songs similar to 'Song A'
recommended_songs = recommend_songs('Lean On', cosine_sim, songs_df_copy)
print("Recommended Songs:")
print(recommended_songs)

Recommended Songs:
3      Airplanes
146      It Girl
29        Replay
491       Wasted
997       Boasty
Name: title, dtype: object
