In [1]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error,precision_score
import pickle

In [2]:
# Load dataset
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,The Monkees,If You Have The Time,If you have the time would you keep me in mind...
1,John Mellencamp,The Kind Of Fella I Am,Well I don't like it when I see your eyes dart...
2,Talking Heads,Heaven,Everyone is trying to get to the bar. \r\nThe...
3,Emmylou Harris,If You Were A Bluebird,If you were a bluebird you'd be a sad one \r\...
4,Kris Kristofferson,Killing Time,See that long line of people who keep standing...
5,Hanson,Great Divide,The earth is shaking under siege \r\nAnd ever...
6,Phish,The Moma Dance,And all throughout I gaze and glimpse you \r\...
7,"Harry Connick, Jr.",Heavenly,He's always smiling \r\nHe never looks mean ...
8,Kris Kristofferson,Stranger,Maybe she was smilin' in the mirror \r\nMaybe...
9,Kenny Rogers,Makes Me Wonder If I Ever Said Goodbye,Makes me wonder if I ever said goodbye \r\nSh...


Text Cleaning/ Text Preprocessing

In [9]:
# Lowercase the text and remove special characters
df['text'] = df['text'].str.lower().replace(r'^\w\s', '').replace(r'\n', ' ',regex = True)

In [10]:
# Stemming with NLTK
stemmer = PorterStemmer()

In [11]:
# Tokenize function
def token(text):
    words = nltk.word_tokenize(text)
    return " ".join([stemmer.stem(word) for word in words])

In [12]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [13]:
# Apply tokenization
df['text'] = df['text'].apply(lambda x: token(x))

In [14]:
# TF-IDF Vectorizer for text representation
tfid = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfid.fit_transform(df['text'])

In [15]:
# Cosine similarity matrix
similar = cosine_similarity(matrix)

Recommender Function

In [16]:
def recommender(song_name):
    # Check if the song exists in the dataset
    if song_name not in df['song'].values:
        return f"Song '{song_name}' not found in the dataset."
    
    # Get the index of the song
    idx = df[df['song'] == song_name].index[0]
    
    # Calculate the similarity distances for the song
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x: x[1])
    
    # Recommend top 4 similar songs
    song_recommendations = []
    for s_id in distance[1:5]:  # Exclude the first one (it's the same song)
        song_recommendations.append(df.iloc[s_id[0]].song)
    
    return song_recommendations

In [19]:
print(recommender("Heavenly"))

["I'll Never Smile Again", 'Keep Smiling', 'Girl On The Moon', 'Smile Again']


In [21]:
# Test the recommender system
print(recommender("Stranger"))

['Ale', 'Stranger Within', 'The New Maybe', 'Stranger Than The Stranger On The Shore']


In [22]:
pickle.dump(similar,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [23]:
# Load saved models (for testing purposes)
loaded_similarity = pickle.load(open("similarity.pkl", "rb"))
loaded_df = pickle.load(open("df.pkl", "rb"))

In [24]:
# Performance evaluation - Accuracy for recommendations
def recommendation_accuracy():
    # Splitting the dataset into train and test sets
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    
    predictions = []
    actual = []
    
    for i in range(len(test_data)):
        song_name = test_data.iloc[i]['song']
        actual.append(song_name)  # Correct song is the actual value
        try:
            rec_songs = recommender(song_name)
            predictions.append(rec_songs[0])  # Taking the first recommended song as the prediction
        except:
            predictions.append(song_name)  # If song not found, predict itself

    # Calculate accuracy by comparing actual songs with predicted ones
    correct_predictions = sum([1 for a, p in zip(actual, predictions) if a == p])
    accuracy = correct_predictions / len(predictions)
    
    return accuracy

# Calculate accuracy for the recommendation system
accuracy_value = recommendation_accuracy()
print(f"Accuracy for the recommendation system: {accuracy_value}")

Accuracy for the recommendation system: 0.019


In [26]:
from sklearn.model_selection import train_test_split
import numpy as np

def recommendation_metrics(k=5):
    # Splitting the dataset into train and test sets
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    
    actual = []
    predictions = []
    
    for i in range(len(test_data)):
        song_name = test_data.iloc[i]['song']
        actual.append(song_name)  # Actual song is the ground truth
        
        try:
            rec_songs = recommender(song_name)  # Get a list of recommended songs
            predictions.append(rec_songs[:k])   # Take the top K recommended songs
        except:
            predictions.append([song_name])  # If no recommendations, predict the song itself
    
    # Calculate Precision@K and MRR
    precision_at_k = 0
    mrr = 0
    
    for i in range(len(test_data)):
        actual_song = actual[i]
        predicted_songs = predictions[i]
        
        # Calculate Precision@K (whether actual song is in the top K recommendations)
        if actual_song in predicted_songs:
            precision_at_k += 1
        
        # Calculate MRR (reciprocal rank for the actual song)
        if actual_song in predicted_songs:
            rank = predicted_songs.index(actual_song) + 1  # Rank is 1-based
            mrr += 1 / rank

    # Normalize results
    precision_at_k /= len(test_data)
    mrr /= len(test_data)
    
    return precision_at_k, mrr

# Calculate metrics for the recommendation system
precision_value, mrr_value = recommendation_metrics(k=5)
print(f"Precision@5 for the recommendation system: {precision_value}")
print(f"Mean Reciprocal Rank (MRR) for the recommendation system: {mrr_value}")


Precision@5 for the recommendation system: 0.027
Mean Reciprocal Rank (MRR) for the recommendation system: 0.022166666666666668
