In [1]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error,precision_score
import pickle

In [2]:
# Load dataset
df = pd.read_csv("songs_data.csv")

In [3]:
df.head()

Unnamed: 0,artist,song,link,text,Sentiment,Popularity
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",0.447619,42
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",0.202222,50
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,0.300881,35
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,0.355,98
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,0.355,86


In [4]:
df.tail()

Unnamed: 0,artist,song,link,text,Sentiment,Popularity
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...,0.358824,9
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...,-0.131289,36
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...,0.129481,61
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...,0.32,25
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...,0.038304,65


In [5]:
df.shape

(57650, 6)

In [6]:
df.isnull().sum()

artist        0
song          0
link          0
text          0
Sentiment     0
Popularity    0
dtype: int64

In [7]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text,Sentiment,Popularity
0,Kirk Franklin,Without You,I can take a plane high up in the sky and fly ...,0.321467,95
1,Oingo Boingo,Sweat,Sweat! \r\nSweat! \r\nSweat! \r\nBorn for t...,0.113915,12
2,Townes Van Zandt,My Proud Mountains,My home is Colorado \r\nWith their proud moun...,0.076548,90
3,Judy Garland,The Trolley Song,With my high starched-collar and my high-toppe...,0.300256,19
4,Ella Fitzgerald,After You've Gone,After you've gone and left me crying \r\nAfte...,-0.005357,82
5,Frank Zappa,Cucamonga,"Frank zappa (lead guitar, vocals) \r\nCaptain...",-0.010714,73
6,Lata Mangeshkar,Kaha The Aap,(Kaha the aap zamane ke baad aaye hai - 2 \r\...,-0.5,38
7,Quicksilver Messenger Service,Gone Again,"You ask me why my mind goes to wanderin', \r\...",-0.047222,35
8,Hank Snow,In The Misty Moonlight,In the misty moonlight by the flickering firel...,0.193961,73
9,Who,Melancholia,"My coffee's cold, my paper's old, \r\nMy hear...",-0.272098,29


In [9]:
df['Combined'] = (
    df['artist'] + ' - ' + df['song'] + ': ' + df['text'] )

 Text Preprocessing

In [10]:
# Lowercase the text and remove special characters
df['Combined'] = df['Combined'].str.lower().replace(r'^\w\s', '').replace(r'\n', ' ',regex = True)

In [11]:
# Stemming with NLTK
stemmer = PorterStemmer()

In [12]:
# Tokenize function
def token(Combined):
    words = nltk.word_tokenize(Combined)
    return " ".join([stemmer.stem(word) for word in words])

In [13]:
# Apply tokenization
df['Combined'] = df['Combined'].apply(lambda x: token(x))

### TF-IDF Vectorizer for textual content combination representation


In [14]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfid.fit_transform(df['Combined'])

In [15]:
# Cosine similarity matrix
similar = cosine_similarity(matrix)

In [16]:
similar

array([[1.        , 0.02994381, 0.06957776, ..., 0.01320272, 0.00898276,
        0.03832247],
       [0.02994381, 1.        , 0.03488055, ..., 0.01556427, 0.00250492,
        0.02597571],
       [0.06957776, 0.03488055, 1.        , ..., 0.01590864, 0.01363007,
        0.0251882 ],
       ...,
       [0.01320272, 0.01556427, 0.01590864, ..., 1.        , 0.00292292,
        0.0332055 ],
       [0.00898276, 0.00250492, 0.01363007, ..., 0.00292292, 1.        ,
        0.00659274],
       [0.03832247, 0.02597571, 0.0251882 , ..., 0.0332055 , 0.00659274,
        1.        ]])

### Cosine similarity for numerical data


In [17]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [18]:
#Numerical features
numerical_features = df[['Sentiment', 'Popularity']]

In [19]:
# Normalize the numerical features
scaler = MinMaxScaler()
normalized_numerical = scaler.fit_transform(numerical_features)


In [20]:
numerical_similarity = cosine_similarity(normalized_numerical)


In [21]:
numerical_similarity

array([[1.        , 0.74185858, 0.9974706 , ..., 0.9873318 , 0.99807638,
        0.74091785],
       [0.74185858, 1.        , 0.69231881, ..., 0.83885723, 0.78200349,
        0.99999902],
       [0.9974706 , 0.69231881, 1.        , ..., 0.97355619, 0.99114514,
        0.69130658],
       ...,
       [0.9873318 , 0.83885723, 0.97355619, ..., 1.        , 0.99526944,
        0.83809331],
       [0.99807638, 0.78200349, 0.99114514, ..., 0.99526944, 1.        ,
        0.781129  ],
       [0.74091785, 0.99999902, 0.69130658, ..., 0.83809331, 0.781129  ,
        1.        ]])

### Combine the textual and numerical cosine similarities (by averaging)


In [36]:
combined_cosine_sim = (numerical_similarity + similar) / 2

In [37]:
similar=combined_cosine_sim

In [38]:
similar

array([[1.        , 0.56387989, 0.76549739, ..., 0.74379953, 0.75080298,
        0.565269  ],
       [0.56387989, 1.        , 0.52795925, ..., 0.63303399, 0.58712885,
        0.75649319],
       [0.76549739, 0.52795925, 1.        , ..., 0.7341443 , 0.74676638,
        0.52477699],
       ...,
       [0.74379953, 0.63303399, 0.7341443 , ..., 1.        , 0.74718281,
        0.63687136],
       [0.75080298, 0.58712885, 0.74676638, ..., 0.74718281, 1.        ,
        0.58749493],
       [0.565269  , 0.75649319, 0.52477699, ..., 0.63687136, 0.58749493,
        1.        ]])

### Recommender Function

In [25]:
def recommender(song_name):
    # Check if the song exists in the dataset
    if song_name not in df['song'].values:
        return f"Song '{song_name}' not found in the dataset."
    
    # Get the index of the song
    idx = df[df['song'] == song_name].index[0]
    
    # Calculate the similarity distances for the song
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x: x[1])
    
    # Recommend top 4 similar songs
    song_recommendations = []
    for s_id in distance[1:5]:  # Exclude the first one (it's the same song)
        song_recommendations.append(df.iloc[s_id[0]].song)
    
    return song_recommendations

In [27]:
print(recommender("Without You"))

['If You Walked Away', 'Lost', 'Child Of Mine', 'My Guitar Lies Bleeding In My Arms']


In [29]:
# Test the recommender system
print(recommender("My Proud Mountains"))

['Mountain Woman', 'The Bear Went Over The Mountain', 'How Long, How Long Blues', 'Go Tell It On The Mountain']


In [30]:
pickle.dump(similar,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [31]:
# Load saved models (for testing purposes)
loaded_similarity = pickle.load(open("similarity.pkl", "rb"))
loaded_df = pickle.load(open("df.pkl", "rb"))

### Evaluation Using MAP

In [32]:
def get_relevant_songs(song_name, top_n=5):
    
    #Get the top N most similar songs as the relevant set.
    
    if song_name not in df['song'].values:
        return []
    
    # Get the index of the song
    idx = df[df['song'] == song_name].index[0]
    
    # Calculate the similarity distances for the song
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x: x[1])
    
    # Top N relevant songs (excluding the song itself)
    relevant_songs = [df.iloc[s_id[0]].song for s_id in distance[1:top_n+1]]
    
    return set(relevant_songs)



    Calculate Average Precision for one recommendation query.
    recommended_songs: List of recommended songs.
    relevant_songs: Set of relevant songs (derived from similarity).
    

In [33]:
def average_precision(recommended_songs, relevant_songs):
    
    hits = 0
    precision_at_k = []
    
    for i, song in enumerate(recommended_songs):
        if song in relevant_songs:
            hits += 1
            precision_at_k.append(hits / (i + 1))  # Precision at rank k
    
    if not precision_at_k:
        return 0.0
    
    return sum(precision_at_k) / len(relevant_songs)


Calculate Mean Average Precision (MAP) for multiple queries.
all_recommendations: List of lists, where each sublist is a recommendation for one query.
    

In [34]:
def mean_average_precision(all_recommendations):
    avg_precisions = []
    
    for song_name in df['song'].values:  # Iterate over all songs
        relevant_songs = get_relevant_songs(song_name)  # Get relevant songs for the query
        recommended_songs = recommender(song_name)  # Get recommendations for the query
        
        # Calculate average precision for this song
        avg_prec = average_precision(recommended_songs, relevant_songs)
        avg_precisions.append(avg_prec)
    
    return sum(avg_precisions) / len(avg_precisions)

# Calculate MAP
map_score = mean_average_precision(df['song'].values)
print(f"Mean Average Precision of Tuneify (MAP): {map_score:.4f}")


Mean Average Precision of Tuneify (MAP): 0.8105
