# Data Collection
+ Spotify Music dataset is collected from the following link
https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset

In [28]:
import pandas as pd

In [29]:
df = pd.read_csv('/Users/hanzilasohan/Deep Learning & Machine Learning/Music Recommender System/spotify_millsongdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [30]:
df.shape

(57650, 4)

In [31]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [32]:
df = df.sample(10000).drop('link',axis=1).reset_index(drop=True)

In [33]:
df.head()

Unnamed: 0,artist,song,text
0,Doors,Mental Floss,All deserters will be shot \r\nat 5 o'clock t...
1,Misfits,"Mommy, Can I Go Out And Kill Tonight?",Singled out the kids who are mean to me \r\nG...
2,Donna Summer,The Power Of One,You must always remember \r\nLife can be a ch...
3,Lana Del Rey,Never Let Me Go,"Hold me in your arms, \r\nLove me like your b..."
4,Kris Kristofferson,Jesus Was A Capricorn,Jesus was a Capricorn \r\nHe ate organic food...


In [34]:
#df = df.sample(10000)
#df

In [35]:
df['text'][0]

"All deserters will be shot  \r\nat 5 o'clock tomorrow morning  \r\nso get yourselves together  \r\nand quit messing around  \r\nNo more nonsense!  \r\nThe assassination occurred at 3.30  \r\nNo one was there to witness it  \r\nEven the breviaries had left their tapestries on the window  \r\nAnd we were all silenced  \r\nby the sad mildew cloud  \r\nthat followed around  \r\nI wish I was back in the land of the...  \r\nof the...  \r\nAaahm uhm...  \r\nYou can't pin that one on me  \r\nI didn't do a goddamn thing  \r\nI was just standing there  \r\nThen a bunch of guys came up  \r\nand started laying all this shit on me  \r\nNow what am I supposed to do?  \r\nI'm an American! You can't touch me!  \r\nDid you know all nuns are 42?  \r\nAnd their eyes are blue?  \r\nDid you know all table clothes  \r\nare white in France?  \r\nDid you know women wear underpants?  \r\nWell, they do!  \r\nHow does a musician imitate  \r\nthe sound of underpants sliding  \r\nover a woman's thighs,  \r\ndown 

# Text Preprocessing
+  Clean and preprocess the text by removing special characters, punctuation, and converting all letters to lowercase.
+  Tokenize the descriptions into individual words or phrases.
+  Remove stopwords (common words like "and," "the," "is," etc.) that don't provide much context.

In [36]:
df['text'] = df['text'].str.lower().replace(r'^\w\s',' ',regex=True).replace(r'\n',' ',regex=True)

In [37]:
df['text'][0]

"all deserters will be shot  \r at 5 o'clock tomorrow morning  \r so get yourselves together  \r and quit messing around  \r no more nonsense!  \r the assassination occurred at 3.30  \r no one was there to witness it  \r even the breviaries had left their tapestries on the window  \r and we were all silenced  \r by the sad mildew cloud  \r that followed around  \r i wish i was back in the land of the...  \r of the...  \r aaahm uhm...  \r you can't pin that one on me  \r i didn't do a goddamn thing  \r i was just standing there  \r then a bunch of guys came up  \r and started laying all this shit on me  \r now what am i supposed to do?  \r i'm an american! you can't touch me!  \r did you know all nuns are 42?  \r and their eyes are blue?  \r did you know all table clothes  \r are white in france?  \r did you know women wear underpants?  \r well, they do!  \r how does a musician imitate  \r the sound of underpants sliding  \r over a woman's thighs,  \r down over her ankles,  \r and over 

In [38]:
df['text'].head(10)

0    all deserters will be shot  \r at 5 o'clock to...
1    singled out the kids who are mean to me  \r ge...
2    you must always remember  \r life can be a cha...
3    hold me in your arms,  \r love me like your be...
4    jesus was a capricorn  \r he ate organic food ...
5    black dog barking in the cold grey light  \r h...
6    wake up, (wake up)  \r grab a brush and put a ...
7    along came the f-15 the swiftest on the line  ...
8     am what i am  \r i don't want praise i don't ...
9    the brick jail doors that closed behind are cr...
Name: text, dtype: object

# Tokenization (NLTK)
+ Convert the tokenized descriptions into numerical representations that can be used by machine learning models.

In [39]:
import nltk
#nltk.download('punkt_tab')

In [40]:
from nltk.stem.porter import PorterStemmer

In [41]:
stemmer = PorterStemmer()

In [42]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a =[stemmer.stem(w) for w in token]
    return " ".join(a)

In [43]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [44]:
df['text'].apply(lambda x: token(x))

0       all desert will be shot at 5 o'clock tomorrow ...
1       singl out the kid who are mean to me get strai...
2       you must alway rememb life can be a challeng l...
3       hold me in your arm , love me like your best f...
4       jesu wa a capricorn he ate organ food he belie...
                              ...                        
9995    wish a wish i dream to dream i tri to tri and ...
9996    outsid these , four wall lie , anarchi babe un...
9997    close your eye and tri to get to sleep now do ...
9998    time to make a chang time to rearrang sooner o...
9999    ( instrument intro ) i do n't want to settl do...
Name: text, Length: 10000, dtype: object

# Vectorization(TF IDF)
+  We can use techniques like TF-IDF (Term Frequency-Inverse Document Frequency) or word embeddings (Word2Vec, GloVe) for this purpose.
# Content-Based Filtering
+ one In our case, content-based filtering might be more suitable since we're focusing on analyzing the music lyrics(text). This approach recommends items similar to those the user has shown interest in.
+ two Calculate similarity scores between musics based on their preprocessed music lyrics(texts)
+ three Recommend musics that have similar descriptions to the ones the user has liked or interacted with in the past.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
tfid = TfidfVectorizer(analyzer='word',stop_words='english')

In [47]:
matrix = tfid.fit_transform(df['text']).toarray()
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
simillar = cosine_similarity(matrix)

In [49]:
simillar[0]

array([1.        , 0.01650101, 0.00508185, ..., 0.01418955, 0.        ,
       0.00100173])

# Create Recommender Function
+ This function is created in a way so that users can search for a song by the song's name. The function will return a list of similar songs for any chosen song.

In [51]:
df[df['song']=="Doors"].index[0]

4788

In [52]:
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(simillar[idx])), reverse=True, key=lambda x:x[1])
    song=[]
    for song_id in distance[1:8]:
        song.append(df.iloc[song_id[0]].song)
    return song

In [54]:
recommender("Doors")

['For The First Time',
 'Everywhere I Go',
 'If I Was The One',
 'Behind Closed Doors',
 "I Wonder Who's Kissing Her Now",
 'Behind Closed Doors',
 'Well I Wonder']

# Store it in a pickle file

In [None]:
import pickle

In [None]:
pickle.dump(simillar, open("similarity.pkl","wb")) #store my simillar
pickle.dump(df, open("df.pkl","wb")) #store my dataframe