# Data Collection
+ Spotify Music dataset is collected from the following link
https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('/Users/hanzilasohan/Music Recommendation App/spotify_millsongdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df = df.sample(5000).drop('link',axis=1).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,artist,song,text
0,W.A.S.P.,I Don't Need No Doctor,I don't need no doctor \r\n'Cause I know what...
1,Bob Seger,Long Song Comin',Politician 'bout to make his speech \r\nSigna...
2,Venom,Mystique,"Mystique our majesty, Goddess unwind \r\nMyst..."
3,Guns N' Roses,One In A Million,Yes I needed some time to get away \r\nI need...
4,Hanson,Love Song,The wind--it blows through the trees \r\nClai...


In [9]:
# df = df.sample(10000)
# df

In [10]:
df['text'][0]

"I don't need no doctor  \r\n'Cause I know what's ailing me  \r\nI don't need no doctor  \r\n'Cause I know what's ailing me (yes, I do)  \r\nAll I need is my baby  \r\nYou don't know I'm in misery  \r\n  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\n  \r\nI don't need no doctor  \r\nMy prescription tells me that  \r\nI don't need no doctor  \r\nMy prescription tells me that  \r\nAll I need is my baby  \r\nYou don't know I'm in misery  \r\n  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\n  \r\nWell, the doctor said I need rest - ooh, ooh  \r\nHe put me on the critical list - ooh, ooh  \r\nKeeping me safe from harm - ooh, ooh  \r\nAll I need is her sweet charm - ooh, ooh  \r\nHe gave me a medical lotion, that  \r\nwouldn't do  \r\nOoh - yeah, my motion, oh yeah, no  \r\ndoctor no !  \r\n  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\nI don't need no doctor  \r\n  \

# Text Preprocessing
+  Clean and preprocess the text by removing special characters, punctuation, and converting all letters to lowercase.
+  Tokenize the descriptions into individual words or phrases.
+  Remove stopwords (common words like "and," "the," "is," etc.) that don't provide much context.

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s',' ',regex=True).replace(r'\n',' ',regex=True)

In [13]:
df['text'][0]

" don't need no doctor  \r 'cause i know what's ailing me  \r i don't need no doctor  \r 'cause i know what's ailing me (yes, i do)  \r all i need is my baby  \r you don't know i'm in misery  \r   \r i don't need no doctor  \r i don't need no doctor  \r   \r i don't need no doctor  \r my prescription tells me that  \r i don't need no doctor  \r my prescription tells me that  \r all i need is my baby  \r you don't know i'm in misery  \r   \r i don't need no doctor  \r i don't need no doctor  \r i don't need no doctor  \r i don't need no doctor  \r   \r well, the doctor said i need rest - ooh, ooh  \r he put me on the critical list - ooh, ooh  \r keeping me safe from harm - ooh, ooh  \r all i need is her sweet charm - ooh, ooh  \r he gave me a medical lotion, that  \r wouldn't do  \r ooh - yeah, my motion, oh yeah, no  \r doctor no !  \r   \r i don't need no doctor  \r i don't need no doctor  \r i don't need no doctor  \r i don't need no doctor  \r   \r i don't need no doctor  \r i don't

In [14]:
df['text'].head(10)

0     don't need no doctor  \r 'cause i know what's...
1    politician 'bout to make his speech  \r signal...
2    mystique our majesty, goddess unwind  \r mysti...
3    yes i needed some time to get away  \r i neede...
4    the wind--it blows through the trees  \r claim...
5    "beautiful reality"  \r   \r i sometimes belie...
6    oh, to be prince caspian afloat upon the waves...
7    this is the rags to riches story  \r of the in...
8     enter your life now make no mistake  \r can y...
9    all night long  \r hollywood  \r all night lon...
Name: text, dtype: object

# Tokenization (NLTK)
+ Convert the tokenized descriptions into numerical representations that can be used by machine learning models.

In [16]:
import nltk
#nltk.download('punkt_tab')

In [17]:
from nltk.stem.porter import PorterStemmer

In [18]:
stemmer = PorterStemmer()

In [19]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a =[stemmer.stem(w) for w in token]
    return " ".join(a)

In [20]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [21]:
df['text'] = df['text'].apply(lambda x: token(x))

In [22]:
df['text']

0       do n't need no doctor 'caus i know what 's ail...
1       politician 'bout to make hi speech signal for ...
2       mystiqu our majesti , goddess unwind mystiqu o...
3       ye i need some time to get away i need some pe...
4       the wind -- it blow through the tree claim tho...
                              ...                        
4995    am on a lone road and i am travel travel , tra...
4996    when i 'm walk besid her , peopl tell me i 'm ...
4997    you may not recal the moment that you ask me b...
4998    you give me life like lot of oxygen you treat ...
4999    he got here and wrinkl scare and cryin ' the s...
Name: text, Length: 5000, dtype: object

# Vectorization(TF IDF)
+  We can use techniques like TF-IDF (Term Frequency-Inverse Document Frequency) or word embeddings (Word2Vec, GloVe) for this purpose.
# Content-Based Filtering
+ one In our case, content-based filtering might be more suitable since we're focusing on analyzing the music lyrics(text). This approach recommends items similar to those the user has shown interest in.
+ two Calculate similarity scores between musics based on their preprocessed music lyrics(texts)
+ three Recommend musics that have similar descriptions to the ones the user has liked or interacted with in the past.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
tfid = TfidfVectorizer(analyzer='word',stop_words='english')

In [26]:
matrix = tfid.fit_transform(df['text']).toarray()
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
simillar = cosine_similarity(matrix)

In [28]:
simillar[0]

array([1.        , 0.00613287, 0.00178438, ..., 0.00652026, 0.00111279,
       0.02178839])

# Create Recommender Function
+ This function is created in a way so that users can search for a song by the song's name. The function will return a list of similar songs for any chosen song.

In [42]:
df[df['song']=="Love Song"].index[0]

4

In [46]:
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(simillar[idx])), reverse=True, key=lambda x:x[1])
    song=[]
    for song_id in distance[1:8]:
        song.append(df.iloc[song_id[0]].song)
    return song

In [48]:
recommender("Love Song")

["There's Nothing Better Than Love",
 'Do You Love Me That Much',
 'Who Do You Love',
 'Lifelong Passion',
 'Angel',
 'I Want Your Love',
 'Like You Do']

# Store it in a pickle file

In [50]:
import pickle

In [52]:
pickle.dump(simillar, open("similarity","wb")) #store my simillar

In [54]:
pickle.dump(df, open("df","wb")) #store my dataframe