In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Dean Martin,For Me And The Gal,The bells are ringing for me and my gal \r\nO...
1,Phish,If I Could,"Take me to another place, she said \r\nTake m..."
2,Mazzy Star,Be My Angel,"They say it's me, \r\nThat makes you do thing..."
3,Michael Jackson,Cinderella Stay Awhile,"Cinderella, stay awhile \r\nYou're the one \..."
4,Jimi Hendrix,Lookover Yonder,Look over yonder here come the blues \r\nThe ...
5,Dusty Springfield,Getting It Right,"Hey boy, don't be shy \r\nWaiting all alone ..."
6,Faith No More,She Loves Me She Loves Me Not,I'm here alone on the telephone line \r\nI'm ...
7,Marillion,A Collection,I've got a photograph \r\nI took a picture of...
8,Meat Loaf,If You Really Want To,"You say you're all alone, \r\nAnd after all t..."
9,Rascal Flatts,Lovin' Me,Now there are days when I don't have a clue wh...


In [9]:
df['text'][0]

"The bells are ringing for me and my gal  \r\nOh, the birds are singing for me and my gal  \r\nWell, everybody's been knowin', to a wedding they're goin'  \r\nAnd for weeks they've been sewing every Susie and Sal  \r\n  \r\nThey're congregatin' for me and my gal  \r\nWhile the parson's waitin' for me and my gal  \r\nAnd sometimes I'm gonna build a little home for two  \r\nThree or four or more, in love land for me and my gal  \r\n  \r\nAnd sometimes I'm gonna build a little home for two  \r\nThree or four or more, in love land for me and my gal\r\n\r\n"

In [10]:
# df = df.sample(5000)

In [11]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.        , 0.00816529, ..., 0.01818397, 0.01523591,
       0.00111311])

In [19]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
1709,UB40,Crying Over You,cri over you in the morn cri over you in the e...


In [20]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [21]:
recommendation('Crying Over You')

['All Cried Out',
 'Tears',
 "Don't Cry",
 'Cry A While',
 "Don't Cry No Tears",
 'Crying Out For Me',
 'Crying, Waiting, Hoping',
 'Stop Crying Your Heart Out',
 'Can Anyone Explain? (No, No, No!)',
 'When The Time Comes',
 'Key To My Heart',
 "Who's Crying Now",
 'I Cried Again',
 'Cry Forever',
 'Rolling Home',
 'Everyday Goodbyes',
 'Momma Cried',
 'Make Me Smile',
 'Still Day Beneath The Sun',
 'Destination Zero']

In [22]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))