In [55]:
import pandas as pd

In [56]:
df = pd.read_csv("spotify_millsongdata.csv")

In [57]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [58]:
df.shape

(57650, 4)

In [59]:
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [60]:
df = df.sample(10000).drop("link",axis=1).reset_index(drop=True)

In [61]:
df['text'][0]

"It was a Monday night when you told me it was over, babe  \r\nAnd by the Friday night, I knew that I would be okay  \r\nDon't say it was a good thing  \r\nDon't say it was the right thing to do  \r\nDon't say it was the best thing for the both of us  \r\nWhen I'm the one playing the fool  \r\n  \r\nWhat do you want from me when I just wanna restart  \r\nYou keep coming back for me when you're the one who tore us apart  \r\nThe truth is I'm better on my own  \r\nAnd I don't wanna live in the past  \r\nSo let me restart  \r\nSo let me restart  \r\nSo let me restart  \r\n  \r\nYou've been lightin' up my phone  \r\nWorried that I'll be alone tonight  \r\nWanna to make sure that I'm fine  \r\nBut, baby, you're not on my mind, no more  \r\nI know it was the best thing for the both of us  \r\nCause you're the one who looks like a fool  \r\n  \r\nWhat do you want from me when I just wanna restart  \r\nYou keep coming back for me when you're the one who tore us apart  \r\nThe truth is I'm bett

In [62]:
df.shape

(10000, 3)

# Text Cleaning

In [63]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', " ").replace(r'\n', " ", regex=True)

In [64]:
df

Unnamed: 0,artist,song,text
0,Sam Smith,Restart,it was a monday night when you told me it was ...
1,Taylor Swift,Santa Baby,"santa baby, slip a sable under the tree, for m..."
2,Pretenders,One More Time,you believed in me \r when you had nothing to...
3,XTC,Don't Let Us Bug Ya,"don't let us bug ya \r ooh, stop shakin' boy ..."
4,Avril Lavigne,Daydream,lalala-a (lalala-a) \r yea \r lalala-a (lala...
...,...,...,...
9995,Dusty Springfield,I Can't Hear You No More,here you are again \r tellin' me you're sorry...
9996,XTC,Season Cycle,season cycle moving round and round \r pushin...
9997,Alphaville,Apollo,show me a place that ain't hell \r if there's...
9998,Beach Boys,Island Fever,do you ever get the feelin' that you got to ge...


In [65]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [66]:
stemmer = PorterStemmer()

In [67]:
def token(txt):
  token = nltk.word_tokenize(txt)
  a = [stemmer.stem(w) for w in token]
  return " ".join(a)

In [68]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [69]:
df['text'].apply(lambda x: token(x))

Unnamed: 0,text
0,it wa a monday night when you told me it wa ov...
1,"santa babi , slip a sabl under the tree , for ..."
2,you believ in me when you had noth to gain you...
3,"do n't let us bug ya ooh , stop shakin ' boy u..."
4,lalala-a ( lalala-a ) yea lalala-a ( lalala-a ...
...,...
9995,"here you are again tellin ' me you 're sorri ,..."
9996,season cycl move round and round push life up ...
9997,show me a place that ai n't hell if there 's s...
9998,do you ever get the feelin ' that you got to g...


In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [72]:
matrix = tfid.fit_transform(df['text'])

In [73]:
similar = cosine_similarity(matrix)

In [74]:
similar[0]

array([1.        , 0.02826162, 0.0170191 , ..., 0.04234839, 0.00288983,
       0.00502852])

In [76]:
df[df['song'] =='Restart'].index[0]

0

# Recommender Function

In [79]:
def recommender(song_name):
  idx = df[df['song'] ==song_name].index[0]
  distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x:x[1])
  song = []
  for s_id in distance[1:21]:
    song.append(df.iloc[s_id[0]].song)
  return song

In [80]:
recommender("Restart")

['Rain Or Shine',
 'Let Down',
 'Let Me Go',
 'Even In Heaven',
 'Let Me Go',
 "Don't Let Go",
 'Let It Be (Live)',
 'Let It Be (Live)',
 'Let It Be',
 'Let It Be',
 'Senior Year Spring Musical',
 'Let Go',
 "I Can't Be Without You",
 'Stay',
 "Don't Let It Show",
 "Can't Let You Go",
 'Never Let Go',
 "Let's Go",
 'Let Her In',
 "Don't Cry Joe"]

In [81]:
import pickle

In [83]:
pickle.dump(similar, open("similarity", "wb"))

In [84]:
pickle.dump(df, open("df", "wb"))