In [43]:
import pandas as pd

In [64]:
df = pd.read_csv("spotify_millsongdata.csv")

In [65]:
df = df.sample(10000).drop('link', axis = 1).reset_index(drop = True)

In [66]:
df

Unnamed: 0,artist,song,text
0,Hank Snow,Cuba Rhumba,Now if you're gonna cruise be sure that you'll...
1,Natalie Cole,Lollipops And Roses,Tell her you care \r\nEach time you speak \r...
2,Justin Bieber,The Christmas Song,[Justin Bieber] \r\nChestnuts roasting on an ...
3,Neil Sedaka,Another Sleepless Night,"Another sleepless night, \r\nI sit alone and ..."
4,Britney Spears,Ouch,"Let's, let's play pretend \r\nLike you're my ..."
...,...,...,...
9995,Gloria Gaynor,I'm Still Yours,"""Been a lot of places. \r\nAnd met a lot of g..."
9996,"Harry Connick, Jr.",The Jitterbug,Who's that hiding in the tree top? \r\nIt's t...
9997,Dean Martin,La Paloma,When I left Havana nobody saw me go \r\nBut m...
9998,Reo Speedwagon,Tough Guys,She doesn't like the tough guys. \r\nThey thi...


text Preprocessing

In [67]:
df['text'] = df['text'].str.lower().replace(r'^\w\s',' ').replace(r'\n', ' ', regex = True)


In [68]:
import nltk
from nltk.stem.porter import PorterStemmer

In [69]:
stemmer = PorterStemmer()

In [70]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [71]:
df['text'].apply(lambda x: token(x))

0       now if you 're gon na cruis be sure that you '...
1       tell her you care each time you speak make it ...
2       [ justin bieber ] chestnut roast on an open fi...
3       anoth sleepless night , i sit alon and cri , t...
4       let 's , let 's play pretend like you 're my f...
                              ...                        
9995    `` been a lot of place . and met a lot of guy ...
9996    who 's that hide in the tree top ? it 's that ...
9997    when i left havana nobodi saw me go but my lit...
9998    she doe n't like the tough guy . they think th...
9999    `` oh wow , man ! '' `` wait a second man . wh...
Name: text, Length: 10000, dtype: object

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
tfid = TfidfVectorizer(analyzer = 'word', stop_words = 'english')

In [74]:
matrix  = tfid.fit_transform(df['text'])

In [75]:
similer = cosine_similarity(matrix)

In [76]:
df['song']

0                   Cuba Rhumba
1           Lollipops And Roses
2            The Christmas Song
3       Another Sleepless Night
4                          Ouch
                 ...           
9995            I'm Still Yours
9996              The Jitterbug
9997                  La Paloma
9998                 Tough Guys
9999            Hot For Teacher
Name: song, Length: 10000, dtype: object

recommender function

In [80]:
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(similer[idx])), reverse=True, key = lambda x:x[1])
    song = []
    for s_id in distance[1:21]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [81]:
recommender("I'm Still Yours")

['Just As I Am',
 'Heart Full Of Soul',
 'A Lot Of You Left In Me',
 'If It Were You',
 'Before I Met You',
 'Baby',
 'Asleep From Day',
 'Always Alone',
 "I Don't Wanna Talk About It",
 'Piece Of My Heart',
 "Love's About To Change My Heart",
 'Heart With Your Name On It',
 'Nothing Like This',
 'You Got The Wrong Man',
 'No One But You',
 'Your Heart Will Lead You Home',
 'From The Heart',
 'Dreams Come True',
 'I Wanna Be Free',
 'If I Never See Your Face Again']

In [91]:
import pickle

In [92]:
pickle.dump(similer, open("similarity.pkl", "wb"))

In [93]:
pickle.dump(df, open("df.pkl", "wb"))