In [20]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [21]:
df = pd.read_csv("spotify_millsongdata.csv")

In [22]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [23]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [24]:
df.shape

(57650, 4)

In [25]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [26]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [27]:
df.head(10)

Unnamed: 0,artist,song,text
0,Gary Numan,This Is New Love,We are strangers here \r\nI suppose \r\nWe a...
1,Reo Speedwagon,M-E,"Lately we're acting like two kids in school, ..."
2,UB40,Don't Blame Me,I think its only fair to say \r\nWe'd welcome...
3,Toto,It's A Feeling,"It's a feeling, I don't belong here \r\nBut a..."
4,Leo Sayer,Easy To Love,It's easy to love when I love someone like you...
5,Rihanna,Talk That Talk,"Talk that talk to me, yeah \r\nTalk that talk..."
6,Rush,Open Secrets,It went right by me \r\nAt the time it went o...
7,Joy Division,Atmosphere,"Walk in silence, \r\nDon't walk away, in sile..."
8,Kenny Chesney,Being Drunk's A Lot Like Loving You,"Well I drank till I stumbled, \r\nI drank til..."
9,Billy Joel,That's Not Her Style,Some people think \r\nThat she's one of those...


In [28]:
df['text'][0]

"We are strangers here  \r\nI suppose  \r\nWe are not welcome  \r\nOr so I'm told.  \r\nWe are not old friends  \r\nBut believe this,  \r\nWe can be nightmares.  \r\n  \r\nPicture the man when the heartbeat stops  \r\nThis is new love.  \r\n  \r\nWe are the hunters,  \r\nSo one by one  \r\nYou know we'll find you.  \r\n  \r\nPicture the man when the heartbeat stops  \r\n  \r\nThese boys of passion  \r\nWill rule the world  \r\nPut their fingers in a dyke  \r\nWell you know it's what she needed.  \r\nThese boys of passion  \r\nWith cruel idiot smiles  \r\nFight for you.  \r\nYou know, they said so.  \r\n  \r\nCold fascination  \r\nWith dead sound.  \r\nOh God let me sleep  \r\nForever.  \r\n  \r\nPicture the man when the heartbeat stops\r\n\r\n"

In [29]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [30]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [31]:


# Initialize the Snowball stemmer
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Stem each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Join the stemmed tokens back into a string
    return " ".join(stemmed_tokens)

In [32]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [33]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [34]:
similarity[0]

array([1.        , 0.01617398, 0.03134067, ..., 0.06989239, 0.01067665,
       0.00577356])

In [35]:
df[df['song'] == 'Darkness']

Unnamed: 0,artist,song,text


In [47]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [50]:
recommendation('Atmosphere')

['Fire',
 'Silence',
 'Walk Away',
 'Walk Away',
 'Sounds Of Silence',
 'Walk Tall',
 'Walking On Broken Glass',
 "I'd Rather Go Blind",
 'Remember Me',
 'Take My Hand',
 'Throwing Needles',
 "Man's Road",
 'Reach Out',
 'All I Want',
 'Walking In My Shoes',
 'Perfect World',
 'No One Can',
 'Gone Crazy',
 'Forever Gone Forever You',
 'Houses In Motion']

In [51]:
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))