In [1]:
import pandas as pd

Import the Dataset

In [2]:
df = pd.read_csv("../Dataset/spotify_dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
# Check for missing values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# Drop 'Link' column
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Patti Smith,Jubilee,Oh glad day to celebrate 'Neath the cloudless ...
1,Jason Mraz,Did I Fool Ya?,"When daddy sings \r\nHes an auctioneer, aucti..."
2,Cinderella,Night Songs,Workin' this job ain't payin' the bills \r\nS...
3,Black Sabbath,It's Alright,Told you once about your friends and neighbour...
4,Don McLean,Sister Fatima,The spirit of Fatima still rules the Earth \r...
5,Nitty Gritty Dirt Band,Travelin' Mood,Feelin' kinda sad and lonely \r\nI lost my sw...
6,America,Garden Of Peace,I look inside at the scarlet room \r\nPlaces ...
7,Nitty Gritty Dirt Band,One Sure Honest Line,"I made a living writing lover's dream words, ..."
8,Johnny Cash,Big Bad John,"Big John, big John \r\n \r\nEvery morning at..."
9,Clash,The Equaliser,No! Gang boss no! \r\nWe don't want the whip!...


In [9]:
df['text'][0]

"Oh glad day to celebrate 'Neath the cloudless sky  \r\nAir so sweet Water pure  \r\nFields ripe with rye Come one, come all  \r\nGather round Discard your Sunday shoes  \r\nCome on now Oh my land  \r\nBe a jubilee Come on girl  \r\nCome on boy Be a jubilee  \r\n  \r\nOh my land Oh my good  \r\nPeople don't be shy Weave the birth of harmony  \r\nWith children's happy cries Hand in hand  \r\nWe're dancing around In a freedom ring  \r\nCome on now Oh my land  \r\nBe a jubilee Come on girl  \r\nCome on boy Be a jubilee  \r\n  \r\nWe will never fade away Doves shall multiply  \r\nYet I see hawks circling the sky Scattering our glad day  \r\nWith debt and despair What good hour  \r\nWill restore our troubled air? Come on people  \r\nGather round You know what to do  \r\nCome on people Oh my land  \r\nWhat be troubling Oh my land  \r\nWhat be troubling What be troubling  \r\nWhat be troubling you  \r\n  \r\nWe are love and the future We stand in the midst of fury and weariness  \r\nWho dream

Text Cleaning/ Text Preprocessing

In [10]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kavin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

df['text'] = df['text'].apply(lambda x: tokenization(x))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [15]:
similarity[0]

array([1.        , 0.04383557, 0.01794606, ..., 0.02556392, 0.02732831,
       0.02476589])

In [16]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
4110,UB40,Crying Over You,cri over you in the morn cri over you in the e...


In [17]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [18]:
recommendation('Crying Over You')

["Don't Cry",
 'She Never Makes Me Cry',
 'Cry A While',
 "Don't Cry Baby",
 "You Won't See Me Cry",
 "Please Don't Make Me Cry",
 "I'll Cry Instead",
 'Lonesome Lullaby',
 'No One',
 "I've Been Wrong",
 'Do It Again',
 'Two Tears',
 "Don't Come Cryin' To Me",
 'The Crying Game',
 'Cry Forever',
 'Crying Time',
 'I Let The Music Speak',
 "Don't Mind If I Do",
 'The Boy In The Bubble',
 '977']

In [19]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))