In [1]:
import pandas as pd

Import the Dataset

In [2]:
df = pd.read_csv("spotify_dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
# Check for missing values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# Drop 'Link' column
df = df.sample(9000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Depeche Mode,Jezebel,They call you Jezebel \r\nWhenever we walk in...
1,Eurythmics,Caveman Head,Feel my body breathing in \r\nTouch the softn...
2,Toto,In A Word,"Stop, taking for granted \r\nI am the one who..."
3,Jim Croce,You Don't Mess Around With Jim,Uptown got it's hustlers \r\nThe bowery got i...
4,Phil Collins,Easy Lover,Easy lover \r\nShe'll get a hold on you belie...
5,John Waite,Back On My Feet Again,I was so lonely until I met you \r\nTold myse...
6,Aerosmith,Cry Me A River,Now you say you lonely \r\nYou cry the whole ...
7,Avril Lavigne,Mobile,Went back home again \r\nThis sucks gotta pac...
8,Utopia,Last Of The New Wave Riders,The last of the new wave riders \r\nWill be t...
9,X-Ray Spex,Crystal Clear,How do you feel now? I want to know \r\nClear...


In [9]:
df['text'][0]

"They call you Jezebel  \r\nWhenever we walk in  \r\nYou're going straight to hell  \r\nFor wanted acts of sin - they say,  \r\nAnd that I'll have to pay  \r\nBut I need you just this way  \r\n  \r\nThey call you Jezebel  \r\nFor what you like to wear  \r\nYou're morally unwell  \r\nThey say you never care for me  \r\nBut what the fail to see is that your games are the key  \r\n  \r\nOpen their eyes to the beauty  \r\nOpen their hearts to the fun  \r\nOpen their minds to the idea that you don't own someone  \r\n  \r\nThey call you Jezebel  \r\nWhenever men walk by  \r\nThey say that they can tell  \r\nThe longing in your eyes is real  \r\nAnd how you really feel  \r\nBut they can't see your appeal  \r\n  \r\nJezebel\r\n\r\n"

Text Cleaning/ Text Preprocessing

In [10]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kavin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

df['text'] = df['text'].apply(lambda x: tokenization(x))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [15]:
similarity[0]

array([1.        , 0.07104576, 0.0742877 , ..., 0.03975058, 0.06350326,
       0.01103543])

In [16]:
df[df['song'] == 'Dream Of Me']

Unnamed: 0,artist,song,text
5397,Omd,Dream Of Me,say it is n't right to be alon tonight so in l...


In [17]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [18]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))