In [1]:
import pandas as pd

Import the Dataset

In [2]:
df = pd.read_csv("../Dataset/spotify_dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
# Check for missing values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# Drop 'Link' column
df = df.sample(10000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Kenny Chesney,Always Gonna Be You,I could jump on some ol' highway \r\nRun a th...
1,Rainbow,Tite Squeeze,I ain't lyin' to you baby...I hope you don't r...
2,Queen,Killer Queen,She keeps her Moet et Chandon \r\nIn her pret...
3,Owl City,Enchanted,"There I was again tonight forcing laughter, fa..."
4,Justin Bieber,Somebody To Love,Oh oh for you I'd write a symphony! \r\nI'd t...
5,Uriah Heep,Imagination,Sometimes I'm certain \r\nSometimes my mind i...
6,Neil Young,Big Time,Gonna leave the pain behind \r\nGonna leave t...
7,Ace Of Base,Wheel Of Fortune,What you gonna tell your dad \r\nIt's like a ...
8,Wishbone Ash,Lady Whiskey,"Lady Whiskey, such a sad sight, stumblin' as s..."
9,Marianne Faithfull,Go Away From My World,"Go away from my world, \r\nLet me dream alone..."


In [9]:
df['text'][0]

"I could jump on some ol' highway  \r\nRun a thousand miles or more  \r\nUnlock some hidden mystery  \r\nBehind a distant door  \r\nI could sail the seven oceans  \r\nTil I crawl upon some long forgotten shore  \r\n  \r\nBut it's always gonna be you  \r\nAlways gonna be you I'm lookin' for  \r\n  \r\nI could climb a hundred mountains  \r\nLeave a hard ol' world behind  \r\nWander right across some prairie  \r\nLike a man out of his mind  \r\nI could walk and stare into the sun  \r\nLet it all just burn me deaf and blind  \r\nBut it's always gonna be you  \r\nAlways gonna be you I'm tryin' to find  \r\n  \r\nWhere does a man go for redemption  \r\nWhere does he take a broken heart  \r\nShouldn't there be some small exemption  \r\nIf he does all that it takes  \r\nTo admit to his mistakes  \r\nTil the truth batters and breaks his world apart  \r\n  \r\nI could ask for my forgiveness  \r\nFrom the heavens high above  \r\nTell myself my prayers are gonna somehow be enough  \r\nAnd lay down

Text Cleaning/ Text Preprocessing

In [10]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kavin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

df['text'] = df['text'].apply(lambda x: tokenization(x))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [15]:
similarity[0]

array([1.        , 0.0234645 , 0.0394837 , ..., 0.11192459, 0.00801501,
       0.02874274])

In [16]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
9519,ABBA,Crying Over You,i 'm waitin ' for you babi i 'm sit all alon i...


In [17]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [18]:
recommendation('Crying Over You')

["Cryin'",
 'Epitaph',
 "Cryin' Eyes",
 'Put It On',
 'Hank And Joe And Me',
 'Moment Of Forever',
 'Blue, Blue Day',
 'Color Of The Blues',
 'Baby Blue',
 'My Sweet Lord',
 'Any Other Day',
 'Blue Morning, Blue Day',
 "Aren't You The Girl",
 "Baby's Gone Blues",
 'Hound Dog',
 'Blue For You',
 "Lord I'm Gonna Love You",
 'Black And Blue',
 'Sonnet',
 'Here Comes My Wife']

In [19]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))