In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('songdata.csv')
df.head(14)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \r\nI'm sitting all ...


In [4]:
df.shape

(57650, 4)

In [5]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [6]:
df.shape

(5000, 3)

In [7]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [8]:
df['text'][0]

"why are you so paranoid?  \r don't be so paranoid  \r don't be so...  \r   \r baby, don't worry about it  \r hey there, don't even think about it  \r   \r you worry bout the wrong things, the wrong things  \r you worry bout the wrong things, the wrong things  \r you worry bout the wrong things, the wrong things  \r you worry bout the wrong things, the wrong things  \r   \r all of the time, you really wanna spend your whole life alone  \r a little time out might do ya good, might do us good before you're done for\r good  \r because i could make it good, i could make it hood, i could make you come, i\r could make you gooo  \r i could make you hot, i could make you fly, make you touch the sky, hey, maybe\r sooo  \r all of the time, he be up in my, checking through my cell phone, baby no  \r you wanna kill the vibe, on another night, here's another fight, oh, here we\r go (oh, here we go)  \r   \r baby, don't worry 'bout it  \r lady, we'll go out to the floor  \r   \r anyway, they don't k

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [10]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [13]:
similarity[0]

array([1.        , 0.04056035, 0.0711061 , ..., 0.13695306, 0.0237866 ,
       0.02694411])

In [16]:
df[df['song']=='Bang-A-Boomerang'].index[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

# recommedation function

In [16]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [17]:
recommendation('Bang')

['The Prime Of Your Love',
 'Give Me A Bit',
 'Solsbury Hill',
 'Bang Bang',
 'Bang Bang Bang',
 'Shoot From The Hip',
 'Pop That Thang',
 'Just Like Forever',
 "Nothin' Else",
 'Show Me',
 'Funky Music Sho Nuff Turns Me On',
 'I Saw Her Standing There',
 'Loved',
 'Love Me Tender',
 'The Way You Move',
 'Another Day',
 'Who Will Love Me Now',
 'Learn To Love',
 "I Think I'm In Love",
 'Birthday Song']

In [18]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))