In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [15]:
df.shape

(57650, 4)

In [16]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [18]:
df.shape

(5000, 3)

In [19]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [20]:
df['text'][0]

'space   gimme wide open space   with the sun and the rain in my hair   and the wind in my face, oh...      space   gimme wide open space   with the sun and the rain in my hair   every breath that i take, oh...      space to cross, no pain, no fear   space to cross, far away from here   space to cross, no pain, no fear   space to cross, far away from here      space   gimme wide open space   with the sun and the rain in my hair   and the wind in my face, oh...      space to cross, no pain, no fear   space to cross, far away from here   space to cross, no pain, no fear   space to cross, far away from here      your face and mine   the color of the earth   we both will scream   we scream when we give birth      we both will cry when our parents leave us   we both will die, someday.      space   gimme wide open space   with the sun and the rain in my hair   and the wind in my face, oh...      your face and mine   the color of the earth   we both will scream   we scream when we give birth 

In [21]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [22]:
tokenization('this is called love loved loving')

'thi is call love love love'

In [23]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:

df['text']

0       space gim me wide open space with the sun and ...
1       is everyth a bait hook ? and are there lock on...
2       if i were you , i would n't wast my time you c...
3       it 's easi to feel like you 're all alon to fe...
4       if i had a boat i 'd go out on the ocean and i...
                              ...                        
4995    the name she gave wa carolin daughter of a min...
4996    ( kirsti maccol ) she call me up the other day...
4997    oh , come ye o come ye to bethlehem come and b...
4998    when you walk into the room , there wa voodoo ...
4999    cool as the water melt from the winter snow yo...
Name: text, Length: 5000, dtype: object

In [34]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)
matrix.shape

(5000, 17348)

In [35]:

similarity

array([[1.        , 0.0077625 , 0.01243158, ..., 0.0126471 , 0.        ,
        0.03619879],
       [0.0077625 , 1.        , 0.00244527, ..., 0.00709489, 0.00864556,
        0.01694274],
       [0.01243158, 0.00244527, 1.        , ..., 0.01660387, 0.07453968,
        0.06333891],
       ...,
       [0.0126471 , 0.00709489, 0.01660387, ..., 1.        , 0.        ,
        0.04702709],
       [0.        , 0.00864556, 0.07453968, ..., 0.        , 1.        ,
        0.00722546],
       [0.03619879, 0.01694274, 0.06333891, ..., 0.04702709, 0.00722546,
        1.        ]])

In [36]:
similarity[0]

array([1.        , 0.0077625 , 0.01243158, ..., 0.0126471 , 0.        ,
       0.03619879])

In [40]:
sorted(list(enumerate(similarity[0])))

[(0, 1.0),
 (1, 0.0077625008451945145),
 (2, 0.012431584294986914),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0059414772266021465),
 (6, 0.011570685400139533),
 (7, 0.013167624239300684),
 (8, 0.0470484175106023),
 (9, 0.03238658624926976),
 (10, 0.01164958666257941),
 (11, 0.0021346972611495565),
 (12, 0.0030369179275183717),
 (13, 0.022505227875102205),
 (14, 0.0),
 (15, 0.0),
 (16, 0.017400704737170692),
 (17, 0.004180246459417685),
 (18, 0.01370292140362462),
 (19, 0.02575685143099586),
 (20, 0.03300916160341164),
 (21, 0.020201726537909477),
 (22, 0.08905307143158309),
 (23, 0.3359336054508474),
 (24, 0.009014318904467734),
 (25, 0.0),
 (26, 0.0013801478851137021),
 (27, 0.010121422125534666),
 (28, 0.007796450209439197),
 (29, 0.09911777002142515),
 (30, 0.0),
 (31, 0.018548926586707183),
 (32, 0.005504099281098096),
 (33, 0.02128090392685724),
 (34, 0.06274419330980298),
 (35, 0.030600795888288786),
 (36, 0.1820364663190657),
 (37, 0.0021258689487254016),
 (38, 0.03220915052518753),
 (39, 0.

In [29]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [30]:
recommendation('Alma Mater')

['Brand New Dance (featuring June Carter)',
 'Hyacinth House',
 'Lentil',
 'New Way To Love',
 "Nothin' New For New Year",
 'Long Ago (And So Far Away)',
 'Pledging My Love',
 'Lose Control',
 'New Thing',
 'New York',
 "What Are You Doing New Year's Eve?",
 "Leavin' On Your Mind",
 "I'm Yours",
 "You've Got Possibilities",
 'Let Me Live',
 'Singing The Blues',
 'Things Meant To Be',
 'Brand New Second Hand',
 'A New Star',
 'Happy X-mas']

In [48]:

import pickle
pickle.dump(similarity,open('similarity.song','wb'))
pickle.dump(df,open('df.song','wb'))