In [1]:
import pandas as pd


# Load Data

In [2]:
df = pd.read_csv('spotify_millsongdata.csv')

In [3]:
df.head(2)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."


In [4]:
print(df.shape)
print(df.isnull().sum())
print(df['text'][0])

(57650, 4)
artist    0
song      0
link      0
text      0
dtype: int64
Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?




# Sampling if needed

In [5]:
#df = df.sample(25000)
#df = df.head(25000)
df = df.head(5000).drop('link', axis=1).reset_index(drop=True)

In [6]:
print(df.head(3))
print(df.shape)
print(df['text'][0])

  artist                   song  \
0   ABBA  Ahe's My Kind Of Girl   
1   ABBA       Andante, Andante   
2   ABBA         As Good As New   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
(5000, 3)
Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that s

# Preprocessing

In [7]:
df['text'] = df['text'].str.lower().replace(r'\n',' ', regex=True).replace(r'\s+', ' ',regex=True)
#df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

print(df['text'][0])

look at her face, it's a wonderful face and it means something special to me look at the way that she smiles when she sees me how lucky can one fellow be? she's just my kind of girl, she makes me feel fine who could ever believe that she could be mine? she's just my kind of girl, without her i'm blue and if she ever leaves me what could i do, what could i do? and when we go for a walk in the park and she holds me and squeezes my hand we'll go on walking for hours and talking about all the things that we plan she's just my kind of girl, she makes me feel fine who could ever believe that she could be mine? she's just my kind of girl, without her i'm blue and if she ever leaves me what could i do, what could i do? 


In [8]:
df.head(1)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face and it..."


# Tokenizatoin Text

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
stemer = PorterStemmer()

In [10]:
def tokenization(text):
    tokens = nltk.word_tokenize(text)
    stemming = [stemer.stem(w) for w in tokens]
    return " ".join(stemming)

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
print(df['text'][0])

look at her face , it 's a wonder face and it mean someth special to me look at the way that she smile when she see me how lucki can one fellow be ? she 's just my kind of girl , she make me feel fine who could ever believ that she could be mine ? she 's just my kind of girl , without her i 'm blue and if she ever leav me what could i do , what could i do ? and when we go for a walk in the park and she hold me and squeez my hand we 'll go on walk for hour and talk about all the thing that we plan she 's just my kind of girl , she make me feel fine who could ever believ that she could be mine ? she 's just my kind of girl , without her i 'm blue and if she ever leav me what could i do , what could i do ?


# TF-IDF Vectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidfvector = TfidfVectorizer(analyzer = 'word', stop_words = 'english')
tf_score_Matrix = tfidfvector.fit_transform(df['text'])
print(tf_score_Matrix)
similarity = cosine_similarity(tf_score_Matrix)
print("\n",similarity.shape)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 258627 stored elements and shape (5000, 15315)>
  Coords	Values
  (0, 7783)	0.14800893755871428
  (0, 4614)	0.19208908778681985
  (0, 15065)	0.11335149824946725
  (0, 8285)	0.11326275645518884
  (0, 12447)	0.10214356302759937
  (0, 12569)	0.1588188828284565
  (0, 14784)	0.06945130194398853
  (0, 12316)	0.10858415816142558
  (0, 7873)	0.1588188828284565
  (0, 4761)	0.2150147124960997
  (0, 7116)	0.21603097340415228
  (0, 7242)	0.4742656302735608
  (0, 5482)	0.358580846398836
  (0, 8045)	0.1379218051725576
  (0, 4750)	0.13994290291343636
  (0, 4848)	0.2517822250349658
  (0, 1273)	0.19907037399870095
  (0, 1522)	0.20890900067423668
  (0, 7515)	0.19188629092261933
  (0, 14695)	0.1837409666840814
  (0, 9730)	0.157680355706392
  (0, 6262)	0.09272523358398752
  (0, 12688)	0.18528163572080875
  (0, 5897)	0.09267965289174833
  (0, 7712)	0.0609544775748421
  :	:
  (4999, 14686)	0.15734378181914876
  (4999, 10913)	0.09754225772451629
 

In [15]:
similarity[3886]
#sorted(similarity[3886], reverse=True)

array([0.18708893, 0.02070154, 0.02585464, ..., 0.00517635, 0.01995685,
       0.02371169])

In [16]:
df[df['song'] == 'Blue, Blue Day']

Unnamed: 0,artist,song,text
3886,Dean Martin,"Blue, Blue Day","it 's been a blue , blue day , i feel like run..."


# Recommender System

In [17]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1]) # sorted list of 2-tuples (index, similarity)
    print("\n", distances)
    
    songs = []
    for tupple in distances[1:4]:
        songs.append(df.iloc[tupple[0]].song)
        
    return songs

In [18]:
recommendation('Blue, Blue Day')


 [(3886, np.float64(1.0000000000000004)), (4480, np.float64(0.5378354837422571)), (4454, np.float64(0.42504245423655956)), (247, np.float64(0.3787779000367479)), (1132, np.float64(0.37262708057503285)), (4750, np.float64(0.3576976385333633)), (4769, np.float64(0.3537162416654177)), (1099, np.float64(0.34466799579608187)), (3888, np.float64(0.3232167313281238)), (4962, np.float64(0.32105891430218453)), (3401, np.float64(0.3176026544401364)), (3756, np.float64(0.31665159672454923)), (3396, np.float64(0.31652355910079777)), (1353, np.float64(0.31504395631832577)), (583, np.float64(0.31304151156517984)), (1278, np.float64(0.31031689092642606)), (9, np.float64(0.30982722665517026)), (1146, np.float64(0.3093812970264756)), (4916, np.float64(0.3020868123280641)), (282, np.float64(0.2979295805513956)), (3502, np.float64(0.2915204216000299)), (370, np.float64(0.2900267645723575)), (1210, np.float64(0.28224831772456654)), (582, np.float64(0.2732463039106321)), (3391, np.float64(0.27160830187361

["Runnin' Blue", "It Keeps You Runnin'", 'Black And Blue']

# File as Pickle

In [19]:
import pickle
with open('similarity.pkl', 'wb') as file:
    pickle.dump(similarity, file)
    
with open('df.pkl', 'wb') as file:
    pickle.dump(df, file)