In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Data

In [12]:
# Read the song CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv(
    '/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv'
)

In [13]:
# Peek at the first two rows to see which columns are available.
df.head(n=2)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."


In [14]:
# Report the (rows, columns) shape, then the number of missing values per column.
print(df.shape, df.isnull().sum(), sep='\n')

(57650, 4)
artist    0
song      0
link      0
text      0
dtype: int64


In [15]:
# Optional: uncomment to keep only the first 5000 rows of `df` for a quicker run.
# df = df.head(5000)

In [16]:
# Plain-text view of the first three rows followed by the DataFrame shape.
print(df.head(3), df.shape, sep='\n')

  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
(57650, 4)


# Preprocessing

In [17]:
# Normalise the lyrics: lower-case everything and turn newline characters into spaces.
#
# The earlier `.replace(r'\w\s', ' ')` step was removed. Without `regex=True`,
# `Series.replace` compares whole cell values against the literal pattern string,
# so it never modified any lyric. Enabling the regex as written would have
# stripped the last character of every word, so dropping the step keeps the
# cleaned text exactly as before.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'\n', ' ', regex=True)
)

# Text Tokenization

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer; `tokenization` calls `stemer.stem` on every token.
stemer = PorterStemmer()

In [9]:
def tokenization(text):
    """Tokenize ``text``, stem each token, and return them joined by single spaces.

    Tokens come from ``nltk.word_tokenize`` and each one is reduced with the
    module-level ``stemer`` before being re-joined into one string.
    """
    return " ".join(stemer.stem(token) for token in nltk.word_tokenize(text))

In [10]:
# Stem every lyric; `tokenization` is passed directly, no lambda wrapper needed.
df['text'] = df['text'].apply(tokenization)


KeyboardInterrupt



# Tfidf-Vectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Turn each lyric into a TF-IDF vector (word-level, English stop words removed).
tfidfvector = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
)
matrix = tfidfvector.fit_transform(df['text'])

# Pairwise cosine similarity between all lyrics; `similarity[i]` holds the
# scores of song i against every song, so the result grows quadratically
# with the number of rows in `df`.
similarity = cosine_similarity(matrix)

In [21]:
# Similarity scores of the first song against every song (itself included).
similarity[0, :]

array([1.        , 0.03574138, 0.01274465, ..., 0.06063918, 0.05741058,
       0.13443014])

In [22]:
# Show the row(s) whose title is exactly 'Crying Over You'.
df.loc[df['song'].eq('Crying Over You')]

Unnamed: 0,artist,song,link,text
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,i 'm waitin ' for you babi i 'm sit all alon i...


# Recommender System

In [23]:
def recommendation(song_df, top_n=20):
    """Return the titles of the songs most similar to the given song.

    Parameters
    ----------
    song_df : str
        Title to look up in ``df['song']`` (despite the name, this is a song
        title, not a DataFrame). If several rows share the title, the first
        one is used.
    top_n : int, default 20
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Song titles ordered from most to least similar according to the
        module-level ``similarity`` matrix. The queried row itself is never
        included.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``song == song_df``.
    """
    # Work with row *positions*, not index labels: `similarity` is addressed
    # positionally, so this stays correct even if `df` has a non-default index.
    positions = np.flatnonzero((df['song'] == song_df).to_numpy())
    if positions.size == 0:
        raise IndexError(f"song {song_df!r} not found in df['song']")
    idx = int(positions[0])

    scores = np.asarray(similarity[idx])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly instead of assuming it sorts first;
    # a song with identical lyrics at a lower row position would otherwise
    # push the query song into its own recommendations.
    ranked = ranked[ranked != idx][:top_n]

    return df['song'].iloc[ranked].tolist()

In [25]:
# Example: list the songs whose lyrics are closest to 'Blue, Blue Day'.
recommendation(song_df='Blue, Blue Day')

["Runnin' Blue",
 "It Keeps You Runnin'",
 'Black And Blue',
 'Baby Blue',
 'Midnight Blue',
 'All The Things You Are',
 "I'd Rather Be Blue Over You (Than Happy With Somebody Else)",
 'Blue Smoke (Kohu-Auwahi)',
 'How Much I Lied',
 'Born To Sing The Blues',
 'Blue Jean',
 'Blue Suede Shoes',
 'Am I Blue?',
 'Midnight Blues',
 'Blue Island',
 'Crying Over You',
 'Blue Christmas',
 'Almost Blue',
 'I Just Like The Feeling',
 "Miller's Angels"]

# Save Artifacts as Pickle Files

In [26]:
import pickle

# Persist the similarity matrix and the processed DataFrame so they can be
# reloaded later without recomputing them.
for filename, artifact in (('similarity.pkl', similarity), ('df.pkl', df)):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)