In [57]:
import pandas as pd
import numpy as np

In [58]:
music=pd.read_csv('dataset/spotify_millsongdata.csv')

## PipeLine for the project
- 1. Data cleaning
- 2. data pre-Processing
- 3. text pre-Processing
- 4. Model building
- 5. testing
- 6. Deployment

## 1. Data Cleaning

In [59]:
music.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [60]:
music.shape

(57650, 4)

In [61]:
music.dtypes

artist    object
song      object
link      object
text      object
dtype: object

In [62]:
music.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [63]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [64]:
#  as the overview is an important column we will drop those rows which have nan values
music.dropna(inplace=True)

In [65]:
# also checking for duplicate values
music.duplicated().sum()

0

## 2.Data Pre-processing

In [66]:
music

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


### extracting important data from the columns

In [67]:
music['artist']= music.artist.apply(lambda x : x.lower().replace(' ',''))

In [68]:
music.head()

Unnamed: 0,artist,song,link,text
0,abba,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,abba,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,abba,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,abba,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,abba,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [69]:
music.dtypes

artist    object
song      object
link      object
text      object
dtype: object

In [70]:
music =music.sample(5000).drop('link', axis=1).reset_index(drop=True)

## 3. Text pre-processing
- stemming
- vectorization
- cosine similarity

In [71]:
music['text'] = music['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [72]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [ps.stem(w) for w in tokens]
    return " ".join(stemming)

In [73]:
music['text'] = music['text'].apply(lambda x: tokenization(x))
music['tags'] = music['text'] + music['artist']

In [74]:
music

Unnamed: 0,artist,song,text,tags
0,steviewonder,Moments Aren't Moments,"tender , warm and seren are the word that migh...","tender , warm and seren are the word that migh..."
1,p!nk,Hiccup,1 whi everi time i tri to tell you how i feel ...,1 whi everi time i tri to tell you how i feel ...
2,marilynmanson,Astonishing Panorama Of The Endtimes,the boy 's got a head like an atom bomb hang h...,the boy 's got a head like an atom bomb hang h...
3,erasure,Don't Say Your Love Is Killing Me,my eye are close and i 've noth more to say bu...,my eye are close and i 've noth more to say bu...
4,indigogirls,Life's So Strange,cold black coffe incess stream of car cigarett...,cold black coffe incess stream of car cigarett...
...,...,...,...,...
4995,michaeljackson,Alright Now,i wa a fool too blind to see i turn my back on...,i wa a fool too blind to see i turn my back on...
4996,tinaturner,Let's Spend The Night Together,"my , my , my , my do n't you worri 'bout what ...","my , my , my , my do n't you worri 'bout what ..."
4997,indigogirls,Crazy Game,crazi game i never should have start to play b...,crazi game i never should have start to play b...
4998,celinedion,A Song For You,i could n't live i could n't live without your...,i could n't live i could n't live without your...


In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
tf= TfidfVectorizer(max_features=3000,analyzer='word',stop_words='english')
vectors=tf.fit_transform(music.tags).toarray()

In [77]:
vectors.shape

(5000, 3000)

## 4. Model Building (content based consine similarity)

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)

In [79]:
similarity.shape

(5000, 5000)

## 

## 5. Model Testing

In [83]:
def recommendation(song_music):
    idx = music[music['song'] == song_music].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:11]:
        songs.append(music.iloc[m_id[0]].song)
        
    return songs

In [86]:
recommendation('Crying Over You')

['Cry Baby Cry',
 'Crying',
 'Cry, Cry, Cry',
 "Don't Cry",
 'Cry Me A River',
 "Isn't Life Strange",
 "It's All Over But The Crying",
 'Evil',
 'Cry For Love',
 'See You Tonite']

## 6. Deployment

In [89]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(music.to_dict(),open('music.pkl','wb'))