## Import libraries

In [92]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from cleantext import clean
import random


## Import data

In [93]:
data = pd.read_csv('./songdata.csv')
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


## Data preprocessing

- As the dataset is huge, sampling only `10000` samples from it

In [94]:
data = data.sample(n = 10_000, random_state = 42)

In [95]:
data.drop(columns = ["link"], inplace = True)

In [96]:
data.isna().sum()

artist    0
song      0
text      0
dtype: int64

In [97]:
data["artist"] = data['artist'].str.replace(",","")

- Removing the spaces b/w the names as Mikel Arteta and Mikel Jackson is not same

In [98]:
data["artist"] = data["artist"].apply(lambda x: x.replace(" ",""))

In [99]:
full_text = data["artist"] + " " + data["song"] + " " + data["text"]
full_text[:3]

56679    WishboneAsh Right Or Wrong Like to have you 'r...
224      Aerosmith This Little Light Of Mine This Littl...
32457    FallOutBoy Dance, Dance She says she's no good...
dtype: object

- Cleaning the texts

In [100]:
full_text = full_text.apply(lambda x: clean(x, no_line_breaks=True, no_numbers = True, replace_with_number="", no_punct=True))

- Finding Tf-idf score for the tokens and converting the texts to high dimensional vectors

In [101]:
tf_vect = TfidfVectorizer(stop_words = 'english', min_df = 5, max_df = 0.8)
text_vect = tf_vect.fit_transform(full_text)
print(len(tf_vect.get_feature_names_out()))

9075


- Find similarities between the songs

In [102]:
similarities = cosine_similarity(text_vect)

### Recommend songs

In [103]:
songs = data['song'].str.lower().values
songs

array(['right or wrong', 'this little light of mine', 'dance, dance', ...,
       'across the nation', 'between seventeen and twenty',
       'arrested for driving while blind'], dtype=object)

In [104]:
song_dict1 = dict(zip(songs, np.arange(10000)))  # No. of songs is from 0 to 10_000
song_dict2 = dict(zip(np.arange(10000), songs))

- Recommender function

In [105]:
def recommend(song):
    print(f"Selected song: {song.title()}")
    print("----------------------------------------------------------")
    song = song.lower()
    similar_songs = np.argsort(similarities[song_dict1[song]])[-6:-1]
    for i, song in enumerate(similar_songs):
        print(f"Recommended song{i+1}: {song_dict2[song].title()}")
        print("----------------------------------------------------------")

In [None]:
recommend(songs[random.randint(0, 10000)])  # selecting any random song by selecting random index between 0 to 10_000

Selected song: Come Tomorrow
----------------------------------------------------------
Recommended song1: Tomorrow Doesn'T Matter Tonight
----------------------------------------------------------
Recommended song2: Tomorrow Is Now
----------------------------------------------------------
Recommended song3: I Don'T Care (If Tomorrow Never Comes)
----------------------------------------------------------
Recommended song4: Look Out Here Comes Tomorrow
----------------------------------------------------------
Recommended song5: Tomorrow
----------------------------------------------------------
