In [1]:
!pip install datasets transformers sentence-transformers faiss-cpu

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [3]:
import pandas as pd
from urllib import request
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [4]:
data_top50 = pd.read_csv("top50MusicFrom2010-2019.csv")

In [5]:
data_top50.head(10)

Unnamed: 0,title,artist,the genre of the track,year,Beats.Per.Minute -The tempo of the song,"Energy- The energy of a song - the higher the value, the more energtic","Danceability - The higher the value, the easier it is to dance to this song","Loudness/dB - The higher the value, the louder the song","Liveness - The higher the value, the more likely the song is a live recording","Valence - The higher the value, the more positive mood for the song",Length - The duration of the song,Acousticness - The higher the value the more acoustic the song is,Speechiness - The higher the value the more spoken word the song contains,Popularity- The higher the value the more popular the song is
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78
5,Baby,Justin Bieber,canadian pop,2010,65,86,73,-5,11,54,214,4,14,77
6,Dynamite,Taio Cruz,dance pop,2010,120,78,75,-4,4,82,203,0,9,77
7,Secrets,OneRepublic,dance pop,2010,148,76,52,-6,12,38,225,7,4,77
8,Empire State of Mind (Part II) Broken Down,Alicia Keys,hip pop,2010,93,37,48,-8,12,14,216,74,3,76
9,Only Girl (In The World),Rihanna,barbadian pop,2010,126,72,79,-4,7,61,235,13,4,73


In [11]:
column_names = ["title", "artist", "genre", "year", "tempo", "energy", "dancability", "loudness", "liveness", "valence", "length", "acousticness", "speechiness", "popularity"]
data_top50.columns = column_names

In [8]:
data = data_top50.copy()

In [9]:
data.describe()

Unnamed: 0,year,tempo,energy,dancability,loudness,liveness,valence,length,acousticness,speechiness,popularity
count,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0
mean,2014.59204,118.545605,70.504146,64.379768,-5.578773,17.774461,52.225539,224.674959,14.3267,8.358209,66.52073
std,2.607057,24.795358,16.310664,13.378718,2.79802,13.102543,22.51302,34.130059,20.766165,7.483162,14.517746
min,2010.0,0.0,0.0,0.0,-60.0,0.0,0.0,134.0,0.0,0.0,0.0
25%,2013.0,100.0,61.0,57.0,-6.0,9.0,35.0,202.0,2.0,4.0,60.0
50%,2015.0,120.0,74.0,66.0,-5.0,12.0,52.0,221.0,6.0,5.0,69.0
75%,2017.0,129.0,82.0,73.0,-4.0,24.0,69.0,239.5,17.0,9.0,76.0
max,2019.0,206.0,98.0,97.0,-2.0,74.0,98.0,424.0,99.0,48.0,99.0


# Categorization
## We categorize features like Tempo, Energy, Dancability, Loudness, Liveness, Valence, Length, Acousticness, Speechiness and Popularity so we can Calculate similarity among musics based on their properties.

In [12]:
data["tempo_cat"] = pd.cut(data["tempo"],
                               bins=[-1., 70, 90, 120, 140, 180, np.inf],
                               labels=["Very Slow", "Slow", "Moderate", "Upbeat", "Fast", "Very Fast"])

In [13]:
data["energy_cat"] = pd.cut(data["energy"],
                               bins=[-1., 30, 50, 70, 90, np.inf],
                               labels=["Very Low", "Low", "Moderate", "High", "very High"])

In [14]:
data["dancability_cat"] = pd.cut(data["dancability"],
                               bins=[-1., 30, 50, 70, 90, np.inf],
                               labels=["Very Low", "Low", "Moderate", "High", "very High"])

In [15]:
data["loudness_cat"] = pd.cut(data["loudness"],
                               bins=[-np.inf, -20, -12, -6, -3, np.inf],
                               labels=["Very Quiet", "Quiet", "Moderate", "Loud", "Very Loud"])

In [16]:
data["liveness_cat"] = pd.cut(data["liveness"],
                               bins=[-1., 30, 60, np.inf],
                               labels=["Studio like", "Slight Audience", "Live Recording"])

In [17]:
data["valence_cat"] = pd.cut(data["valence"],
                               bins=[-1., 30, 60, np.inf],
                               labels=["Sad/Dark", "Neutral", "Happy/Uplifting"])

In [18]:
data["length_cat"] = pd.cut(data["length"],
                               bins=[-1., 120, 180, 240, 300, np.inf],
                               labels=["Very Short", "Short", "Average", "Long", "Very Long"])

In [19]:
data["acousticness_cat"] = pd.cut(data["acousticness"],
                               bins=[-1, 30, 70, np.inf],
                               labels=["Electronic/Produced", "Mixed", "Acoustic"])

In [20]:
data["speechiness_cat"] = pd.cut(data["speechiness"],
                               bins=[-1., 5, 20, 35, np.inf],
                               labels=["Instrumental", "Music-Dominant", "Lyric-Focused", "Speech-Heavy:"])

In [21]:
data["popularity_cat"] = pd.cut(data["popularity"],
                               bins=[-1., 30, 70, np.inf],
                               labels=["Underground", "Moderately Popular", "Very Popular"])

In [22]:
data.head(10)

Unnamed: 0,title,artist,genre,year,tempo,energy,dancability,loudness,liveness,valence,...,tempo_cat,energy_cat,dancability_cat,loudness_cat,liveness_cat,valence_cat,length_cat,acousticness_cat,speechiness_cat,popularity_cat
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,...,Moderate,High,Moderate,Loud,Studio like,Happy/Uplifting,Average,Electronic/Produced,Instrumental,Very Popular
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,...,Slow,very High,High,Loud,Slight Audience,Happy/Uplifting,Long,Electronic/Produced,Lyric-Focused,Very Popular
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,...,Moderate,High,High,Loud,Studio like,Happy/Uplifting,Average,Electronic/Produced,Music-Dominant,Very Popular
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,...,Moderate,very High,Moderate,Loud,Studio like,Happy/Uplifting,Long,Electronic/Produced,Instrumental,Very Popular
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,...,Moderate,High,Moderate,Loud,Studio like,Neutral,Average,Electronic/Produced,Instrumental,Very Popular
5,Baby,Justin Bieber,canadian pop,2010,65,86,73,-5,11,54,...,Very Slow,High,High,Loud,Studio like,Neutral,Average,Electronic/Produced,Music-Dominant,Very Popular
6,Dynamite,Taio Cruz,dance pop,2010,120,78,75,-4,4,82,...,Moderate,High,High,Loud,Studio like,Happy/Uplifting,Average,Electronic/Produced,Music-Dominant,Very Popular
7,Secrets,OneRepublic,dance pop,2010,148,76,52,-6,12,38,...,Fast,High,Moderate,Moderate,Studio like,Neutral,Average,Electronic/Produced,Instrumental,Very Popular
8,Empire State of Mind (Part II) Broken Down,Alicia Keys,hip pop,2010,93,37,48,-8,12,14,...,Moderate,Low,Low,Moderate,Studio like,Sad/Dark,Average,Acoustic,Instrumental,Very Popular
9,Only Girl (In The World),Rihanna,barbadian pop,2010,126,72,79,-4,7,61,...,Upbeat,High,High,Loud,Studio like,Happy/Uplifting,Average,Electronic/Produced,Instrumental,Very Popular


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 603 entries, 0 to 602
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   title             603 non-null    object  
 1   artist            603 non-null    object  
 2   genre             603 non-null    object  
 3   year              603 non-null    int64   
 4   tempo             603 non-null    int64   
 5   energy            603 non-null    int64   
 6   dancability       603 non-null    int64   
 7   loudness          603 non-null    int64   
 8   liveness          603 non-null    int64   
 9   valence           603 non-null    int64   
 10  length            603 non-null    int64   
 11  acousticness      603 non-null    int64   
 12  speechiness       603 non-null    int64   
 13  popularity        603 non-null    int64   
 14  tempo_cat         603 non-null    category
 15  energy_cat        603 non-null    category
 16  dancability_cat   603 non-

# Templates
## Two templates are created fo explaining music specifications
### One contains only the artist name, music genre and the year it was released which can be used to print onty this informations when recom system works
### The other one has all features in it which is ued to create Embeddings

In [27]:
template_artist_genre_year = "a music titled: {title}, by : {artist} released in : {year}, Genre : {genre}"

In [26]:
template_all_features_cat = "a {popularity_cat} and {loudness_cat} {speechiness_cat} music titled: {title}, by : {artist} released in : {year}, {genre} Genre {tempo_cat} Tempo, {energy_cat} Energy and {dancability_cat} Dancability . it has {liveness} Liveness and {valence_cat} Mood. {length_cat} Length and {acousticness_cat} acousticness"

In [30]:
texts_all = []

for i in range(len(data)):
  row = data.iloc[i]
  text = template_all_features_cat
  replacements = {
    "{loudness_cat}" : row.loudness_cat,
    "{dancability_cat}" : row.dancability_cat,
    "{energy_cat}" : row.energy_cat,
    "{popularity_cat}": row.popularity_cat,
    "{tempo_cat}" : row.tempo_cat,
    "{speechiness_cat}": row.speechiness_cat,
    "{loudness_cat}": row.loudness_cat,
    "{liveness}" : row.liveness_cat,
    "{valence_cat}" : row.valence_cat,
    "{length_cat}": row.length_cat,
    "{acousticness_cat}" : row.acousticness_cat,
    "{title}": row.title,
    "{artist}": row.artist,
    "{year}": str(row.year),
    "{genre}": row.genre,

    }

  for old_word, new_word in replacements.items():

    text = text.replace(old_word, new_word)

  texts_all.append(text)

In [36]:
texts_artist_genre_year = []

for i in range(len(data)):
  row = data.iloc[i]
  text = template_artist_genre_year
  replacements = {
    "{loudness_cat}" : row.loudness_cat,
    "{dancability_cat}" : row.dancability_cat,
    "{energy_cat}" : row.energy_cat,
    "{popularity_cat}": row.popularity_cat,
    "{tempo_cat}" : row.tempo_cat,
    "{speechiness_cat}": row.speechiness_cat,
    "{loudness_cat}": row.loudness_cat,
    "{liveness}" : row.liveness_cat,
    "{valence_cat}" : row.valence_cat,
    "{length_cat}": row.length_cat,
    "{acousticness_cat}" : row.acousticness_cat,
    "{title}": row.title,
    "{artist}": row.artist,
    "{year}": str(row.year),
    "{genre}": row.genre,

    }

  for old_word, new_word in replacements.items():

    text = text.replace(old_word, new_word)

  texts_artist_genre_year.append(text)

In [37]:
texts_all[3]

'a Very Popular and Loud Instrumental music titled: Bad Romance, by : Lady Gaga released in : 2010, dance pop Genre Moderate Tempo, very High Energy and Moderate Dancability . it has Studio like Liveness and Happy/Uplifting Mood. Long Length and Electronic/Produced acousticness'

In [58]:
texts_artist_genre_year[3]

'a music titled: Bad Romance, by : Lady Gaga released in : 2010, Genre : dance pop'

# The Model used for creating Embeddings

In [40]:
# Load model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to embeddings
embeddings_all = model.encode(texts_all, show_progress_bar=True)


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

# Faiss to create the Index

In [41]:
dim = embeddings_all.shape[1]
index_all = faiss.IndexFlatL2(dim)
index_all.add(np.float32(embeddings_all))

In [46]:
query = texts_all[1]
query

'a Very Popular and Loud Lyric-Focused music titled: Love The Way You Lie, by : Eminem released in : 2010, detroit hip hop Genre Slow Tempo, very High Energy and High Dancability . it has Slight Audience Liveness and Happy/Uplifting Mood. Long Length and Electronic/Produced acousticness'

In [53]:
import pandas as pd

def search(query, index, texts,  number_of_results=3):

  # 1. Get the query's embedding
  query_embed = model.encode(query)

  # 2. Retrieve the nearest neighbors
  distances , similar_item_ids = index.search(np.float32([query_embed]), number_of_results)

  # 3. Format the results
  texts_np = np.array(texts) # Convert texts list to numpy for easier indexing
  results = pd.DataFrame(data={'texts': texts_np[similar_item_ids[0]],
                              'distance': distances[0]})

  # 4. Print and return the results
  print(f"Query:'{query}'\nNearest neighbors:")
  return results

# Searching musics based on user specified properties or using another music from the database

In [54]:
search(query, index_all, texts_all, number_of_results=5)

Query:'a Very Popular and Loud Lyric-Focused music titled: Love The Way You Lie, by : Eminem released in : 2010, detroit hip hop Genre Slow Tempo, very High Energy and High Dancability . it has Slight Audience Liveness and Happy/Uplifting Mood. Long Length and Electronic/Produced acousticness'
Nearest neighbors:


Unnamed: 0,texts,distance
0,a Very Popular and Loud Lyric-Focused music ti...,1.772446e-13
1,a Very Popular and Loud Instrumental music tit...,0.6964276
2,a Very Popular and Moderate Instrumental music...,0.7316025
3,a Moderately Popular and Loud Music-Dominant m...,0.7385463
4,a Moderately Popular and Loud Instrumental mus...,0.7432172


In [56]:
search("a music about Drugs and in genre of hip hop", index_all, texts_all, 10)

Query:'a music about Drugs and in genre of hip hop'
Nearest neighbors:


Unnamed: 0,texts,distance
0,a Moderately Popular and Loud Music-Dominant m...,0.806817
1,a Moderately Popular and Moderate Instrumental...,0.807325
2,a Moderately Popular and Moderate Lyric-Focuse...,0.902022
3,a Very Popular and Moderate Music-Dominant mus...,0.914573
4,a Moderately Popular and Moderate Music-Domina...,0.918303
5,a Moderately Popular and Loud Music-Dominant m...,0.941682
6,a Moderately Popular and Moderate Music-Domina...,0.953582
7,a Very Popular and Loud Music-Dominant music t...,0.954179
8,a Very Popular and Loud Lyric-Focused music ti...,0.960313
9,a Very Popular and Very Loud Music-Dominant mu...,0.963427


In [57]:
search("a very Energetic Loud Music", index_all, texts_all, 10)

Query:'a very Energetic Loud Music'
Nearest neighbors:


Unnamed: 0,texts,distance
0,a Very Popular and Loud Music-Dominant music t...,0.788917
1,a Very Popular and Very Loud Music-Dominant mu...,0.78925
2,a Very Popular and Loud Music-Dominant music t...,0.827111
3,a Very Popular and Loud Instrumental music tit...,0.840889
4,a Very Popular and Loud Music-Dominant music t...,0.844195
5,a Very Popular and Loud Instrumental music tit...,0.866881
6,a Moderately Popular and Loud Instrumental mus...,0.876136
7,a Very Popular and Loud Music-Dominant music t...,0.885094
8,a Moderately Popular and Loud Music-Dominant m...,0.894433
9,a Very Popular and Loud Music-Dominant music t...,0.896305
