In [2]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the song dataset from a CSV file given by a relative path.
# The preview in the next cell shows `artist`, `song` and `text` columns.
data = pd.read_csv("processed_songs.csv")

In [13]:
data.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face and ..."
1,ABBA,"Andante, Andante","take it easy with me, please touch me gently..."
2,ABBA,As Good As New,i'll never know why i had to go why i had to...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


In [14]:
data.shape

(57650, 3)

# Use a pre-trained Sentence Transformer model to convert the lyrics into embeddings

In [15]:
# Build the sentence encoder from the 'all-MiniLM-L6-v2' checkpoint name.
# NOTE(review): presumably fetched from the model hub on first use — confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Save the model to the local path "sentence_transformer_model"; the loading
# section further down re-creates the encoder from this same path.
model.save("sentence_transformer_model")

# Convert lyrics to embeddings

In [17]:
# Coerce every lyric to `str` first (same coercion the per-row version applied),
# so non-string cells such as missing values do not break the encoder.
lyrics_texts = [str(text) for text in data['text']]

# Encode all lyrics in ONE call so the model can batch them internally.
# Calling `model.encode` once per row runs tens of thousands of single-item
# forward passes and is dramatically slower for the same result.
lyrics_embeddings = model.encode(
    lyrics_texts,
    convert_to_tensor=True,
    show_progress_bar=True,
)

# Keep the original column contract: one tensor per row in `data['embedding']`.
# Mapping over row positions builds the object column element by element,
# exactly like the previous `.apply(...)` did.
data['embedding'] = pd.Series(range(len(data)), index=data.index).map(
    lambda pos: lyrics_embeddings[pos]
)


# Store embeddings and song details

In [19]:
# Keep only the identifying columns, and merge the per-row tensors stored in
# the `embedding` column into a single stacked tensor.
song_info = data[['artist', 'song']]
embeddings = torch.stack(list(data['embedding']))

# Persist both artifacts; the CSV is written without the DataFrame index.
torch.save(embeddings, "embeddings.pt")
song_info.to_csv("song_info.csv", index=False)

# Load the saved model, embeddings and song details

In [3]:
# Re-create the encoder from the path written by the earlier
# `model.save("sentence_transformer_model")` cell.
model = SentenceTransformer("sentence_transformer_model")

In [4]:
# Load the stacked lyric embeddings onto the same device as the model.
# Without `map_location`, a tensor saved from a GPU session fails to load on a
# CPU-only machine, and a device mismatch between these embeddings and the
# query embedding returned by `model.encode` breaks the similarity computation.
# NOTE: `torch.load` unpickles the file — only load files you created yourself.
embeddings = torch.load("embeddings.pt", map_location=model.device)

# Artist/title table written next to the embeddings from the same DataFrame,
# so row i is expected to describe embeddings[i].
song_info = pd.read_csv("song_info.csv")

# Guard against the two artifacts drifting apart (e.g. one file regenerated
# without the other); positional lookups would silently return wrong songs.
assert len(song_info) == embeddings.shape[0], (
    "song_info.csv and embeddings.pt have different numbers of rows"
)

# Find the most similar song

In [5]:
def find_similar_song(lyrics, top_k=5):
    """Return the stored songs whose lyric embeddings are closest to ``lyrics``.

    Relies on the notebook-level ``model``, ``embeddings`` and ``song_info``
    objects; row ``i`` of ``song_info`` must correspond to ``embeddings[i]``.

    Args:
        lyrics: Query text, encoded with ``model.encode``.
        top_k: Maximum number of matches to return. Values larger than the
            number of stored embeddings are clamped to that number; values
            below 1 produce an empty list.

    Returns:
        A list of dicts with the keys ``"artist"``, ``"song"`` and ``"score"``
        (cosine similarity as a Python float), in the order produced by
        ``torch.topk`` (highest similarity first).
    """
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(int(top_k), embeddings.shape[0])
    if k < 1:
        return []

    input_embedding = model.encode(lyrics, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(input_embedding, embeddings)[0]
    top_results = torch.topk(similarities, k=k)

    results = []
    # topk already returns the scores alongside the indices; reuse them instead
    # of indexing back into `similarities`.
    for idx, score in zip(top_results.indices.tolist(), top_results.values.tolist()):
        row = song_info.iloc[idx]  # single positional lookup per match
        results.append({
            "artist": row['artist'],
            "song": row['song'],
            "score": score
        })

    return results

# **Testing**

In [21]:
data['text'][3000]

"the autumn evenings filled with copper shades   i see the birds' neck in the frame   a figure walks into the sunset   someone goes past suspended from the sky      takes more imagination   when everything's remote control   for me it's just a case of   what's on the far side of the road      tell everybody   i'm going away for ten years   i'm going to wander   among the wicklow hills      the travelling children in their sunday clothes   lost on the corner of the street   fat gypsy lady smacks the windowpane   a farm dog gets out on the motorway      takes more imagination   when everything's remote control   for me it's just a case of   what's on the far side of the road      tell everybody   i'm going away for ten years   i'm going to wander   among the wicklow hills"

In [22]:
data['song'][3000]

'Among The Wicklow Hills'

In [23]:
# Query with a short excerpt of the lyrics of row 3000 shown above
# ('Among The Wicklow Hills'); that song is expected to come back first.
lyrics = "takes more imagination   when everything's remote control   for me it's just a case of   what's on the far side of the road"
recommendations = find_similar_song(lyrics)
# Print one line per match, with the similarity formatted to four decimals.
for rec in recommendations:
    print(f"{rec['song']} by {rec['artist']} (Similarity: {rec['score']:.4f})")


Among The Wicklow Hills by Christy Moore (Similarity: 0.5197)
Always Crashing In The Same Car by David Bowie (Similarity: 0.4898)
The Overload by Talking Heads (Similarity: 0.4582)
Open Road by Wishbone Ash (Similarity: 0.4494)
Runaway by Chris Rea (Similarity: 0.4484)


In [26]:
recommendations

[{'artist': 'Christy Moore',
  'song': 'Among The Wicklow Hills',
  'score': 0.519721508026123},
 {'artist': 'David Bowie',
  'song': 'Always Crashing In The Same Car',
  'score': 0.48983657360076904},
 {'artist': 'Talking Heads',
  'song': 'The Overload',
  'score': 0.45820149779319763},
 {'artist': 'Wishbone Ash', 'song': 'Open Road', 'score': 0.4494209885597229},
 {'artist': 'Chris Rea', 'song': 'Runaway', 'score': 0.4484301209449768}]

In [28]:
recommendations[0]['score']

0.519721508026123

In [24]:
max(recommendations, key=lambda x: x['score'])['song']

'Among The Wicklow Hills'

In [25]:
sorted(recommendations, key=lambda x: x['score'])

[{'artist': 'Chris Rea', 'song': 'Runaway', 'score': 0.4484301209449768},
 {'artist': 'Wishbone Ash', 'song': 'Open Road', 'score': 0.4494209885597229},
 {'artist': 'Talking Heads',
  'song': 'The Overload',
  'score': 0.45820149779319763},
 {'artist': 'David Bowie',
  'song': 'Always Crashing In The Same Car',
  'score': 0.48983657360076904},
 {'artist': 'Christy Moore',
  'song': 'Among The Wicklow Hills',
  'score': 0.519721508026123}]