# Sentence BERT for Lyrical Similarity

Here, a Sentence BERT model is applied to obtain the similarity between the unique songs of the DS Project up to the end of the year 2021.

## Loading packages and Models

We are using a pretrained BERT Model since our dataset is vast and does not contain a subject-specific vocabulary.

In [None]:
# Install Hugging Face `transformers`, which provides the BertTokenizer / BertModel imported below.
!pip install transformers

In [None]:
# Install `sentence-transformers`, which provides SentenceTransformer and util.cos_sim used in this notebook.
!pip install sentence-transformers

In [None]:
import torch 
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np 


In [None]:
# Load the pre-trained 'bert-base-uncased' tokenizer.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook; the embeddings are produced by calling model.encode on the raw
# lyric strings. Confirm whether this cell is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Load the pre-trained Sentence Transformer checkpoint 'all-mpnet-base-v2'.
# `model` is used below to embed every lyric via model.encode.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


## Getting the (cleaned) Data

In [None]:
## Mount Google Drive (Colab only) so that the CSV files under
## '/content/gdrive/My Drive/DS Projekt/' can be read and written.

from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Read the cleaned song data from Drive.
# Later cells use the columns `lyrics`, `combination`, `genre` and
# `first_appearance` of this frame.

song_data =  pd.read_csv("/content/gdrive/My Drive/DS Projekt/data_for_BERT.csv",  encoding='latin-1')

# NOTE(review): with default notebook settings only the last expression of a
# cell is displayed, so the head() result below is computed and discarded.
song_data.head()
song_data.shape

In [None]:
# Display the last rows of the song data as a quick sanity check.
song_data.tail()

In [None]:
 # Evaluates to True if at least one entry in `lyrics` is missing;
 # False is the desired result, i.e. the preprocessing left no empty lyrics.
 song_data['lyrics'].isnull().values.any() ## awesome - so preprocessing has worked ! :D

## Applying the Sentence BERT Model 

In [None]:
# Collect every lyric into a plain Python list; this list is the input that
# is passed to model.encode in the next cell.

all_lyrics = list(song_data["lyrics"])
print(
    "The amount of songs considered for the Sentence BERT Model is",
    len(all_lyrics),
    "!",
)

In [None]:
# Embed every lyric with the Sentence Transformer model loaded above.
all_embeddings = model.encode(all_lyrics)
all_embeddings.shape # 768 per song


Now, we can store the embeddings together with the combination and genre

In [None]:
# Display the raw embedding array returned by model.encode.
all_embeddings

In [None]:
# Wrap the embeddings in a DataFrame so that metadata columns can be attached to them.
all_embeddings_df = pd.DataFrame(all_embeddings)


In [None]:
# Attach the song label (`combination`) and the genre to each embedding row.
# NOTE(review): this presumes song_data and all_embeddings_df share the same
# row index/order (both come from the same rows of song_data) - confirm.
all_embeddings_df["combination"] = song_data.combination
all_embeddings_df["genre"] = song_data.genre


In [None]:
# Save the embeddings plus metadata to Drive.
# The call does not pass index=False, so the row index is written as the
# first CSV column; the analysis cells skip that column when slicing.

all_embeddings_df.to_csv('/content/gdrive/My Drive/DS Projekt/all_embeddings_df.csv')

In [None]:
## Cell for loading the stored embeddings once the code above has been run,
## so the (slow) encoding step does not have to be repeated.

all_embeddings_df = pd.read_csv("/content/gdrive/My Drive/DS Projekt/all_embeddings_df.csv",  encoding='latin-1')

print(all_embeddings_df.shape)


In [None]:
# NOTE(review): a single slice passed to .iloc selects ROWS, so this keeps
# rows 1..767 (not the embedding columns). `all_embeddings_df1` is not used by
# any other cell visible in this notebook - confirm the intent or remove.
all_embeddings_df1 = all_embeddings_df.iloc[1:768]

## Analysis 

### Similarity scores across all songs

In [None]:
# Extract the embedding vectors as a NumPy matrix with one row per song.
# All rows are taken (instead of a hard-coded row count) so that `embeddings`
# always has as many rows as all_embeddings_df; later cells assign results
# derived from `embeddings` back to that frame and need matching lengths.
# Columns 1..768 are selected: this assumes all_embeddings_df was reloaded
# from the CSV, where column 0 is the saved row index and the next 768
# columns hold the embedding dimensions.
embeddings = np.array(all_embeddings_df.iloc[:, 1:769])


In [None]:
# Check the dimensions of the embedding matrix (rows = songs, columns = embedding dimensions).
embeddings.shape

In [None]:
from sentence_transformers.util import cos_sim

#sim = np.zeros([embeddings.shape[0], embeddings.shape[0] ])


In [None]:
# the top 3 values of our similarity list per song!

#sim_songs_1 = []
#sim_songs_2 = []
#sim_songs_3 = []


In [None]:
# Resume from a previous run: load the neighbour labels that were already
# computed and unpack the three rank columns into plain lists, which the
# similarity loop below extends.

df_sims = pd.read_csv('/content/gdrive/My Drive/DS Projekt/df_sims.csv')

sim_songs_1, sim_songs_2, sim_songs_3 = (
    list(df_sims[rank_col]) for rank_col in ("s1", "s2", "s3")
)

In [None]:
# Number of songs that already have neighbours; the similarity loop starts at this row.
len(sim_songs_1)

In [None]:

# For every song that does not yet have neighbours, find the three songs with
# the highest cosine similarity of their lyric embeddings and record their
# `combination` labels in sim_songs_1 / sim_songs_2 / sim_songs_3.
# The loop starts at len(sim_songs_1), so rows already loaded from
# df_sims.csv are not recomputed.
n_songs = embeddings.shape[0]
sims_path = '/content/gdrive/My Drive/DS Projekt/df_sims.csv'

for i in range(len(sim_songs_1), n_songs):

  print(i)  # progress indicator

  # One cos_sim call scores song i against every song at once, replacing the
  # per-pair Python loop. The result is copied into a flat, writable float
  # array so that entries can be masked below.
  list_sims = np.array(cos_sim(embeddings[i], embeddings).numpy(), dtype=float).ravel()

  # same song out
  list_sims[i] = -100

  # top 1
  top1 = np.argmax(list_sims)

  # top 2 (mask the previous winner first)
  list_sims[top1] = -100
  top2 = np.argmax(list_sims)

  # top 3
  list_sims[top2] = -100
  top3 = np.argmax(list_sims)

  sim_songs_1.append(str(all_embeddings_df.combination[top1]))
  sim_songs_2.append(str(all_embeddings_df.combination[top2]))
  sim_songs_3.append(str(all_embeddings_df.combination[top3]))

  # Checkpoint every 400 songs and after the last one. The last index the
  # range produces is n_songs - 1, so comparing against n_songs (as before)
  # could never trigger the final save.
  is_last_song = i == n_songs - 1
  if is_last_song or (i > 0 and i % 400 == 0):
    df_sims = pd.DataFrame({"s1":sim_songs_1, "s2":sim_songs_2, "s3":sim_songs_3})
    df_sims.to_csv(sims_path)

#sim.shape

Assigning the top 3 most similar lyrics to each song lyric.


In [None]:
# Build the complete neighbour table from the three lists and save it to Drive.
df_sims = pd.DataFrame({"s1":sim_songs_1, "s2":sim_songs_2, "s3":sim_songs_3})
# NOTE(review): this expression is not the last one in the cell, so with
# default notebook settings its value is not displayed.
df_sims.shape
df_sims.to_csv('/content/gdrive/My Drive/DS Projekt/df_sims.csv')


## Merging the strings of the most similar songs back together with the song data

In [None]:
# Reload the stored neighbour table from Drive.
df_sims = pd.read_csv('/content/gdrive/My Drive/DS Projekt/df_sims.csv')
df_sims.shape ## same number of rows as all_embeddings_df? (open question - verify)



In [None]:
# Attach the labels of the three most similar songs to every embedding row.
# NOTE(review): presumes df_sims and all_embeddings_df have the same row
# index/order - confirm (see the shape check in the previous cell).
all_embeddings_df["s1"] = df_sims.s1
all_embeddings_df["s2"] = df_sims.s2
all_embeddings_df["s3"] = df_sims.s3



In [None]:
# Dimensions of the song data, for comparison with the embedding and neighbour tables.
song_data.shape

In [None]:
## Lyrics lookup table: song label (`combination`) and its lyrics.
## .copy() makes df_lyrics an independent frame, so the `idx` column added to
## it later is not written into a slice of song_data (which triggers pandas'
## SettingWithCopyWarning).
df_lyrics = song_data[["combination", "lyrics"]].copy()


In [None]:
# Initialise the neighbour-index columns in the embeddings data with 0;
# they are filled with the real row ids further below.

all_embeddings_df["index_1"] = 0
all_embeddings_df["index_2"] = 0
all_embeddings_df["index_3"] = 0


In [None]:
# Give every lyric a positional id 0..n-1. The length is taken from the frame
# itself rather than hard-coded, so the cell keeps working when the number of
# songs changes.
df_lyrics["idx"] = list(range(len(df_lyrics)))

In [None]:
# Scratch cell: resolve the top-1 neighbour of the single row `i` (`i` must
# already be defined by an earlier cell).
# The first matching row is taken, as in the full loop in the next cell, and
# .loc writes the scalar into the frame itself; the chained form
# `df.index_1[i] = ...` is not guaranteed by pandas to update the frame.
matching_rows = df_lyrics.index[df_lyrics.combination == all_embeddings_df.s1[i]]
all_embeddings_df.loc[i, "index_1"] = matching_rows[0]


In [None]:
# Translate each neighbour label (s1 / s2 / s3) into the id stored in
# df_lyrics.idx and write it to index_1 / index_2 / index_3.
# A single lookup table replaces the per-row boolean scan over df_lyrics, and
# whole-column assignment replaces the chained `df.col[i] = ...` writes, which
# pandas does not guarantee to update the frame.
# keep="first" mirrors the previous `[0]`: the first matching row wins.
idx_by_combination = (
    df_lyrics.dropna(subset=["combination"])
    .drop_duplicates(subset="combination", keep="first")
    .set_index("combination")["idx"]
)

for label_col, index_col in (("s1", "index_1"), ("s2", "index_2"), ("s3", "index_3")):
  resolved = all_embeddings_df[label_col].map(idx_by_combination)

  # A label without a match used to fail with an IndexError on `[0]`;
  # fail just as loudly here, but name the offending labels.
  unresolved = resolved.isna()
  if unresolved.any():
    examples = list(all_embeddings_df.loc[unresolved, label_col].unique()[:5])
    raise KeyError(f"no df_lyrics entry found for {label_col} labels such as {examples}")

  all_embeddings_df[index_col] = resolved.astype(int)

In [None]:
# Spot check: for a handful of rows, print each neighbour label followed by
# the df_lyrics entry its stored index points at. The two should name the
# same song.

i_s = [20,100,1000,2000,3000,5000,7000,15000,20000]

for i in i_s:
  for rank in ("1", "2", "3"):
    print(rank)
    print(all_embeddings_df["s" + rank][i])
    stored_idx = all_embeddings_df["index_" + rank][i]
    print(df_lyrics.combination[df_lyrics.idx == stored_idx])



 Awesome!

### Dimensionality Reduction

In [None]:
# Dimensions of the embedding matrix that is fed into the PCA below.
embeddings.shape

In [None]:
## PCA for plotting: set up a reduction of the embeddings to 2 components.

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
  

In [None]:
## Fit the PCA on the embedding matrix and project every song onto the 2 components.
principalComponents = pca.fit_transform(embeddings)


In [None]:
# Store the first and second principal component of every song as columns p1 / p2.
# This requires `embeddings` to have exactly as many rows as all_embeddings_df.
all_embeddings_df["p1"] = principalComponents[:,0]
all_embeddings_df["p2"] = principalComponents[:,1]


In [None]:

# List all column names to pick the ones kept in the result table below.
all_embeddings_df.columns.values.tolist()

In [None]:
## Keep only the columns needed downstream (labels, neighbours, PCA
## coordinates, genre and neighbour ids), dropping the raw embedding columns.
## .copy() makes the result an independent frame, so a column added to it
## afterwards is not written into a slice of all_embeddings_df (which triggers
## pandas' SettingWithCopyWarning).

similarity_resutls_df = all_embeddings_df[["combination", "s1", "s2", "s3", "p1", "p2", "genre", "index_1", "index_2", "index_3"]].copy()
similarity_resutls_df.shape

## Adding in the first date!

In [None]:
# Add the `first_appearance` column of each song from the original song data.
# NOTE(review): presumes song_data and similarity_resutls_df share the same
# row index/order - confirm.
similarity_resutls_df["first_appearance"] = song_data.first_appearance

## Using one DF for topic modelling and one called "extra" for use in the similarity page

In [None]:
## Topic-modelling table: every row whose genre is not "unknown genre",
## written to similarity.csv on Drive.

similarity_df = similarity_resutls_df.loc[
    similarity_resutls_df["genre"] != "unknown genre"
]

similarity_df.to_csv('/content/gdrive/My Drive/DS Projekt/similarity.csv')



In [None]:
## Files for the similarity page: the full result table (including rows with
## "unknown genre") and the lyrics lookup table.

similarity_resutls_df.to_csv('/content/gdrive/My Drive/DS Projekt/similarity_extra.csv')
df_lyrics.to_csv('/content/gdrive/My Drive/DS Projekt/df_lyrics.csv')
