In [None]:
import os
import torch
from transformers import BertTokenizer, BertModel
from google.colab import drive

In [None]:
# Mount Google Drive to access files
# `drive` is imported from google.colab; the Drive contents become available
# under the mount point '/content/drive' used by the paths below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define the path to the lyrics folder in your Google Drive
# Only entries of this folder whose names end in '.txt' are picked up by the
# file-listing cells further down.
lyrics_directory = '/content/drive/MyDrive/NEUROMATCH DL/unpopular_lyrics'

In [None]:
# Initialize the BERT tokenizer and model
# Both are loaded from the 'bert-base-uncased' checkpoint and kept as
# module-level globals; get_bert_embeddings() reads them directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Function to read lyrics from a file
def read_lyrics(file_path):
    """Return the entire contents of ``file_path`` as one string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [None]:
# Function to get BERT embeddings
def get_bert_embeddings(texts, batch_size=None):
    """Embed texts with BERT by mean-pooling the last hidden state.

    The mean is taken over real tokens only: positions added by ``padding=True``
    are excluded via the attention mask. A plain ``mean(dim=1)`` would average
    the padding positions in as well, making a text's embedding depend on the
    longest text it happens to be batched with.

    Args:
        texts: A single string or a sequence of strings. Each text is
            truncated to at most 512 tokens by the tokenizer, so long inputs
            are only partially represented.
        batch_size: Optional number of texts per forward pass. ``None``
            (the default) sends all texts through the model in one batch.

    Returns:
        A tensor with one row per input text. For an empty input, a tensor of
        shape ``(0, model.config.hidden_size)``.
    """
    # Accept a bare string the same way the tokenizer does: as a batch of one.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    chunk = batch_size or len(texts)
    pooled = []
    for start in range(0, len(texts), chunk):
        inputs = tokenizer(
            texts[start:start + chunk],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding; add a trailing
        # axis so it broadcasts across the hidden dimension.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled, dim=0)


In [None]:
# Process each lyrics file
embeddings = []
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the song order (and therefore the row order of the embeddings)
# reproducible across runs.
file_paths = [
    os.path.join(lyrics_directory, file_name)
    for file_name in sorted(os.listdir(lyrics_directory))
    if file_name.endswith('.txt')
]


In [None]:
# Start from an empty list in this cell so that re-running it does not append
# a second copy of every embedding.
embeddings = []
for file_path in file_paths:
    lyrics = read_lyrics(file_path)
    embedding = get_bert_embeddings([lyrics])  # We pass a list to handle a single text input
    embeddings.append(embedding)

In [None]:
# Concatenate all embeddings into a single tensor
# Each list entry comes from a one-text call to get_bert_embeddings, so
# concatenating along dim 0 stacks them in the order of `file_paths`.
all_embeddings = torch.cat(embeddings, dim=0)

In [None]:
# Optional: Save embeddings to a file in Google Drive
# NOTE(review): this path spells the Drive root 'My Drive' while
# lyrics_directory uses 'MyDrive' — confirm both resolve to the same folder.
torch.save(all_embeddings, '/content/drive/My Drive/lyrics_embeddings.pt')


In [None]:
# Status message printed after the embedding and save cells.
print("BERT embeddings generated and saved.")

BERT embeddings generated and saved.


In [None]:
# Print the embedding for the first song
# "First" means the first entry of `file_paths`; the full tensor is printed.
print("Embedding for the first song:", embeddings[0])

Embedding for the first song: tensor([[-8.4069e-02, -4.4498e-02,  4.4018e-01, -2.1197e-01,  1.8076e-01,
          2.3916e-02,  3.8009e-01,  2.5948e-01, -1.2901e-01,  4.6508e-02,
          9.3832e-02, -2.2220e-01, -3.6871e-01,  1.4238e-01, -2.3483e-02,
          6.5359e-01,  2.2057e-01, -2.8473e-01, -2.4426e-01,  1.8413e-01,
          6.9675e-01,  1.3110e-01,  1.9037e-01,  3.7568e-01,  3.4130e-01,
          2.8082e-01, -1.4971e-01, -2.2289e-01, -2.2451e-01, -2.3374e-01,
          3.7942e-01, -4.6079e-03, -2.6042e-01, -5.4458e-01, -1.2241e-02,
         -6.8100e-02, -2.0799e-02, -1.1847e-01,  6.2312e-02,  1.4197e-01,
         -5.0307e-01, -4.8403e-01, -1.9591e-01,  9.3062e-02, -3.9431e-01,
         -3.6757e-01,  1.7180e-01,  5.5311e-02,  6.7762e-02, -9.3565e-02,
         -1.4694e-02,  2.9622e-01, -3.4138e-01, -3.0429e-01,  1.5572e-02,
          5.8267e-01,  2.9282e-01, -4.6675e-01, -3.9748e-01, -2.8853e-01,
          1.2198e-01, -2.0436e-01,  5.1909e-01, -6.0413e-02, -4.1458e-02,
        

# Cosine similarity between the last two words of each line of lyrics

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Function to extract last two words from each line of lyrics
def extract_last_two_words(lines):
    """Return the final two whitespace-separated words of every line, joined by a space.

    Lines with fewer than two words (including blank lines) are skipped, so
    the result may be shorter than ``lines``.
    """
    tokenized = (line.strip().split() for line in lines)
    return [' '.join(tokens[-2:]) for tokens in tokenized if len(tokens) >= 2]

In [None]:
# Process each lyrics file and extract last two words
last_two_words_list = []
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the phrase order (and the similarity matrix indices) reproducible.
file_paths = [
    os.path.join(lyrics_directory, file_name)
    for file_name in sorted(os.listdir(lyrics_directory))
    if file_name.endswith('.txt')
]


In [None]:
# Start from an empty list in this cell so that re-running it does not append
# every phrase a second time. Phrases from repeated lines are kept as-is.
last_two_words_list = []
for file_path in file_paths:
    lyrics = read_lyrics(file_path)
    lines = lyrics.splitlines()
    last_two_words = extract_last_two_words(lines)
    last_two_words_list.extend(last_two_words)

In [None]:
# Get embeddings for the last two words of each line
# All phrases are passed in a single call. NOTE: this rebinds `embeddings`,
# which earlier held a list of per-song tensors, to the phrase embeddings
# returned by get_bert_embeddings.
embeddings = get_bert_embeddings(last_two_words_list)


In [None]:
# Calculate cosine similarities between embeddings
# The result is indexed as [i, j] by the cells below, with i and j referring
# to positions in last_two_words_list.
similarities = cosine_similarity(embeddings)

In [None]:
# Function to find and display similar patterns
def find_similar_patterns(similarity_matrix, texts, threshold=0.9):
    """Collect every pair of texts whose similarity is strictly above ``threshold``.

    Only index pairs with ``i < j`` are examined, so each unordered pair is
    reported at most once and the diagonal is never included.

    Args:
        similarity_matrix: Matrix indexed as ``similarity_matrix[i, j]``.
        texts: Sequence of texts; position ``i`` corresponds to row/column ``i``.
        threshold: Exclusive lower bound on the similarity score.

    Returns:
        A list of ``(texts[i], texts[j], similarity_matrix[i, j])`` tuples,
        ordered by ``i`` and then by ``j``.
    """
    count = len(texts)
    return [
        (texts[row], texts[col], similarity_matrix[row, col])
        for row in range(count)
        for col in range(row + 1, count)
        if similarity_matrix[row, col] > threshold
    ]


In [None]:
# Find similar patterns with a similarity threshold (e.g., 0.9)
# Only pairs scoring strictly above the threshold are kept; each result is a
# (first phrase, second phrase, similarity) tuple.
similar_patterns = find_similar_patterns(similarities, last_two_words_list, threshold=0.9)


In [None]:
# Display similar patterns
# One line is printed per pair: pair[0] and pair[1] are the two phrases and
# pair[2] is their similarity, formatted to two decimal places.
for pair in similar_patterns:
    print(f"'{pair[0]}' and '{pair[1]}' are similar with a similarity score of {pair[2]:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
'the storm' and 'the rain' are similar with a similarity score of 0.95
'the storm' and 'the wind' are similar with a similarity score of 0.94
'the storm' and 'the lightning' are similar with a similarity score of 0.97
'me so' and 'me now' are similar with a similarity score of 0.92
'me so' and 'me now' are similar with a similarity score of 0.92
'me so' and 'me now' are similar with a similarity score of 0.92
'me so' and 'me caí' are similar with a similarity score of 0.92
'me so' and 'me now' are similar with a similarity score of 0.92
'me so' and 'me now' are similar with a similarity score of 0.92
'me so' and 'me crazy' are similar with a similarity score of 0.91
'your nose' and 'your skeleton' are similar with a similarity score of 0.92
'your nose' and 'your toes' are similar with a similarity score of 0.91
'your nose' and 'your toes' are similar with a similarity score of 0.91
'your nose' and 'your toes' are similar 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plotting the cosine similarity matrix
# Draw on an explicitly created 10x8 inch figure/axes pair; the phrases label
# both axes of the heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarities,
    xticklabels=last_two_words_list,
    yticklabels=last_two_words_list,
    cmap='viridis',
    ax=ax,
)
ax.set_title('Cosine Similarity Heatmap')
ax.set_xlabel('Text Pairs')
ax.set_ylabel('Text Pairs')
plt.show()