<a href="https://colab.research.google.com/github/MulhamShaheen/AI-DJ/blob/dev%2Fsearch/prototype/NLP/AI_DJ_NLP_search_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets faiss-cpu faiss-gpu --quiet

In [None]:
import torch
from transformers import BertModel, BertTokenizerFast
import torch.nn.functional as F
import pandas as pd
from datasets import Dataset

In [None]:
# Hugging Face checkpoint of the LEALLA sentence-embedding model used in this notebook.
model_checkpoint = 'setu4993/LEALLA-small'

In [None]:
# Load the tokenizer and the encoder from the one configured checkpoint name so the
# two can never point at different models.
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
# eval() puts the module in inference mode (e.g. dropout disabled) and returns the model.
model = BertModel.from_pretrained(model_checkpoint).eval()

In [None]:
# Download the lyrics table from the project repository and keep the raw
# lyric texts as a plain Python list for the embedding cells further down.
df_lyrics = pd.read_csv(
    'https://raw.githubusercontent.com/MulhamShaheen/AI-DJ/main/prototype/NLP/data/lyrics.csv'
)
song_lyrics = df_lyrics['lyrics'].to_list()

In [None]:
# Display the loaded lyrics table.
df_lyrics

Unnamed: 0,lyrics
0,Is this the real life? Is this just fantasy?\n...
1,"Hey Jude, don't make it bad.\nTake a sad song ..."
2,Мне надоело петь про эту заграницу\nНадену вал...
3,"Всё не то, всё не так, ты мой друг, я твой вра..."
4,"Hiya, Barbie\nHi, Ken\nYou want to go for a ri..."
5,Бетономешалка\nБетономешалка мешает бетон!\nБр...


In [None]:
def get_embeddings(texts: "list[str] | str", normalize=True):
    """Embed one or more texts with the notebook-level `tokenizer` and `model`.

    Args:
        texts: A list of strings, or a single string (treated as a batch of one).
        normalize: When True (default), L2-normalize each embedding row so that a
            dot product between two rows equals their cosine similarity.

    Returns:
        The model's `pooler_output` tensor, one row per input text, optionally
        L2-normalized.
    """
    # A bare string is accepted for convenience (e.g. a single dataset row).
    if isinstance(texts, str):
        texts = [texts]

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on the same device as the model so the call keeps working
    # after the model has been moved (e.g. model.to("cuda")).
    inputs = inputs.to(model.device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.pooler_output

    if normalize:
        # dim=1 normalizes each row (one embedding per text) independently.
        return F.normalize(embeddings, p=2, dim=1)
    return embeddings

In [None]:
lyrics_dataset = Dataset.from_pandas(df_lyrics)
# The column has to be called 'embeddings': that is the name the FAISS index and the
# nearest-neighbour lookup in the following cells refer to.
# map() is called once per row here, so x['lyrics'] is one lyric text and [0] selects
# its single embedding row; it is stored as a NumPy array.
lyrics_dataset = lyrics_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['lyrics'])[0].cpu().numpy()}
)
lyrics_dataset

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Dataset({
    features: ['lyrics', 'embeddings'],
    num_rows: 6
})

In [None]:
# Build a FAISS index over the 'embeddings' column; the nearest-neighbour query in the next cell searches it.
lyrics_dataset.add_faiss_index(column='embeddings')

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['lyrics', 'embeddings'],
    num_rows: 6
})

In [None]:
# Embed the query word, convert it to a NumPy array, then ask the FAISS index on
# 'embeddings' for the closest lyrics and their scores.
query_vector = get_embeddings(['бухгалтер']).cpu().detach().numpy()
scores, samples = lyrics_dataset.get_nearest_examples('embeddings', query_vector)

In [None]:
# Raw (un-normalized) pooler embeddings for every song, computed with the shared helper
# instead of repeating the tokenize / forward-pass steps inline.
songs_embeddings = get_embeddings(song_lyrics, normalize=False)

In [None]:
query_texts = ['хочу песню про деньги']

# Raw (un-normalized) pooler embeddings for the query, computed with the shared helper
# instead of repeating the tokenize / forward-pass steps inline.
query_embeddings = get_embeddings(query_texts, normalize=False)

In [None]:
def similarity(embeddings_1, embeddings_2):
    """Return the pairwise cosine-similarity matrix of two embedding batches.

    Each input is L2-normalized first, so for 2-D inputs entry [i, j] of the result
    is the cosine similarity between row i of `embeddings_1` and row j of
    `embeddings_2`.
    """
    unit_1, unit_2 = (
        F.normalize(batch, p=2) for batch in (embeddings_1, embeddings_2)
    )
    # Rows of unit_1 against rows of unit_2: multiply by the transposed second batch.
    return unit_1 @ unit_2.transpose(0, 1)

In [None]:
# Cosine similarity of the query against every song (one column per row of the lyrics table).
similarity(query_embeddings, songs_embeddings)

tensor([[-0.0940, -0.0082,  0.1413,  0.0834, -0.0398,  0.0354]])