**Инсталляция отсутствующих на колабе библиотек**

In [1]:
!pip install -q sentence-transformers musicbrainzngs

**Импорты необходимых библиотек**

In [2]:
import pandas as pd
import numpy as np

from google.colab import drive

from tqdm import notebook, tqdm

import re
import musicbrainzngs

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings('ignore')

import torch

from sentence_transformers import SentenceTransformer, util

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("max_colwidth", 100)

RANDOM = 12345
torch.manual_seed(RANDOM)
np.random.seed(RANDOM)

**Подключение google диска**

In [3]:
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


**Переход в папку с json файлами**

In [4]:

%cd /content/drive/My Drive/to_upload/

/content/drive/My Drive/to_upload


**Путь к файлам**

In [5]:
path = os.path.abspath(os.curdir)
path

'/content/drive/My Drive/to_upload'

**Функция проверки загрузки**

In [6]:
def display_dataset_info(dataset):
    """
    Quick sanity check of a freshly loaded dataset.

    Renders the first five rows with ``display``, prints an empty separator
    line and then prints the ``DataFrame.info()`` summary. Duplicates are
    not checked here.

    Parameters:
        dataset: DataFrame to inspect.

    Returns:
        None. The function only produces output.
    """
    preview = dataset.head()
    display(preview)

    # blank line between the preview table and the info() summary
    print()

    dataset.info()

**Загрузка датасетов**

In [7]:
# The three sources are read as line-delimited JSON files from `path`,
# each one into its own DataFrame.
meta, lyrics, covers = (
    pd.read_json(os.path.join(path, f"{name}.json"), orient="columns", lines=True)
    for name in ("meta", "lyrics", "covers")
)

**Датасет с метаданными**

In [8]:
display_dataset_info(meta)

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration
0,c3b9d6a354ca008aa4518329aaa21380,1639688000000.0,Happy New Year,EN,RUB422103970,[DANCE],161120.0
1,c57e3d13bbbf5322584a7e92e6f1f7ff,1637762000000.0,Bad Habits,EN,QZN882178276,[ELECTRONICS],362260.0
2,955f2aafe8717908c140bf122ba4172d,1637768000000.0,Por Esa Loca Vanidad,,QZNJZ2122549,"[FOLK, LATINFOLK]",260000.0
3,fae5a077c9956045955dde02143bd8ff,1637768000000.0,Mil Lagrimas,,QZNJZ2166033,"[FOLK, LATINFOLK]",190000.0
4,6bede082154d34fc18d9a6744bc95bf5,1637768000000.0,Sexo Humo y Alcohol,,QZNJZ2122551,"[FOLK, LATINFOLK]",203000.0



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71769 entries, 0 to 71768
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   track_id  71768 non-null  object 
 1   dttm      71768 non-null  float64
 2   title     71768 non-null  object 
 3   language  21969 non-null  object 
 4   isrc      71455 non-null  object 
 5   genres    71768 non-null  object 
 6   duration  71768 non-null  float64
dtypes: float64(2), object(5)
memory usage: 3.8+ MB


**Распаковка списка с жанрами в строку**

In [9]:
def unpack_list(row):
    """
    Convert one value of the ``genres`` column of ``meta`` into a string.

    Parameters:
        row: a list of genre names, an already unpacked string, ``None``
            or a float NaN.

    Returns:
        "UNKNOWN" for a missing value, an empty list or an empty string;
        the single element for a one-element list; the elements joined
        with ", " for a longer list; a non-empty string unchanged.
    """
    # Already a string, e.g. the cell was re-run on a column that has been
    # unpacked before: joining it would split it into single characters.
    if isinstance(row, str):
        return row or "UNKNOWN"

    # Missing value: None, or a float NaN (NaN is the only float for which
    # `x != x` holds); len() would fail on it.
    if row is None or (isinstance(row, float) and row != row):
        return "UNKNOWN"

    if len(row) == 0:
        return "UNKNOWN"
    if len(row) == 1:
        return row[0]
    return ", ".join(row)

In [10]:
tqdm.pandas(desc="Распаковка списков в столбце genres.")
meta["genres"] = meta["genres"].progress_apply(unpack_list)

Распаковка списков в столбце genres.: 100%|██████████| 71769/71769 [00:00<00:00, 384913.23it/s]


**Датасет с текстами песен**

In [11]:
display_dataset_info(lyrics)

Unnamed: 0,lyricId,text,track_id
0,a951f9504e89759e9d23039b7b17ec14,"Живу сейчас обломами, обломками не той любви\nПопытками не то любить, что нужно\nТеряю смысл, ну...",1c4b1230f937e4c548ff732523214dcd
1,0c749bc3f01eb8e6cf986fa14ccfc585,Tell me your fable\nA fable\nTell me your fable\nTell me your fable\nTell me your fable\nA fable...,0faea89b0d7d6235b5b74def72511bd8
2,e2c8830fbc86e5964478243099eec23a,You're ashamed about all your fears and doubts\nAnd how I hurt you\nCan you make it back from th...,9c6dc41d5ccd9968d07f055da5d8f741
3,e2c8830fbc86e5964478243099eec23a,You're ashamed about all your fears and doubts\nAnd how I hurt you\nCan you make it back from th...,bfd04a73e9cffdf0e282c92219a86ea1
4,7624653ca8522ba93470843c74961b7d,"You showed him all the best of you,\nBut I'm afraid your best wasn't good enough\nAnd know he ne...",8d70930d09cd239c948408d1317d8659



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11414 entries, 0 to 11413
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   lyricId   11414 non-null  object
 1   text      11414 non-null  object
 2   track_id  11414 non-null  object
dtypes: object(3)
memory usage: 267.6+ KB


**Словарь для расшифровки сокращённых форм при очистке текста**

In [12]:
# Lowercase contractions and their expanded forms. Each entry is listed once;
# the capitalised variants are derived below instead of being typed by hand.
_base_contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "i'd've": "i would have",
    "she'd've": "she would have",
}

# Final lookup table: every lowercase form is immediately followed by its
# capitalised twin ("don't" -> "do not", "Don't" -> "Do not"). The insertion
# order is kept stable because the dictionary is iterated in order when the
# replacements are applied.
contraction_dict = {
    form: expansion
    for short, full in _base_contractions.items()
    for form, expansion in ((short, full), (short.capitalize(), full.capitalize()))
}

**Функция очистки текста**

In [13]:
def clean_text(text, contractions=None):
    """
    Normalise the lyrics of one song before vectorisation.

    Steps, in order: line breaks become ". ", the resulting ",." pairs become
    ".", typographic apostrophes become plain ones, contracted forms are
    expanded and the text is lower-cased.

    Parameters:
        text: raw lyrics as a string.
        contractions: mapping "contracted form -> expansion". When omitted,
            the module-level ``contraction_dict`` is used.

    Returns:
        The cleaned, lower-cased text.
    """
    if contractions is None:
        contractions = contraction_dict

    # line breaks become sentence separators
    text = text.replace("\n", ". ")
    # a line that ended with a comma now ends with ",." -> keep only the dot
    text = text.replace(",.", ".")

    # The dictionary keys use the plain apostrophe, so a typographic one
    # (U+2019) would leave "don’t" unexpanded.
    text = text.replace("\u2019", "'")

    # Expand contracted forms. This runs before lower-casing, so the lookup
    # table has to carry both lowercase and capitalised variants.
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    return text.lower()

# По очистке текста тут еще много чего придумать можно...

In [14]:
tqdm.pandas(desc="Очистка текста.")
lyrics["text"] = lyrics["text"].progress_apply(clean_text)

Очистка текста.: 100%|██████████| 11414/11414 [00:03<00:00, 3682.46it/s]


**Датасет с разметкой оригинал/кавер**

In [15]:
display_dataset_info(covers)

Unnamed: 0,original_track_id,track_id,track_remake_type
0,eeb69a3cb92300456b6a5f4162093851,eeb69a3cb92300456b6a5f4162093851,ORIGINAL
1,fe7ee8fc1959cc7214fa21c4840dff0a,fe7ee8fc1959cc7214fa21c4840dff0a,ORIGINAL
2,cd89fef7ffdd490db800357f47722b20,cd89fef7ffdd490db800357f47722b20,ORIGINAL
3,995665640dc319973d3173a74a03860c,995665640dc319973d3173a74a03860c,ORIGINAL
4,,d6288499d0083cc34e60a077b7c4b3e1,COVER



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71597 entries, 0 to 71596
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   original_track_id  4821 non-null   object
 1   track_id           71597 non-null  object
 2   track_remake_type  71597 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


**Объединение датасетов**

In [16]:
data = pd.merge(covers, meta[['track_id', 'title', 'language', 'isrc', 'genres']], how='left', on=['track_id'])

In [17]:
# merge the lyrics with an inner join so that only the tracks that have a text remain
data = pd.merge(data, lyrics[["text", "track_id"]], how='inner', on=['track_id'])

In [18]:
# reorder the columns
cols = ['original_track_id', 'track_id', 'isrc', 'title', 'language', 'genres',  'text', 'track_remake_type']
data = data[cols]

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11097 entries, 0 to 11096
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   original_track_id  3599 non-null   object
 1   track_id           11097 non-null  object
 2   isrc               11018 non-null  object
 3   title              11097 non-null  object
 4   language           7069 non-null   object
 5   genres             11097 non-null  object
 6   text               11097 non-null  object
 7   track_remake_type  11097 non-null  object
dtypes: object(8)
memory usage: 780.3+ KB


**Удаляем дубликаты по isrc**

In [20]:
# Drop repeated ISRC codes, keeping the first occurrence of each code.
# Rows with a missing ISRC are kept: pandas treats missing values as equal to
# each other, so a plain drop_duplicates(subset=['isrc']) would collapse every
# track without an ISRC into a single row.
is_repeated_isrc = data["isrc"].notna() & data.duplicated(subset=["isrc"])
data = data[~is_repeated_isrc]

In [21]:
data[data["title"] == "My Favourite Game"]

Unnamed: 0,original_track_id,track_id,isrc,title,language,genres,text,track_remake_type
13,80a160ff31266be2f93012a2a3eca713,80a160ff31266be2f93012a2a3eca713,SEBKB9802010,My Favourite Game,EN,"POP, ROCK, ALLROCK","i do not know what you are looking for. you have not found it baby, that is for sure. you rip me...",ORIGINAL
2070,,741ac802ee70410ff2871fee497ce2ef,US94P1414009,My Favourite Game,,"METAL, ALTERNATIVEMETAL","i do not know what you are looking for. you have not found it, baby, that is for sure. you rip m...",COVER


In [22]:
# data["track_remake_type"] = data["track_remake_type"].map({'ORIGINAL': 0, 'COVER': 1})

**SentenceTransformer**

В модуле SentenceTransformer есть предобученная модель `sentence-transformers/LaBSE`, которая поддерживает много языков и строит эмбеддинги так, что при сравнении, например, одного слова на разных языках векторы этих слов будут максимально близки.

[LaBSE](https://huggingface.co/sentence-transformers/LaBSE)

In [23]:
# use the first CUDA GPU when torch can see one, otherwise run on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

**Инициализация модели**

In [24]:
model = SentenceTransformer('sentence-transformers/LaBSE')

**Список с текстами песен**

In [25]:
all_texts = list(data["text"])

**Векторизация**

In [26]:
embedding_all_texts = model.encode(all_texts, show_progress_bar=True, device=device, batch_size=32)

Batches:   0%|          | 0/312 [00:00<?, ?it/s]

**Функция получения имени исполнителя по ISRC с помощью API musicbrainzngs**

In [27]:
def parse_singer(row):
    """
    Look up the performer of a recording by its ISRC with ``musicbrainzngs``.

    Parameters:
        row: ISRC code of a track (one value of the ``isrc`` column).

    Returns:
        The name of the first credited artist of the first recording returned
        for the ISRC, or "Unknown" when the ISRC is missing or the lookup
        fails for any reason.
    """
    # A missing ISRC (None / NaN / empty string) can never be resolved,
    # so do not spend a request on it.
    if not isinstance(row, str) or not row.strip():
        return "Unknown"

    # identify the client before sending requests
    musicbrainzngs.set_useragent("python-musicbrainzngs-example", "0.1", "https://github.com/alastair/python-musicbrainzngs/",)

    try:
        # fetch the recordings registered for this ISRC together with their artists
        name = musicbrainzngs.get_recordings_by_isrc(row, includes=['artists'])
        # first artist of the first recording
        res = name["isrc"]["recording-list"][0]["artist-credit"][0]["artist"]["name"]
    except Exception:
        # Deliberate best effort: request errors and unexpected response
        # shapes all mean "Unknown". `Exception` rather than a bare `except`
        # so that KeyboardInterrupt can still stop a long-running apply.
        res = "Unknown"

    return res

**Функция поиска оригинала и каверов при наличии их в датасете**

In [30]:
def search_covers(song_name=None, embedding_all_texts=None, device="cpu", dataset=None, parsing=None,
                  top_k=10, threshold=0.9):
    """
    Find the tracks whose lyrics are closest to the lyrics of a given song.

    The song is looked up in ``dataset`` by a case-insensitive title match.
    Its text is encoded with the module-level ``model`` and compared against
    ``embedding_all_texts`` with ``util.semantic_search``.

    Parameters:
        song_name: title to search for (required).
        embedding_all_texts: embeddings of the texts of ``dataset``; entry
            ``i`` must belong to the ``i``-th row (by position) of ``dataset``.
        device: device passed to ``model.encode``.
        dataset: DataFrame (required) with the columns ``title``, ``text``,
            ``isrc``, ``original_track_id``, ``track_id`` and ``language``.
        parsing: callable that maps an ISRC to an artist name; when ``None``
            the ``singer`` column is not added.
        top_k: number of nearest texts requested from the search.
        threshold: only hits whose score is strictly greater than this value
            are kept.

    Returns:
        A DataFrame with the matching rows of ``dataset`` (without the
        ``original_track_id``, ``track_id``, ``language`` and ``text``
        columns, plus ``singer`` when ``parsing`` is given), or ``None``
        after printing a message when no track has the requested title.

    Raises:
        ValueError: if the number of embeddings differs from the number of
            rows in ``dataset``.
    """
    title_matches = dataset["title"].str.lower() == song_name.lower()

    if not title_matches.any():
        print(f"Трека: {song_name} нету в нашей базе данных..")
        return None

    # The hits are positions in `embedding_all_texts`; they are only valid as
    # row positions of `dataset` when both were built from the same rows.
    if len(embedding_all_texts) != len(dataset):
        raise ValueError(
            f"embedding_all_texts has {len(embedding_all_texts)} entries "
            f"but dataset has {len(dataset)} rows"
        )

    # lyrics of the first track with the requested title
    text = dataset.loc[title_matches, "text"].iloc[0]
    # vectorise them
    embedding_text = model.encode(text, device=device, batch_size=1)
    # nearest texts for this single query
    hits = util.semantic_search(embedding_text, embedding_all_texts, top_k=top_k)[0]
    # keep only the hits above the threshold
    positions = [hit["corpus_id"] for hit in hits if hit["score"] > threshold]

    # Take the rows from the `dataset` argument (not from a notebook global)
    # and copy them so that adding a column does not write into a slice.
    found = dataset.iloc[positions].copy()

    if parsing is not None:
        # resolve the performer name from the ISRC
        found["singer"] = found["isrc"].apply(parsing)

    return found.drop(columns=["original_track_id", "track_id", "language", "text"])

**Проверка работы функции для разных песен**

In [31]:
data_1 = search_covers(song_name="Oops I Did It Again", embedding_all_texts=embedding_all_texts, device=device, dataset=data, parsing=parse_singer)

In [32]:
data_1

Unnamed: 0,isrc,title,genres,track_remake_type,singer
7985,QZHZ62113597,Oops I Did It Again,POP,COVER,Unknown
8050,QZHZ62144078,Oops I Did It Again,POP,COVER,Unknown
7619,GBKPL2149328,Oops!...I Did It Again,FOLK,COVER,Unknown
7576,UK6KW2100211,Oops!... I Did It Again,"SOUL, RNB",COVER,Unknown
47,FISFS0500164,Oops!...I Did It Again,"METAL, DEATHMETAL",COVER,Children of Bodom
210,FIUM70900705,Oops... I Did It Again,"METAL, DEATHMETAL",COVER,Children of Bodom


In [33]:
data_2 = search_covers(song_name="My Favourite Game", embedding_all_texts=embedding_all_texts, device=device, dataset=data, parsing=parse_singer)

In [34]:
data_2

Unnamed: 0,isrc,title,genres,track_remake_type,singer
13,SEBKB9802010,My Favourite Game,"POP, ROCK, ALLROCK",ORIGINAL,The Cardigans
2070,US94P1414009,My Favourite Game,"METAL, ALTERNATIVEMETAL",COVER,Unknown


In [35]:
data_3 = search_covers(song_name="ของขวัญ", embedding_all_texts=embedding_all_texts, device=device, dataset=data, parsing=parse_singer)

In [36]:
data_3

Unnamed: 0,isrc,title,genres,track_remake_type,singer
4126,THWTD1801600,ของขวัญ,POP,COVER,Unknown


In [37]:
data_4 = search_covers(song_name="Wind of Change", embedding_all_texts=embedding_all_texts, device=device, dataset=data, parsing=parse_singer)

In [38]:
data_4

Unnamed: 0,isrc,title,genres,track_remake_type,singer
26,USPG19090037,Wind Of Change,"HARDROCK, ALLROCK",ORIGINAL,Scorpions
9513,USDPK2100153,Wind of Change,ALTERNATIVE,COVER,AWOLNATION
9876,DEYO62200029,Wind of Change,"METAL, METALCOREGENRE",COVER,Unknown


In [39]:
data_5 = search_covers(song_name="Wind of not Change", embedding_all_texts=embedding_all_texts, device=device, dataset=data, parsing=parse_singer)

Трека: Wind of not Change нету в нашей базе данных..
