In [1]:
from utils import get_artist, get_biggest_arts
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoTokenizer
from bert_lIghtning import songs_from_artists
from transformers import DataCollatorWithPadding
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import gc
from torch.cuda import empty_cache

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-element pair that unpacks into ``(scores, references)``.
            ``scores`` is arg-maxed along axis 1, so it needs one row per
            example and one column per class.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-maxed class ids
        compared against ``references``.
    """
    scores, references = eval_pred
    # Collapse the per-class scores to a single predicted class id per row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Metric object that compute_metrics looks up at call time.
accuracy = evaluate.load("accuracy")

# One checkpoint name feeds the tokenizer here and every classifier built later.
MODEL_NAME = "distilbert/distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Artist collections returned by get_biggest_arts for n=10 and n=30.
top10 = get_biggest_arts(10)
top30 = get_biggest_arts(30)

# Pieces of the experiment grid. None of these four names is read again in the
# visible cells; only `combined` below drives the training loop.
modes = ["solo", "features"]
song_limits_10 = [110, 180]
song_limits_30 = [190, 300]
arts_lists = [top10, top30]

# (artist list, mode, song limit) triples, in the order they are trained.
# NOTE(review): the values in `song_limits_10` (110, 180) are paired with top30
# here and those in `song_limits_30` (190, 300) with top10 — confirm whether the
# variable names or the pairing reflect the intent.
combined = [
    (top30, "solo", 110),
    (top30, "features", 180),
    (top10, "solo", 190),
    (top10, "features", 300),
]



  from .autonotebook import tqdm as notebook_tqdm
2024-06-28 07:51:42.413123: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-28 07:51:42.439211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 1049/1049 [00:04<00:00, 252.24it/s].58it/s]
sorting artists by lyrics length: 1049it [00:04, 251.89it/s]
100%|██████████| 1049/1049 [00:05<00:00, 199.13it/s].75it/s]
sorting artists by lyrics length: 1049it [00:05, 198.90it/s]


In [2]:
# Fine-tune and save one sequence classifier per (artist list, mode, song limit)
# triple in `combined`. The last iteration's label maps and datasets stay bound
# at module level afterwards.
# NOTE(review): the printed split sizes (e.g. 6117 + 1080 rows for 30 artists at
# a limit of 110) exceed artists x song_limit, so songs appear to contribute
# several rows each. A row-level random split may then place rows from the same
# song in both train and test — confirm against songs_from_artists.
for arts_list, mode, song_limit in combined:
    # Artist name -> class index, in the order the artists appear in the list.
    label2id = {artist.name_sanitized: idx for idx, artist in enumerate(arts_list)}
    data = songs_from_artists(
        arts_list, tokenizer, label2id=label2id, mode=mode, song_limit=song_limit
    )
    # Built only after songs_from_artists has seen label2id, as in the original flow.
    id2label = {idx: name for name, idx in label2id.items()}

    # 85/15 random split with a fixed seed, then back to lists of row dicts.
    df = pd.DataFrame(data)
    train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
    train_df_dict = train_df.to_dict(orient="records")
    test_df_dict = test_df.to_dict(orient="records")
    test_dataset = Dataset.from_list(test_df_dict)
    train_dataset = Dataset.from_list(train_df_dict)

    # Classification head sized to the number of artists in this configuration.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    # Each configuration gets its own checkpoint directory.
    output_dir = f"models/{len(arts_list)}_{mode}_{song_limit}"
    print(output_dir, len(train_dataset), len(test_dataset))

    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    # (remove_unused_columns is left at its default.)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=6,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(output_dir)

    # Release this run's model before the next configuration is loaded.
    del model, trainer
    gc.collect()
    empty_cache()

Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models/30_solo_110 6117 1080


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkatnak56[0m ([33mfirst_throw[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.864948,0.211111
2,3.157900,2.446953,0.338889
3,2.429100,2.144733,0.40463
4,1.910800,1.897428,0.488889
5,1.910800,1.831291,0.489815
6,1.491700,1.749066,0.52037


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models/30_features_180 10354 1828


Epoch,Training Loss,Validation Loss,Accuracy
1,3.193,2.721918,0.236324
2,2.653,2.274861,0.370897
3,2.2597,1.983755,0.432713
4,1.5483,1.825826,0.486324
5,1.2752,1.702107,0.517505
6,1.1068,1.677769,0.522429


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models/10_solo_190 3264 577


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.565342,0.462738
2,No log,1.192296,0.599653
3,1.562800,0.939511,0.679376
4,1.562800,0.748583,0.750433
5,0.621400,0.707688,0.759099
6,0.621400,0.669406,0.774697


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models/10_features_300 5602 989


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.485731,0.480283
2,1.765600,1.155149,0.587462
3,0.997400,0.983166,0.656218
4,0.997400,0.834856,0.707786
5,0.571200,0.864143,0.696663
6,0.356000,0.78309,0.73913


In [3]:
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

# Rebuild the collator around whatever `tokenizer` is currently bound, and
# reload the accuracy metric. Both names were already set in the first cell,
# so this cell rebinds them.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of arg-maxed predictions against the reference labels.

    This repeats the definition from the first cell and rebinds the same name.

    Args:
        eval_pred: Unpacks into ``(class_scores, gold_labels)``; the scores are
            reduced with ``np.argmax`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` (the module-level metric object).
    """
    class_scores, gold_labels = eval_pred
    return accuracy.compute(
        predictions=np.argmax(class_scores, axis=1),
        references=gold_labels,
    )

In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch.nn.functional as F

# Fresh classifier for the CustomTrainer experiment.
# `id2label` / `label2id` are whatever the last iteration of the training loop
# left bound at module level, so the head size is derived from them instead of
# being hard-coded. This keeps the two in agreement if the loop's final
# configuration ever changes (it currently ends on the 10-artist list).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

class CustomTrainer(Trainer):
    """Trainer subclass that computes the classification loss itself from the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)`` and
                expected to return an object with a ``logits`` attribute.
            inputs: Batch dict. Its ``"labels"`` entry is popped (the dict is
                mutated) so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this argument; not used here, the loss is
                the plain mean over the batch.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The logits are unnormalised scores. F.nll_loss expects log-probabilities,
        # so feeding it raw logits gives a meaningless (unbounded, often negative)
        # objective. cross_entropy applies log_softmax before the NLL step.
        loss = F.cross_entropy(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [6]:
# Settings for the CustomTrainer experiment: five epochs, evaluation and a
# checkpoint after each one, best checkpoint reloaded at the end.
# (push_to_hub is left at its default.)
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Uses the datasets left bound by the training loop. Neither `tokenizer` nor
# `data_collator` is passed, so the trainer falls back to its own default collation.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Unbind the model and trainer, then run the garbage collector and clear the
# CUDA cache so their memory can be reclaimed.
del model, trainer
gc.collect()
empty_cache()

In [None]:
# Display the evaluation split currently bound to `test_dataset` (columns and row count).
# NOTE(review): the saved output for this cell (1676 rows, with an
# `__index_level_0__` column) matches none of the split sizes printed by the
# training loop — it looks stale; re-run to confirm.
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'label', '__index_level_0__'],
    num_rows: 1676
})

In [None]:
# Display the training split currently bound to `train_dataset` (columns and row count).
# NOTE(review): the saved output for this cell (6702 rows, with an
# `__index_level_0__` column) matches none of the split sizes printed by the
# training loop — it looks stale; re-run to confirm.
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'label', '__index_level_0__'],
    num_rows: 6702
})

In [None]:
# Load the "bert-base-uncased" tokenizer and run the first example lyric through transform_text.
# NOTE(review): this rebinds the global `tokenizer` created from MODEL_NAME in
# the first cell. `data_collator` still wraps the earlier tokenizer, and
# re-running the training loop after this cell would tokenize with
# bert-base-uncased instead.
# NOTE(review): `transform_text` and `example_song_lyrics` are not defined or
# imported in the visible cells — presumably from a removed cell or a project
# module; confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
input_ids, attention_masks = transform_text(example_song_lyrics[0], tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1269 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Show the shapes of the two tensors returned by transform_text for the example lyric.
input_ids.shape, attention_masks.shape

(torch.Size([3, 512]), torch.Size([3, 512]))

In [None]:
# For the first 100 songs of the artist "Tede", print the title followed by the
# shapes of the tensors transform_text produces from the cleaned lyrics.
for song in get_artist("Tede").songs[:100]:
    print(song.title)
    lyrics = song.get_clean_song_lyrics()
    input_ids, attention_masks = transform_text(lyrics, tokenizer)
    print(input_ids.shape, attention_masks.shape)

#hot16challenge
torch.Size([2, 512]) torch.Size([2, 512])
Wyje Wyje Bane
torch.Size([3, 512]) torch.Size([3, 512])
Rainman
torch.Size([3, 512]) torch.Size([3, 512])
Michael Kors
torch.Size([2, 512]) torch.Size([2, 512])
#CTZK
torch.Size([2, 512]) torch.Size([2, 512])
Wunder-Baum
torch.Size([2, 512]) torch.Size([2, 512])
Ostatnia Noc
torch.Size([2, 512]) torch.Size([2, 512])
Pażałsta
torch.Size([2, 512]) torch.Size([2, 512])
Biełyje Nosy
torch.Size([3, 512]) torch.Size([3, 512])
T-Killa
torch.Size([3, 512]) torch.Size([3, 512])
Forever Ja
torch.Size([3, 512]) torch.Size([3, 512])
Brodaggacio
torch.Size([2, 512]) torch.Size([2, 512])
69 Ziomeczków
torch.Size([4, 512]) torch.Size([4, 512])
#COHF
torch.Size([2, 512]) torch.Size([2, 512])
Kot Gigant
torch.Size([2, 512]) torch.Size([2, 512])
Tough Love
torch.Size([3, 512]) torch.Size([3, 512])
CMRT
torch.Size([2, 512]) torch.Size([2, 512])
Feat. (+ Introdukcja)
torch.Size([3, 512]) torch.Size([3, 512])
Drin za drinem
torch.Size([3, 512]) tor

In [None]:
from utils import get_biggest_arts



In [None]:
# Gather up to 200 songs per artist from the 30 artists returned by
# get_biggest_arts, keeping only songs whose cleaned lyrics are not the empty string.
top30_arts = get_biggest_arts(30)
songs = [
    track
    for track in (entry for artist in top30_arts for entry in artist.songs[:200])
    if track.get_clean_song_lyrics() != ""
]

In [None]:
# Keep the element at index 2 of what chunk_text returns for the first example lyric.
# NOTE(review): `chunk_text` and `example_song_lyrics` are not defined or
# imported in the visible cells — confirm where they come from.
testing = chunk_text(example_song_lyrics[0], tokenizer)[2]

In [None]:
# Round-trip check: encode `testing` (no special tokens, no truncation), decode
# the resulting ids, and compare with the original string.
# NOTE(review): the recorded result is False. If `tokenizer` is still the
# "bert-base-uncased" one bound in an earlier cell, its text normalisation
# would presumably prevent an exact round trip — confirm.
testing == tokenizer.decode(tokenizer.encode(testing, add_special_tokens=False, truncation=False, return_tensors='pt')[0])

False

In [None]:
# Same round trip as the previous cell, with the intermediate values kept so
# they can be inspected side by side.
testing_encoded = tokenizer.encode(testing, add_special_tokens=False, truncation=False, return_tensors='pt')
# Index 0 selects the single encoded sequence from the returned batch tensor.
testing_decoded = tokenizer.decode(testing_encoded[0])
# Print, in order: the token ids, the decoded text, and the original text.
print(testing_encoded)
print(testing_decoded)
print(testing)

tensor([[ 1001,  1001, 12098,  6200,  6583,  2480,  4213, 24098,  2666,  8962,
          6633, 17491,  3217,  3676, 27838,  9761,  5004,  1010,  2000,  6448,
          4355,  2401,  1039,  4143,  6342, 14855,  2243, 29250,  2480,  2532,
          1062, 13476,  6200,  2278,  1012,  1012,  1012,  1012, 13970, 14756,
          7367, 11968,  2063,  5207,  5004, 21469,  2050,  1055,  2480,  9739,
          2226, 12849, 23344,  2102,  8034,  2022,  2480,  2933,  2226,  8945,
          5003,  2213, 24185, 19666,  2100,  5353, 14855,  2243,  2793,  2050,
         17235,  1052, 22123,  6305,  6583, 12170, 11283,  1039,  9096, 24185,
          2094,  3489,  1039,  9096,  1105, 17994,  2063,  1029,  2000,  2026,
         24185, 13728,  2100,  1105, 17994,  2063,  1105, 17994,  2050, 14768,
          1010,  5939,  1010, 22064,  3501,  1055, 18818, 17994,  6305,  6187,
         18818,  2063,  1059, 27006,  7033,  2617,  6776,  2000,  5003,  2213,
         27006,  2072,  2373, 27838,  3520,  2011,  