In [1]:
from utils import get_artist, get_biggest_arts
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoTokenizer
from torch import Tensor
import torch
MODEL_NAME = "distilbert-base-uncased"

def songs_from_artists(arts, tokenizer,label2id: dict, song_limit: int = 300, ):
    data = []
    for art in arts:
        for song in art.songs[:song_limit]:
            input_ids, attention_mask = transform_text(song.lyrics, tokenizer)
            for one_input, one_mask in zip(input_ids, attention_mask):
                data.append({
                    "label": label2id[song.artist_name],
                    "input_ids": one_input.numpy(),
                    "attention_mask": one_mask.numpy(),
                })
    return data

def chunks_from_artists(arts, tokenizer,label2id: dict, song_limit: int = 300, ):
    inputs = []
    attentions = []
    labels = []
    for art in arts:
        for song in art.songs[:song_limit]:
            input_ids, attention_mask = transform_text(song.lyrics, tokenizer)
            for one_input, one_mask in zip(input_ids, attention_mask):
                    inputs.append(one_input)
                    attentions.append(one_mask)
                    labels.append(label2id[song.artist_name])
    return inputs, attentions, labels

def tokenize(text, tokenizer: AutoTokenizer) -> tuple[Tensor, Tensor]:
    result = tokenizer(text, add_special_tokens=False, truncation=False, return_tensors='pt')
    return result["input_ids"][0], result["attention_mask"][0]

def split_overlapping(tensor: Tensor, chunk_size: int = 510, stride: int = 400, min_chunk_len = 100) -> list[Tensor]:
    chunks = [tensor[i:i+chunk_size] for i in range(0, tensor.shape[0], stride)]
    if len(chunks) > 1:
        chunks = [chunk for chunk in chunks if len(chunk) >= min_chunk_len]
    return chunks

def add_special_tokens(input_chunks: list[Tensor], mask_chunks: list[Tensor]):
    for i in range(len(input_chunks)):
        input_chunks[i] = torch.cat([torch.tensor([101]), input_chunks[i], torch.tensor([102])])
        mask_chunks[i] = torch.cat([torch.tensor([1]), mask_chunks[i], torch.tensor([1])])

def add_padding(input_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
    for i in range(len(input_chunks)):
        pad_len = 512 - input_chunks[i].shape[0]
        input_chunks[i] = torch.cat([input_chunks[i], torch.tensor([tokenizer.pad_token_id] * pad_len)])
        mask_chunks[i] = torch.cat([mask_chunks[i], torch.tensor([0] *pad_len)])
        
def stack_chunks(input_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
    return torch.stack(input_chunks).long(), torch.stack(mask_chunks).int()

def transform_text(
    text: str,
    tokenizer: AutoTokenizer,
    chunk_size: int = 510,
    stride: int = 400,
    min_chunk_len = 100,
    ):
    id_long, mask_long = tokenize(text, tokenizer)
    id_chunks = split_overlapping(id_long, chunk_size, stride, min_chunk_len)
    mask_chunks = split_overlapping(mask_long, chunk_size, stride, min_chunk_len)
    
    add_special_tokens(id_chunks, mask_chunks)
    add_padding(id_chunks, mask_chunks)
    input_ids, attention_mask = stack_chunks(id_chunks, mask_chunks)
    return input_ids, attention_mask

  from .autonotebook import tqdm as notebook_tqdm
2024-06-26 15:34:19.654822: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 15:34:19.683494: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
top10 = get_biggest_arts(10)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
id2label ={i:label for i, label in enumerate((art.name_sanitized for art in top10))}
label2id = {label:i for i, label in id2label.items()}
# split the data

data = songs_from_artists(top10, tokenizer, label2id)
df = pd.DataFrame(data)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

100%|██████████| 1049/1049 [00:04<00:00, 220.22it/s].71it/s]
sorting artists by lyrics length: 1049it [00:04, 219.92it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (722 > 512). Running this sequence through the model will result in indexing errors


In [3]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch.nn.functional as F
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=10, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = F.nll_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [6]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # tokenizer=tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkatnak56[0m ([33mfirst_throw[0m). Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 20%|██        | 419/2095 [02:34<09:30,  2.94it/s]

{'eval_loss': -11.056400299072266, 'eval_accuracy': 0.14498806682577567, 'eval_runtime': 12.6295, 'eval_samples_per_second': 132.705, 'eval_steps_per_second': 8.314, 'epoch': 1.0}


 24%|██▍       | 500/2095 [03:03<09:12,  2.89it/s]  

{'loss': -6.236, 'grad_norm': 22.738264083862305, 'learning_rate': 1.5226730310262532e-05, 'epoch': 1.19}


                                                  
 40%|████      | 838/2095 [05:12<07:06,  2.95it/s]

{'eval_loss': -28.195188522338867, 'eval_accuracy': 0.13484486873508353, 'eval_runtime': 12.6004, 'eval_samples_per_second': 133.011, 'eval_steps_per_second': 8.333, 'epoch': 2.0}


 48%|████▊     | 1000/2095 [06:10<06:18,  2.89it/s] 

{'loss': -24.4335, 'grad_norm': 49.09849166870117, 'learning_rate': 1.045346062052506e-05, 'epoch': 2.39}


                                                   
 60%|██████    | 1257/2095 [07:51<04:43,  2.96it/s]

{'eval_loss': -45.8148307800293, 'eval_accuracy': 0.13484486873508353, 'eval_runtime': 12.5051, 'eval_samples_per_second': 134.025, 'eval_steps_per_second': 8.397, 'epoch': 3.0}


 72%|███████▏  | 1500/2095 [09:16<03:25,  2.89it/s]

{'loss': -45.2292, 'grad_norm': 57.98971176147461, 'learning_rate': 5.68019093078759e-06, 'epoch': 3.58}


                                                   
 80%|████████  | 1676/2095 [10:30<02:22,  2.93it/s]

{'eval_loss': -58.77919006347656, 'eval_accuracy': 0.13484486873508353, 'eval_runtime': 12.6798, 'eval_samples_per_second': 132.179, 'eval_steps_per_second': 8.281, 'epoch': 4.0}


 95%|█████████▌| 2000/2095 [12:24<00:33,  2.86it/s]

{'loss': -59.8031, 'grad_norm': 65.0184326171875, 'learning_rate': 9.069212410501194e-07, 'epoch': 4.77}


                                                   
100%|██████████| 2095/2095 [13:09<00:00,  2.98it/s]

{'eval_loss': -63.568782806396484, 'eval_accuracy': 0.13484486873508353, 'eval_runtime': 12.4826, 'eval_samples_per_second': 134.267, 'eval_steps_per_second': 8.412, 'epoch': 5.0}


100%|██████████| 2095/2095 [13:10<00:00,  2.65it/s]

{'train_runtime': 797.8193, 'train_samples_per_second': 42.002, 'train_steps_per_second': 2.626, 'train_loss': -35.26774558565781, 'epoch': 5.0}





TrainOutput(global_step=2095, training_loss=-35.26774558565781, metrics={'train_runtime': 797.8193, 'train_samples_per_second': 42.002, 'train_steps_per_second': 2.626, 'total_flos': 4439615832576000.0, 'train_loss': -35.26774558565781, 'epoch': 5.0})

In [7]:
test_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask', '__index_level_0__'],
    num_rows: 1676
})

In [8]:
train_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask', '__index_level_0__'],
    num_rows: 6702
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
input_ids, attention_masks = transform_text(example_song_lyrics[0], tokenizer)

NameError: name 'example_song_lyrics' is not defined

In [None]:
input_ids.shape, attention_masks.shape

(torch.Size([3, 512]), torch.Size([3, 512]))

In [None]:
for song in get_artist("Tede").songs[:100]:
    print(song.title)
    input_ids, attention_masks = transform_text(song.get_clean_song_lyrics(), tokenizer)
    print(input_ids.shape, attention_masks.shape)

#hot16challenge
torch.Size([2, 512]) torch.Size([2, 512])
Wyje Wyje Bane
torch.Size([3, 512]) torch.Size([3, 512])
Rainman
torch.Size([3, 512]) torch.Size([3, 512])
Michael Kors
torch.Size([2, 512]) torch.Size([2, 512])
#CTZK
torch.Size([2, 512]) torch.Size([2, 512])
Wunder-Baum
torch.Size([2, 512]) torch.Size([2, 512])
Ostatnia Noc
torch.Size([2, 512]) torch.Size([2, 512])
Pażałsta
torch.Size([2, 512]) torch.Size([2, 512])
Biełyje Nosy
torch.Size([3, 512]) torch.Size([3, 512])
T-Killa
torch.Size([3, 512]) torch.Size([3, 512])
Forever Ja
torch.Size([3, 512]) torch.Size([3, 512])
Brodaggacio
torch.Size([2, 512]) torch.Size([2, 512])
69 Ziomeczków
torch.Size([4, 512]) torch.Size([4, 512])
#COHF
torch.Size([2, 512]) torch.Size([2, 512])
Kot Gigant
torch.Size([2, 512]) torch.Size([2, 512])
Tough Love
torch.Size([3, 512]) torch.Size([3, 512])
CMRT
torch.Size([2, 512]) torch.Size([2, 512])
Feat. (+ Introdukcja)
torch.Size([3, 512]) torch.Size([3, 512])
Drin za drinem
torch.Size([3, 512]) tor

In [None]:
from utils import get_biggest_arts



In [None]:
top30_arts = get_biggest_arts(30)
songs = [song for art in top30_arts for song in art.songs[:200] if song.get_clean_song_lyrics() != ""]

In [None]:
testing = chunk_text(example_song_lyrics[0], tokenizer)[2]

In [None]:
testing == tokenizer.decode(tokenizer.encode(testing, add_special_tokens=False, truncation=False, return_tensors='pt')[0])

False

In [None]:
testing_encoded = tokenizer.encode(testing, add_special_tokens=False, truncation=False, return_tensors='pt')
testing_decoded = tokenizer.decode(testing_encoded[0])
print(testing_encoded)
print(testing_decoded)
print(testing)

tensor([[ 1001,  1001, 12098,  6200,  6583,  2480,  4213, 24098,  2666,  8962,
          6633, 17491,  3217,  3676, 27838,  9761,  5004,  1010,  2000,  6448,
          4355,  2401,  1039,  4143,  6342, 14855,  2243, 29250,  2480,  2532,
          1062, 13476,  6200,  2278,  1012,  1012,  1012,  1012, 13970, 14756,
          7367, 11968,  2063,  5207,  5004, 21469,  2050,  1055,  2480,  9739,
          2226, 12849, 23344,  2102,  8034,  2022,  2480,  2933,  2226,  8945,
          5003,  2213, 24185, 19666,  2100,  5353, 14855,  2243,  2793,  2050,
         17235,  1052, 22123,  6305,  6583, 12170, 11283,  1039,  9096, 24185,
          2094,  3489,  1039,  9096,  1105, 17994,  2063,  1029,  2000,  2026,
         24185, 13728,  2100,  1105, 17994,  2063,  1105, 17994,  2050, 14768,
          1010,  5939,  1010, 22064,  3501,  1055, 18818, 17994,  6305,  6187,
         18818,  2063,  1059, 27006,  7033,  2617,  6776,  2000,  5003,  2213,
         27006,  2072,  2373, 27838,  3520,  2011,  