In [2]:
from datasets import load_dataset, Dataset
MODEL_NAME = "distilbert/distilbert-base-uncased"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer
from torch import Tensor
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from utils import get_biggest_arts
top30 = get_biggest_arts(30)

def split_overlapping(tensor: Tensor, chunk_size: int = 510, stride: int = 400, min_chunk_len = 100) -> list[Tensor]:
    chunks = [tensor[i:i+chunk_size] for i in range(0, tensor.shape[0], stride)]
    if len(chunks) > 1:
        chunks = [chunk for chunk in chunks if len(chunk) >= min_chunk_len]
    return chunks

def chunk_text(text: str, tokenizer: AutoTokenizer) -> list[str]:
    big_tensor = tokenizer.encode(text, add_special_tokens=False, truncation=False, return_tensors='pt')
    chunks = split_overlapping(big_tensor[0])
    # reverse to text
    chunks = [tokenizer.decode(chunk) for chunk in chunks]
    return chunks

2024-06-26 15:29:25.297465: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 15:29:25.327855: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 1049/1049 [00:05<00:00, 205.30it/s].19it/s]
sorting artists by lyrics length: 1049it [00:05, 205.05it/s]


In [4]:
from sklearn.model_selection import train_test_split
data = []
for art in top30:
    for song in art.songs[:180]:
        chunks = chunk_text(song.get_clean_song_lyrics(), tokenizer)
        for chunk in chunks:
            data.append({'text': chunk, 'label': art.name})
import pandas as pd
df = pd.DataFrame(data)
label2id = {label: i for i, label in enumerate(df['label'].unique())}
id2label = {i: label for label, i in label2id.items()}
df['label'] = df['label'].map(label2id)
train_df, test_df = train_test_split(df, test_size=0.2)
test_dataset, train_dataset = Dataset.from_pandas(test_df), Dataset.from_pandas(train_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (669 > 512). Running this sequence through the model will result in indexing errors


In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [6]:
test_dataset = test_dataset.map(preprocess_function, batched=True)
train_dataset = train_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 2933/2933 [00:00<00:00, 9279.77 examples/s]
Map: 100%|██████████| 11732/11732 [00:01<00:00, 10291.22 examples/s]


In [49]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [50]:
import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [40]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label2id.keys()), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 11732
})

In [52]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  0%|          | 0/1468 [05:15<?, ?it/s]          

{'loss': 3.2882, 'grad_norm': 6.095839977264404, 'learning_rate': 1.3188010899182562e-05, 'epoch': 0.68}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

  0%|          | 0/1468 [06:58<?, ?it/s]         
[A
[A

{'eval_loss': 3.0080056190490723, 'eval_accuracy': 0.15888169110126152, 'eval_runtime': 21.846, 'eval_samples_per_second': 134.258, 'eval_steps_per_second': 8.423, 'epoch': 1.0}



  0%|          | 0/1468 [08:31<?, ?it/s]           

{'loss': 2.9997, 'grad_norm': 5.926347255706787, 'learning_rate': 6.376021798365123e-06, 'epoch': 1.36}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/1468 [11:38<?, ?it/s]         
[A
[A

{'eval_loss': 2.8014159202575684, 'eval_accuracy': 0.2158199795431299, 'eval_runtime': 22.2218, 'eval_samples_per_second': 131.987, 'eval_steps_per_second': 8.28, 'epoch': 2.0}



100%|██████████| 1468/1468 [09:16<00:00,  2.64it/s]

{'train_runtime': 556.8385, 'train_samples_per_second': 42.138, 'train_steps_per_second': 2.636, 'train_loss': 3.044351354281974, 'epoch': 2.0}





TrainOutput(global_step=1468, training_loss=3.044351354281974, metrics={'train_runtime': 556.8385, 'train_samples_per_second': 42.138, 'train_steps_per_second': 2.636, 'total_flos': 3109767098941440.0, 'train_loss': 3.044351354281974, 'epoch': 2.0})

In [53]:
trainer.evaluate(eval_dataset=test_dataset)

100%|██████████| 184/184 [00:21<00:00,  8.63it/s]


{'eval_loss': 2.8014159202575684,
 'eval_accuracy': 0.2158199795431299,
 'eval_runtime': 21.4659,
 'eval_samples_per_second': 136.635,
 'eval_steps_per_second': 8.572,
 'epoch': 2.0}