In [None]:
!pip install transformers[torch] -U
!pip install accelerate -U
!pip install datasets
!pip install evaluate
# from google.colab import files
# uploaded = files.upload()

Collecting transformers[torch]
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    

In [None]:
from google.colab import drive

# Locations of the training and validation CSVs on Google Drive.
# Edit these two paths if your copies live somewhere else.
train_file_path = "/content/drive/MyDrive/NLP/FinalProject/train.csv"
val_file_path = "/content/drive/MyDrive/NLP/FinalProject/val.csv"

# Mount Drive so the paths above are reachable from the notebook.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load as load_metric
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Read the training and validation CSVs from the configured paths.
train_data, val_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, val_file_path)
)

# Map each genre name in "Genre0" to an integer id. The encoder is fitted on the
# training split only and then reused, unchanged, for the validation split.
label_encoder = LabelEncoder()
train_data["label"] = label_encoder.fit_transform(train_data["Genre0"])
val_data["label"] = label_encoder.transform(val_data["Genre0"])

# Keep just the text and the numeric label, and wrap each split as a Hugging Face Dataset.
model_columns = ["Lyric", "label"]
train_dataset = Dataset.from_pandas(train_data[model_columns])
val_dataset = Dataset.from_pandas(val_data[model_columns])

dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Show the id -> genre mapping produced by the encoder.
print({class_id: genre for class_id, genre in enumerate(label_encoder.classes_)})


{0: 'Dance', 1: 'Heavy Metal', 2: 'Hip Hop', 3: 'Indie', 4: 'Pop', 5: 'Rock'}


In [None]:
# Load the pre-trained tokenizer that belongs to the checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize one batch of lyrics.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; its
            "Lyric" entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, with each example truncated to at
        most 512 tokens and left unpadded.
    """
    # No padding here on purpose. Padding inside a batched map() pads every example to
    # the longest lyric in its map batch and stores that padding in the dataset. Each
    # Trainer in this notebook is given the tokenizer, so it pads per training batch
    # instead (DataCollatorWithPadding), which avoids computing over padding tokens.
    return tokenizer(examples["Lyric"], truncation=True, max_length=512)


tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the pre-trained model with a classification head sized to the number of genres.
# The id <-> genre mapping from the label encoder is stored in the model config so that
# a saved model reports genre names rather than anonymous LABEL_<n> ids.
num_labels = len(label_encoder.classes_)
id2label = {class_id: str(genre) for class_id, genre in enumerate(label_encoder.classes_)}
label2id = {genre: class_id for class_id, genre in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/48493 [00:00<?, ? examples/s]

Map:   0%|          | 0/5389 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define metrics for evaluation
def compute_metrics(eval_pred):
    """Score a batch of model outputs against the reference labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class for
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        dict with "accuracy" and the weighted-average "f1" score.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1": f1_score(references, predicted_ids, average="weighted"),
    }

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Checkpoints are written under this Drive folder (one per epoch, see save_strategy).
    output_dir="/content/drive/MyDrive/NLP/FinalProj/Model_Test_Final/",
    # `eval_strategy` is the current name of this option; the older spelling
    # `evaluation_strategy` is a deprecated alias.
    eval_strategy="epoch",
    # Saved on the same schedule as evaluation, as load_best_model_at_end requires.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # "accuracy" is the key returned by compute_metrics; the Trainer looks it up as
    # "eval_accuracy", so the Trainer must be created with compute_metrics.
    metric_for_best_model="accuracy",
)



In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument. Passing the
    # tokenizer here lets the Trainer pad batches and save the tokenizer with checkpoints.
    processing_class=tokenizer,
    # Supplies the accuracy / f1 values that metric_for_best_model refers to.
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the validation split and show the metrics
results = trainer.evaluate()
print("Evaluation results:", results)

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9594,0.96925,0.62776,0.574264


Evaluation results: {'eval_loss': 0.969249963760376, 'eval_accuracy': 0.6277602523659306, 'eval_f1': 0.5742639390503201, 'eval_runtime': 82.4323, 'eval_samples_per_second': 65.375, 'eval_steps_per_second': 4.088, 'epoch': 1.0}


In [None]:
# Write the model and the tokenizer files into the same Drive folder.
model_export_dir = "/content/drive/MyDrive/NLP/genre_classification_model"
model.save_pretrained(model_export_dir)
tokenizer.save_pretrained(model_export_dir)

('/content/drive/MyDrive/NLP/genre_classification_model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/genre_classification_model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/genre_classification_model/vocab.txt',
 '/content/drive/MyDrive/NLP/genre_classification_model/added_tokens.json',
 '/content/drive/MyDrive/NLP/genre_classification_model/tokenizer.json')

In [None]:
# Checkpoint directory written by the earlier training run.
checkpoint_path = "/content/drive/MyDrive/NLP/FinalProj/Model_Test_Final/checkpoint-3031"

# Restore the tokenizer and the model weights from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP/FinalProj/Model_Test_Final",
    # `eval_strategy` is the current name of this option; `evaluation_strategy`
    # is a deprecated alias.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # Ensure the total number of epochs is at least 2
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Re-wrap the already-tokenized splits (nothing is re-read from disk here).
# NOTE: this rebinds `dataset`, which until now held the raw, untokenized splits.
dataset = DatasetDict({
    "train": tokenized_datasets["train"],
    "validation": tokenized_datasets["validation"]
})

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    # Required: without it the Trainer reports only eval_loss, the "accuracy" named in
    # metric_for_best_model cannot be found, and the run ends in a KeyError once the
    # epoch's evaluation finishes.
    compute_metrics=compute_metrics,
)

# Resume training from the specified checkpoint
trainer.train(resume_from_checkpoint=checkpoint_path)

# Evaluate the model after completing the second epoch
results = trainer.evaluate()
print("Evaluation results:", results)

  trainer = Trainer(
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
2,0.8315,0.934036


KeyError: "The `metric_for_best_model` training argument is set to 'eval_accuracy', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [None]:
# Evaluate whichever `trainer` is currently in scope and print the returned metrics
# dict. Which metrics appear depends on whether that trainer was created with a
# compute_metrics callback.
results = trainer.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss
2,0.8315,0.934036


Evaluation results: {'eval_loss': 0.9340356588363647}


In [None]:
# Build an evaluation-only Trainer around the current model so that accuracy and f1
# are reported alongside the loss. `Trainer` is already imported in the notebook's
# import cell, so it is not imported again here.
trainer = Trainer(
    model=model,  # Use the trained model
    args=training_args,  # Use the same training arguments
    eval_dataset=dataset["validation"],  # Validation dataset
    processing_class=tokenizer,  # Tokenizer (`processing_class` replaces the deprecated `tokenizer` argument)
    compute_metrics=compute_metrics  # Add compute_metrics
)
results = trainer.evaluate()
print("Evaluation results:", results)


  trainer = Trainer(


Evaluation results: {'eval_loss': 0.9340356588363647, 'eval_model_preparation_time': 0.0016, 'eval_accuracy': 0.6405641120801633, 'eval_f1': 0.6076551710945605, 'eval_runtime': 83.4827, 'eval_samples_per_second': 64.552, 'eval_steps_per_second': 4.037}
