In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# following cells can change into a folder on the drive.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Switch the working directory to the thesis data folder on the mounted drive;
# the relative paths used by later cells (CSV file, ./FineTuning_DistilBERT/...)
# resolve against it. NOTE(review): absolute, user-specific path.
cd '/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data'

/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Valence

In [None]:
# Read the song table; lyrics are the model input, the valence column the target.
data_orig = pd.read_csv("tcc_ceds_music.csv")
X = data_orig['lyrics'].tolist()
y = data_orig['valence'].tolist()

# Hold out 20% of the rows for testing; the fixed seed keeps the partition stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)
len(y_train)

22697

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same options (truncation and padding on).
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train, **tokenize_options)
test_encodings = tokenizer(X_test, **tokenize_options)

In [None]:
import torch

#  create Dataset Objects
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset pairing tokenizer encodings with float regression targets.

    Args:
        encodings: Mapping from field name to a per-sample sequence. Only
            ``.items()``, ``len()`` of each value and integer indexing are used.
        labels: Sequence of numeric targets, one per sample.

    Raises:
        ValueError: If an encoding field does not hold exactly one entry per label.
    """

    def __init__(self, encodings, labels):
        # __len__ follows `labels`, so without this check surplus encodings would
        # be ignored silently and missing ones would only fail at index time.
        expected = len(labels)
        for key, val in encodings.items():
            if len(val) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} entries but there are {expected} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # ensure labels are float tensors
        return item

# Wrap each split's encodings together with its targets.
train_dataset = CustomDataset(encodings=train_encodings, labels=y_train)
test_dataset = CustomDataset(encodings=test_encodings, labels=y_test)

In [None]:
from torch.utils.data import DataLoader

# Batch iterators over the two splits; only the training split is shuffled.
# NOTE(review): the cells visible in this notebook hand the datasets (not these
# loaders) to the Trainer, so these objects look unused - confirm before keeping.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

## All Parameters
* Fine-tune all parameters: the DistilBERT base model together with the single-output head (`num_labels=1`)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Pretrained DistilBERT with a single-output head (num_labels=1).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
)

# Tune the whole network: mark every base-model parameter as trainable.
model.distilbert.requires_grad_(True)

# Run configuration: where results/logs go, schedule and batch sizes.
training_args = TrainingArguments(
    output_dir='./FineTuning_DistilBERT/results',
    logging_dir='./FineTuning_DistilBERT/logs',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
)

# The Trainer receives the dataset objects directly; no metric callback is set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=None,
)

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.3623
20,0.0805
30,0.0669
40,0.0829
50,0.0661
60,0.0662
70,0.0535
80,0.0752
90,0.067
100,0.0826


TrainOutput(global_step=2838, training_loss=0.05970575394262471, metrics={'train_runtime': 1239.3151, 'train_samples_per_second': 18.314, 'train_steps_per_second': 2.29, 'total_flos': 3006558928628736.0, 'train_loss': 0.05970575394262471, 'epoch': 1.0})

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse
# 3 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.059497926

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 2 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.05446202

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 1 epoch
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.054207295

In [None]:
# Persist the model held by the current trainer (all parameters, valence, 1 epoch).
valence_all_params_dir = "./FineTuning_DistilBERT/model_AllParams_Valence_1epochs"
trainer.save_model(valence_all_params_dir)

## Regression Head

* Fine-tune only the regression head; the DistilBERT base model is frozen

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Number of passes over the training set. Defined once so that the training
# arguments and the name of the saved model directory cannot drift apart.
NUM_EPOCHS = 4

# Initialize the model with a single-output head (num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# Freeze the DistilBERT base model
for param in model.distilbert.parameters():
    param.requires_grad = False

# Ensure the classification head parameters are trainable
# NOTE(review): `model.pre_classifier` is not touched in this cell, so it keeps
# whatever requires_grad setting it was created with - confirm this is intended.
for param in model.classifier.parameters():
    param.requires_grad = True


# Define training arguments
training_args = TrainingArguments(
    output_dir='./FineTuning_DistilBERT/results',          # output directory
    num_train_epochs=NUM_EPOCHS,                           # total number of training epochs
    per_device_train_batch_size=8,                         # batch size per device during training
    per_device_eval_batch_size=16,                         # batch size for evaluation
    warmup_steps=50,                                       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                     # strength of weight decay
    logging_dir='./FineTuning_DistilBERT/logs',            # directory for storing logs
    logging_steps=10,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=None                 # No metrics required for regression
)

trainer.train()
# The directory name carries the epoch count actually used for this run.
trainer.save_model(f"./FineTuning_DistilBERT/model_RegressionHead_Valence_{NUM_EPOCHS}epochs")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.2878
20,0.1867
30,0.1086
40,0.0663
50,0.0747
60,0.0605
70,0.0614
80,0.0686
90,0.0733
100,0.0784


In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 1 epoch
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.057231616

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 2 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.056826822

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 3 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.05652605

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer
# NOTE(review): this cell sits in the Valence section but names an Arousal model
# directory, and no cell visible in this notebook saves to this exact path.
# Confirm the intended path before relying on it.
path_model_DistilBERT_RegressionHead_Arousal = "./FineTuning_DistilBERT/model_RegressionHead_Arousal_4epochs"
# Disabled: reload the saved model and score it. `get_MSE` is not defined in any
# cell visible here, so these lines cannot be re-enabled as they stand.
# model_DistilBERT_RegressionHead_Arousal = DistilBertForSequenceClassification.from_pretrained(path_model_DistilBERT_RegressionHead_Arousal)
# MSE_DistilBERT_finetuned_RegressionHead_Arousal = get_MSE(model_DistilBERT_RegressionHead_Arousal, test_dataset)


In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 4 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.060633723

# Arousal

In [None]:
# Read the song table again; lyrics are the input, the energy column is the
# target used for the arousal experiments.
data_orig = pd.read_csv("tcc_ceds_music.csv")
X = data_orig['lyrics'].tolist()
y = data_orig['energy'].tolist()

# Same 80/20 partition and seed as used for the valence target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)
len(y_train)

22697

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same options (truncation and padding on).
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train, **tokenize_options)
test_encodings = tokenizer(X_test, **tokenize_options)

In [None]:
import torch

#  Create Dataset Objects
from torch.utils.data import Dataset

# NOTE(review): a class with this name is also defined earlier in the notebook;
# running this cell rebinds the name to the definition below.
class CustomDataset(Dataset):
    """Expose tokenizer encodings plus float targets as an indexable dataset.

    Args:
        encodings: Mapping of field name -> per-sample sequence; accessed via
            ``.items()``, ``len()`` of each value and integer indexing.
        labels: Sequence of numeric targets, one per sample.

    Raises:
        ValueError: If a field's length differs from the number of labels.
    """

    def __init__(self, encodings, labels):
        # The dataset length is taken from `labels`; reject inputs whose fields
        # disagree with it instead of silently dropping or mis-indexing samples.
        n_samples = len(labels)
        for field, values in encodings.items():
            if len(values) != n_samples:
                raise ValueError(
                    f"encodings[{field!r}] has {len(values)} entries but there are {n_samples} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the dict of tensors for sample ``idx``, with the target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # ensure labels are float tensors
        return item

# Wrap each split's encodings together with its targets.
train_dataset = CustomDataset(encodings=train_encodings, labels=y_train)
test_dataset = CustomDataset(encodings=test_encodings, labels=y_test)

In [None]:
from torch.utils.data import DataLoader

# Batch iterators over the two splits; only the training split is shuffled.
# NOTE(review): the cells visible in this notebook hand the datasets (not these
# loaders) to the Trainer, so these objects look unused - confirm before keeping.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

## All Parameters

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Number of passes over the training set. Defined once so that the training
# arguments and the name of the saved model directory cannot drift apart.
NUM_EPOCHS = 2

# Pretrained DistilBERT with a single-output head (num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# specify that all parameters of the model are tuned:
for param in model.distilbert.parameters():
    param.requires_grad = True


# Define training arguments
training_args = TrainingArguments(
    output_dir='./FineTuning_DistilBERT/results',          # output directory
    num_train_epochs=NUM_EPOCHS,                           # total number of training epochs
    per_device_train_batch_size=8,                         # batch size per device during training
    per_device_eval_batch_size=16,                         # batch size for evaluation
    warmup_steps=50,                                       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                     # strength of weight decay
    logging_dir='./FineTuning_DistilBERT/logs',            # directory for storing logs
    logging_steps=10,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=None                 # No metrics required for regression
)

trainer.train()
# The directory name carries the epoch count actually used for this run.
trainer.save_model(f"./FineTuning_DistilBERT/model_AllParams_Arousal_{NUM_EPOCHS}epochs")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.2898
20,0.0814
30,0.0769
40,0.0573
50,0.0615
60,0.0695
70,0.0645
80,0.0636
90,0.0616
100,0.0502


In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 1 epoch
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.041901615

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 2 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.042106908

## Regression Head


In [None]:
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())  # Should print True if GPU is available

True


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Number of passes over the training set. Defined once so that the training
# arguments and the name of the saved model directory cannot drift apart.
NUM_EPOCHS = 5

# Initialize the model with a single-output head (num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# Freeze the DistilBERT base model
for param in model.distilbert.parameters():
    param.requires_grad = False

# Ensure the classification head parameters are trainable
# NOTE(review): `model.pre_classifier` is not touched in this cell, so it keeps
# whatever requires_grad setting it was created with - confirm this is intended.
for param in model.classifier.parameters():
    param.requires_grad = True


# Define training arguments
training_args = TrainingArguments(
    output_dir='./FineTuning_DistilBERT/results',          # output directory
    num_train_epochs=NUM_EPOCHS,                           # total number of training epochs
    per_device_train_batch_size=8,                         # batch size per device during training
    per_device_eval_batch_size=16,                         # batch size for evaluation
    warmup_steps=50,                                       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                     # strength of weight decay
    logging_dir='./FineTuning_DistilBERT/logs',            # directory for storing logs
    logging_steps=10,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=None                 # No metrics required for regression
)

trainer.train()

# The directory name carries the epoch count actually used for this run.
trainer.save_model(f"./FineTuning_DistilBERT/model_RegressionHead_Arousal_{NUM_EPOCHS}epochs")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.3566
20,0.2739
30,0.1129
40,0.0704
50,0.059
60,0.0768
70,0.0591
80,0.058
90,0.0536
100,0.0526


In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 1 epoch
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.04561369

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 2 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.044987187

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 3 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.044609256

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 4 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.04434566

In [None]:
# MSE test
from sklearn.metrics import mean_squared_error

# Score the current `trainer` on the test split: `.predictions` holds the model
# outputs, `.label_ids` the reference values stored in the dataset.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions
true_labels = predictions.label_ids

# scikit-learn's signature is (y_true, y_pred); pass by keyword so the roles are
# explicit. MSE is symmetric, so the reported number is unchanged.
mse = mean_squared_error(y_true=true_labels, y_pred=predicted_labels)
mse

# 5 epochs
# NOTE(review): this label is not derived from code; it depends on `trainer`
# having been trained with that epoch count before this cell ran.

0.044174626