# Import dependencies

In [1]:
# General
import numpy as np

# Dataset related
import pandas as pd
from datasets import Dataset

# Model related
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer

# Metrics
import evaluate

# Prepare Dataset

Import the dataset

In [2]:
# Read the Spotify songs CSV into a DataFrame
df = pd.read_csv("./data/spotify_songs.csv")

# Show every available column name as a plain Python list
df.columns.tolist()

['track_id',
 'track_name',
 'track_artist',
 'lyrics',
 'track_popularity',
 'track_album_id',
 'track_album_name',
 'track_album_release_date',
 'playlist_name',
 'playlist_id',
 'playlist_genre',
 'playlist_subgenre',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms',
 'language']

Remove unwanted columns & rows

In [3]:
# Keep only English-language songs and, in the same selection, only the two
# columns used downstream: the lyrics text and the valence score
df = df.loc[df['language'] == 'en', ['lyrics', 'valence']]

# Preview the filtered dataset
df.head()

Unnamed: 0,lyrics,valence
1,"The trees, are singing in the wind The sky blu...",0.404
2,"NA Yeah, Spyderman and Freeze in full effect U...",0.65
3,I really can't stay Baby it's cold outside I'v...,0.405
4,Get up out of my business You don't keep me fr...,0.24
5,"Hold your breath, don't look down, keep trying...",0.305


Convert valence scores into binary sentiment labels: valence > 0.5 → positive sentiment (label 1), otherwise negative sentiment (label 0)

In [4]:
# Binarise valence into a sentiment label: 1 when valence > 0.5, otherwise 0.
# The comparison is vectorised (no per-row Python lambda), and `.assign` builds a
# new frame instead of writing a column into a frame derived from a filtered
# slice, which can trigger pandas' SettingWithCopyWarning.
df = df.assign(label=(df["valence"] > 0.5).astype("int64"))[["lyrics", "label"]]

# Preview the lyrics/label pairs
df.head()

Unnamed: 0,lyrics,label
1,"The trees, are singing in the wind The sky blu...",0
2,"NA Yeah, Spyderman and Freeze in full effect U...",1
3,I really can't stay Baby it's cold outside I'v...,0
4,Get up out of my business You don't keep me fr...,0
5,"Hold your breath, don't look down, keep trying...",0


Prepare dataset for Hugging Face pipeline

In [5]:
# Fixed seed so the train/validation split is identical on every
# Restart & Run All (the value itself is arbitrary)
SPLIT_SEED = 42

# Convert to Hugging Face Dataset format.
# The index is reset first so the gaps left by row filtering do not end up in
# the converted dataset; rebinding `df` avoids an in-place mutation.
df = df.reset_index(drop=True)
dataset = Dataset.from_pandas(df)

# Split into training & validation sets (80-20 split), shuffled reproducibly
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data, val_data = dataset["train"], dataset["test"]

# Check Hugging Face dataset format
train_data, val_data

(Dataset({
     features: ['lyrics', 'label'],
     num_rows: 12324
 }),
 Dataset({
     features: ['lyrics', 'label'],
     num_rows: 3081
 }))

# Load Pretrained Sentiment Analysis Model from Hugging Face

Load pretrained sentiment analysis model and tokeniser

In [6]:
# Identifier of the pretrained sentiment-analysis checkpoint to start from
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

# The tokenizer and the two-label sequence-classification model are both
# loaded from that same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Perform tokenisation on the dataset

In [7]:
# Tokenisation helper applied to the dataset via `Dataset.map`
def tokenize_function(example):
    """Tokenise the ``lyrics`` field of an example (or batch of examples).

    Args:
        example: Mapping with a ``"lyrics"`` entry holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those lyrics when
        called with truncation enabled and the ``"max_length"`` padding strategy.
    """
    lyrics = example["lyrics"]
    encoded = tokenizer(lyrics, padding="max_length", truncation=True)
    return encoded

In [8]:
# Tokenise each split in batches, then drop the raw `lyrics` text column so
# only the label and the tokenizer outputs remain
train_data = train_data.map(tokenize_function, batched=True).remove_columns(["lyrics"])
val_data = val_data.map(tokenize_function, batched=True).remove_columns(["lyrics"])

# Inspect the resulting features of both splits
train_data, val_data

Map:   0%|          | 0/12324 [00:00<?, ? examples/s]

Map:   0%|          | 0/3081 [00:00<?, ? examples/s]

(Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 12324
 }),
 Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 3081
 }))

# Training the model

Specify the training arguments

In [9]:
# Hyper-parameters and checkpointing policy for fine-tuning.
# `eval_strategy` is the current name of the argument formerly called
# `evaluation_strategy`; the old spelling is deprecated, and this notebook
# already relies on a transformers release new enough to accept
# `processing_class` in `Trainer`.
training_args = TrainingArguments(
    output_dir="./fine_tuned_spotify_sentiment",  # where checkpoints are written
    eval_strategy="epoch",  # evaluate on the validation set after every epoch
    save_strategy="epoch",  # save a checkpoint after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



Specify evaluation metrics

In [10]:
# Load the accuracy metric once; `compute_metrics` reuses this object
accuracy_metric = evaluate.load(path="accuracy")

# Metric callback: turn logits into class predictions and score them
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the index of its largest logit along the last axis.

    Returns:
        The result of ``accuracy_metric.compute`` for those predictions against
        ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return accuracy_metric.compute(references=labels, predictions=predicted_classes)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Train the model

In [None]:
# Wire the model, training arguments, tokenised splits, tokenizer and metric
# callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the cell's last expression, the training result is displayed
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6428,0.65779,0.608569


# Evaluate the model

In [None]:
# Score the trained model on the evaluation split the Trainer was built with,
# then print the returned metrics
results = trainer.evaluate()
print(results)