In [1]:
!pip uninstall -y datasets
!pip uninstall -y pandas

Found existing installation: datasets 2.1.0
Uninstalling datasets-2.1.0:
  Successfully uninstalled datasets-2.1.0
Found existing installation: pandas 2.0.2
Uninstalling pandas-2.0.2:
  Successfully uninstalled pandas-2.0.2


In [None]:
!pip install GPUtil
!pip install wandb
!pip install pandas==1.5.3
!pip install transformers
#!pip install datasets ==2.11 for load_datasets
!pip install datasets==2.10
!pip install optuna/sigopt/wandb/ray[tune] 
!pip install evaluate
!pip install git+https://github.com/google-research/bleurt.git
!pip install bert-score

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import os
import torch
import wandb
import math
from typing import List, Dict
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from GPUtil import showUtilization as gpu_usage
from datasets import load_dataset, concatenate_datasets
from transformers import (AutoTokenizer, AutoModelForCausalLM, Trainer,
                          TrainingArguments,DataCollatorForLanguageModeling, pipeline,
                          EarlyStoppingCallback)




### Load and prepare data

In [2]:
df = pd.read_csv("/kaggle/input/processed-taylor-tilted/processed_df_titled.csv")

In [3]:
ds = load_dataset("csv", data_files="/kaggle/input/processed-taylor-tilted/processed_df_titled.csv", split = "train")

In [None]:
ds

In [None]:
print(f"Train dataset size: {len(ds)}")

In [None]:
print(f"TRAINING SAMPLE: \n{ds['titled_lyrics_with_custom_eos'][0]}")

In [None]:
# define model
MODEL="gpt2"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# tokenize dataset
tokenizer.add_special_tokens({"eos_token": '<END>'})
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenized_dataset = ds.map(lambda x: tokenizer(x["titled_lyrics_with_custom_eos"] , truncation = True, return_special_tokens_mask=True), batched=True, remove_columns =["Tracks","Album_ID", "Album", "Album_Path"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_dataset.format

In [None]:
# split the dataset
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [None]:
tokenizer.encode("<END>")

In [None]:
tokenized_dataset

### Model fine-tuning

In [None]:
# instantiate the model
def model_init():
    model = AutoModelForCausalLM.from_pretrained(MODEL)
    model.resize_token_embeddings(len(tokenizer))
    return model

In [None]:
#define wandb variables for logging
wandb.login()
os.environ["WANDB_PROJECT"] = "song-generator" # log to your project 
%env WANDB_LOG_MODEL=true

In [None]:
# define training arguments
# src: https://huggingface.co/docs/transformers/v4.33.3/en/main_classes/trainer#transformers.TrainingArguments

training_args = TrainingArguments(
    output_dir="/kaggle/working/finetuned_gpt2",
    evaluation_strategy="steps",
    save_strategy = "steps",
    eval_steps = 250,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_strategy = "epoch",
    num_train_epochs = 10,
    per_device_train_batch_size = 4,
    optim = "adamw_torch",
    report_to="wandb",
    fp16 = True,
    group_by_length = True,
    metric_for_best_model='eval_loss',
    run_name = "baseline_gpt2_finetune",
    greater_is_better = False,
    load_best_model_at_end = True
)

In [None]:
# train GPT2
# src: https://huggingface.co/docs/transformers/main_classes/trainer

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator = data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)],
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

In [None]:
trainer.train()

In [None]:
# evaluate the model with perplexity

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# save model
trainer.save_model()

In [None]:
# save tokenizer
tokenizer.save_pretrained("/kaggle/working/finetuned_gpt2")

In [None]:
# save model into kaggle local dir
!zip -r baseline_gpt2.zip /kaggle/working/finetuned_gpt2

### Song Generation

In [11]:
# load model
# download model artifact from wandb

PROJECT_ID = 'jbarata1998/song-generator/model-baseline_gpt2_finetune:v5'

run = wandb.init()
artifact = run.use_artifact(PROJECT_ID, type='model')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Downloading large artifact model-baseline_gpt2_finetune:v5, 474.75MB. 4 files... 
[34m[1mwandb[0m:   4 of 4 files downloaded.  
Done. 0:0:1.2


In [18]:
MODEL_ID_TEXT_GEN = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID_TEXT_GEN)

In [19]:
tokenizer.save_pretrained(artifact_dir)

('./artifacts/model-baseline_gpt2_finetune:v5/tokenizer_config.json',
 './artifacts/model-baseline_gpt2_finetune:v5/special_tokens_map.json',
 './artifacts/model-baseline_gpt2_finetune:v5/vocab.json',
 './artifacts/model-baseline_gpt2_finetune:v5/merges.txt',
 './artifacts/model-baseline_gpt2_finetune:v5/added_tokens.json',
 './artifacts/model-baseline_gpt2_finetune:v5/tokenizer.json')

In [20]:
# test with 3 verses from Olivia Rodrigo (similar artist)
test_prompt = "Generate a song and its title:Well, good for you, I guess you moved on really easily\nYou found a new girl and it only took a couple weeks\nRemember when you said that you wanted to give me the world?"

In [21]:
# baseline model
generator = pipeline('text-generation', model= MODEL_ID_TEXT_GEN, device="cuda:0")

# Generate text and show results
result = generator(test_prompt, top_k=5, max_new_tokens = 400)

print(result[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate a song and its title:Well, good for you, I guess you moved on really easily
You found a new girl and it only took a couple weeks
Remember when you said that you wanted to give me the world?
I was thinking about it, so I thought about how you're gonna make me feel like a princess
But you didn't know I wanted to do that. You're gonna make me feel like you're a princess
I'm not gonna let you do that. I'm gonna do it for you.
You were gonna give me all the money, and I'm gonna make you feel better
So I was like, "I'm going to take care of this."
I was like, "I don't need that. I don't even know what to do with you right now. You don't even know what you're gonna do. I'm going to be your princess. And you're gonna be my queen. I'm going to be my queen. And you're going to be your queen."
I'm not even going to let you do that, you know.
You're gonna be like, "I'm going to do it. It's not gonna take any more than that." You can't take care of it. You can't take care of me.
And you ca

In [22]:
# finetuned model
generator = pipeline('text-generation', model= artifact_dir, device="cuda:0")

# Generate text and show results
result = generator(test_prompt, top_p=0.7, do_sample = True,  max_new_tokens = 1024 - len(tokenizer.encode(test_prompt)), eos_token_id = tokenizer.encode("<END"))

print(result[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:27 for open-end generation.


Generate a song and its title:Well, good for you, I guess you moved on really easily
You found a new girl and it only took a couple weeks
Remember when you said that you wanted to give me the world?
And how you said, "I'd love to have you"
Well, you know, I have had a couple times, yeah, it's been good

[Verse 1]
I'm in the car with my girlfriend, she's on the phone with her boyfriend
I'm in the passenger seat, the conversation starts
'Cause you're a little late for school
And she says, "I don't wanna see you like this"
And she says, "I think it's kinda frightenin' time"
I say, "Well, I'm just gonna stay in this car all alone"

[Chorus]
'Cause it's frightenin' time, it's frightenin' time
It's frightenin' time, it's frightenin' time
I'm scared to walk alone
I'm scared to talk with my friends
I'm scared to run away
I'm scared to run away
[Verse 2]
We had a long weekend, we had a good time
And you said that you want to spend the weekend with me
And I said, "I'd love to have you"
But you m

### Song Generator Evaluation

In [23]:
# get songs from related styles artists
df_ref_lyrics = pd.read_csv("/kaggle/input/genius-song-lyrics-with-language-information/song_lyrics.csv", nrows=10000, engine = "c")
df_ref_lyrics

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...
9995,Died 4 U,rap,Bizzy Bone,2004,865,{},[Hook]\nWish I would a died for you baby\nI wo...,10450,en,en,en
9996,Not Afraid,rap,Bizzy Bone,2004,708,{},Yeah\nStudio rap productions (this is how we r...,10451,en,en,en
9997,Sit Back Relax,rap,Bizzy Bone,2004,1861,{},[Verse 1]\nHey it's the Martin and Malcolm\nAn...,10452,en,en,en
9998,What Have I Learned,rap,Bizzy Bone,2008,1902,{},"Yeah\nSome people wonder, ya know?\nThey say w...",10453,en,en,en


In [24]:
df_ref_lyrics = df_ref_lyrics[df_ref_lyrics["tag"] == "pop"]

In [25]:
df_ref_lyrics = df_ref_lyrics.reset_index()

In [26]:
df_ref_lyrics

Unnamed: 0,index,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,239,Wordy Rappinghood,pop,Tom Tom Club,1981,26499,{},[Chorus]\nWhat are words worth?\nWhat are word...,242,en,en,en
1,389,Horchata,pop,Vampire Weekend,2009,102550,{},"[Verse 1]\nIn December, drinking horchata\nI'd...",384,en,en,en
2,516,Heartless,pop,Kanye West,2008,1175109,{},"[Chorus]\nIn the night, I hear 'em talk\nThe c...",526,en,en,en
3,557,Flashing Lights,pop,Kanye West,2007,1078113,{Dwele},[Intro: Connie Mitchell]\nFlashing lights (Lig...,523,en,en,en
4,588,Baby,pop,Justin Bieber,2010,2232442,{Ludacris},[Produced by The-Dream and Tricky Stewart]\n\n...,566,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...,...
137,5913,Kiss N Tell,pop,Kesha,2010,12566,{},"[Verse 1]\nListen to yourself, you're a hot me...",25994,en,en,en
138,5962,They Dont Care About Us,pop,Michael Jackson,1995,993938,{},[Intro]\nAll I want to say is that they don't ...,6224,en,en,en
139,6012,Where Them Girls At,pop,David Guetta,2011,143423,"{""Nicki Minaj"",""Flo Rida""}","[Chorus: Flo Rida]\nSo many girls in here, whe...",6280,en,en,en
140,6127,Party Rock Anthem,pop,LMFAO,2011,247834,"{""Lauren Bennett"",Listenbee}",[Intro: Redfoo]\nParty Rock\nYeah\nWoo!\nLet's...,6399,en,en,en


In [27]:
from evaluate import load
METRICS = {"BERT":"bertscore","BLEURT":"bleurt"}

In [38]:
class SongEvaluator:
    """
    Evaluates a song based on given metrics
    """
    def __init__(self, gen_model: str, metrics: Dict[str,str]):
        self.gen_model = gen_model
        self.metrics = metrics
        
    def gen_text(self, ref_text: str, **kwargs: dict):
        self.ref_text = ref_text
        try:
            generator = pipeline('text-generation', model= artifact_dir, device="cuda:0")
            self.prompt = "Generate a song and its title:" + "\n".join(self.ref_text.split("\n")[:4])
            result = generator(self.prompt,**kwargs, max_new_tokens = 1024 - len(tokenizer.encode(self.prompt)), eos_token_id = tokenizer.encode("<END"))
            self.generated = result[0]["generated_text"]
            print(self.generated)
            
        except Exception as e:
            print(f"Exception {e} occurred")
    
    def evaluate(self, **kwargs: dict):
        try:
            predictions = [self.generated.replace("Generate a song and its title:",'')]
            references = [self.ref_text]
            metric = load(self.metrics[kwargs["metric"]], module_type="metric")
            if kwargs["metric"] == "BERT":
                results = metric.compute(predictions=predictions, references=references,model_type="distilbert-base-uncased")
            else:
                results = metric.compute(predictions=predictions, references=references)
            print(results)
        except Exception as e:
            print(f"Exception {e} occurred")

In [39]:
song_evaluator = SongEvaluator(gen_model = artifact_dir, metrics = METRICS)

In [40]:
ref_prompt = df_ref_lyrics.loc[56, "lyrics"]
song_evaluator.gen_text(ref_text = ref_prompt)

Setting `pad_token_id` to `eos_token_id`:27 for open-end generation.


Generate a song and its title:[Part I: What Goes Around...]

[Verse 1: Justin Timberlake]
Hey, girl
Nice as a firework show
You throw rocks at my car and all it compresses into one piece
'Cause it's summer so I won't stay in that cold
And you said, "Look, we should have stayed the whole summer"
Well, you did
(I thought)

[Chorus: Justin Timberlake, 2, Taylor Swift & Lana Del Rey]
This love we had last July
[Verse 2: Taylor Swift]
Hey, can I ask you a question?
Did you ever have one kiss with someone bigger than you?
Or one kiss with someone who could touch you every day?
Hey, baby
You might also like[Chorus: Taylor Swift & Lana Del Rey]
This love we had last July

[Bridge: Justin Timberlake]
And we were so wrapped up
We needed some brand new space
But one time, something really upset
Fuckin' our love life went viral

[Chorus: Taylor Swift, 2]
This love we had last July

[Outro: Taylor Swift]
This love we had last July
Hey, baby
This love we had last July
This love we had last July

TIT

In [41]:
song_evaluator.evaluate(ref_text = ref_prompt,metric = "BLEURT")

{'scores': [-1.4021062850952148]}


In [42]:
song_evaluator.evaluate(ref_text = ref_prompt,metric = "BERT")

{'precision': [0.7956238985061646], 'recall': [0.7748055458068848], 'f1': [0.7850767374038696], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.33.0)'}


### Clean memory for cuda

In [None]:
# release memory
import gc

del Trainer
gc.collect()
torch.cuda.empty_cache()

In [25]:
# check GPU usage
gpu_usage() 

| ID | GPU | MEM  |
-------------------
|  0 |  0% | 100% |
