In [None]:
!pip uninstall -y datasets
!pip uninstall -y pandas

In [None]:
!pip install GPUtil
!pip install wandb
!pip install pandas==1.5.3
!pip install transformers
!pip install datasets==2.11
!pip install optuna/sigopt/wandb/ray[tune] 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import torch
import wandb
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from GPUtil import showUtilization as gpu_usage
from datasets import load_dataset, concatenate_datasets
from transformers import (AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
                          DataCollatorForLanguageModeling, pipeline, EarlyStoppingCallback)

### Load and prepare data

In [None]:
# Read the processed CSV into a pandas DataFrame.
# NOTE(review): `df` is not referenced again in the visible cells; the
# Hugging Face dataset `ds` is what the rest of the notebook uses.
df = pd.read_csv("/kaggle/input/processed-taylor-tilted/processed_df_titled.csv")

In [None]:
# Load the processed lyrics CSV as a Hugging Face dataset. Requesting
# split="train" yields a single dataset object for the whole file.
ds = load_dataset(
    "csv",
    data_files="/kaggle/input/processed-taylor-tilted/processed_df_titled.csv",
    split="train",
)

In [None]:
# Show the loaded dataset's repr as the cell output.
ds

In [None]:
# Report the number of rows in `ds`. This is the whole CSV (loaded as the
# "train" split); the train/test split is made later in the notebook.
print(f"Train dataset size: {len(ds)}")

In [None]:
# Print the `titled_lyrics` text of the first row as a sample.
print(f"TRAINING SAMPLE: \n{ds[0]['titled_lyrics']}")

In [None]:
# Checkpoint name used for the tokenizer, for model_init(), and for the
# baseline generation pipeline.
MODEL="gpt2"

# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Sanity check: tokenize the first row's lyrics and show the encoding.
tokenizer(ds[0]["titled_lyrics"])

In [None]:
# tokenize dataset

# Reuse the EOS token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Only truncate here. Padding is left to the data collator, which pads each
# training batch to its own longest sequence. `padding=True` inside a batched
# map padded every row to the longest sequence of its whole map batch and
# stored all of those pad tokens in the dataset.
tokenized_dataset = ds.map(
    lambda x: tokenizer(x["titled_lyrics"], truncation=True),
    batched=True,
    remove_columns=["Tracks", "Album_ID", "Album", "Album_Path"],
)

# Causal-LM collator (mlm=False): pads each batch and builds the labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Return only input_ids and attention_mask, as torch tensors, when rows are accessed.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
# Show the resulting format configuration as the cell output.
tokenized_dataset.format

In [None]:
# Split the dataset 90% train / 10% test. The fixed seed makes the shuffle,
# and therefore the evaluation set, identical across kernel restarts.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Show the train/test splits produced by the previous cell.
tokenized_dataset

In [None]:
# Display the tokenized training inputs.
# NOTE(review): this renders the entire input_ids column; slicing a few rows
# would keep the cell output short.
tokenized_dataset["train"]["input_ids"]

### Model fine-tuning

In [None]:
# Factory handed to the Trainer via `model_init=`.
def model_init():
    """Build a fresh causal language model from the `MODEL` checkpoint.

    Returns:
        The model returned by `AutoModelForCausalLM.from_pretrained(MODEL)`.
    """
    pretrained_lm = AutoModelForCausalLM.from_pretrained(MODEL)
    return pretrained_lm

In [None]:
# Configure Weights & Biases logging.
# Authenticate first, then choose the project through an environment variable.
wandb.login()
os.environ["WANDB_PROJECT"] = "song-generator" # log to your project 
# Set WANDB_LOG_MODEL for the kernel process via the %env magic.
# NOTE(review): presumably this makes the Trainer's wandb integration upload
# the trained model as an artifact — confirm against the installed version.
%env WANDB_LOG_MODEL=true

In [None]:
# check GPU usage
# Calls GPUtil's showUtilization (imported as gpu_usage) before training starts.
gpu_usage()  

In [None]:
# Release cached, unused GPU memory held by PyTorch before training.
torch.cuda.empty_cache()

In [None]:
# define training arguments
# src: https://huggingface.co/docs/transformers/v4.33.3/en/main_classes/trainer#transformers.TrainingArguments
#
# Evaluation and checkpointing both run every 250 steps, so every evaluated
# step has a matching checkpoint that `load_best_model_at_end` can restore.
# With the default save_steps of 500, only every other evaluation was backed
# by a checkpoint. `save_total_limit` bounds disk usage in the 20GB
# /kaggle/working directory; the best checkpoint is still retained.
# NOTE(review): `evaluation_strategy` is the argument name from the v4.33 docs
# linked above; transformers is installed unpinned, and newer releases call it
# `eval_strategy` — confirm against the installed version.

training_args = TrainingArguments(
    output_dir="/kaggle/working/finetuned_gpt2",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=250,
    save_steps=250,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    optim="adamw_torch",
    report_to="wandb",
    fp16=True,
    # Lower eval_loss is better; the best checkpoint is reloaded after training.
    metric_for_best_model="eval_loss",
    run_name="baseline_gpt2_finetune",
    greater_is_better=False,
    load_best_model_at_end=True,
)

In [None]:
# Build the Trainer for the GPT-2 fine-tuning run.
# src: https://huggingface.co/docs/transformers/main_classes/trainer
#
# The model comes from model_init(). Training stops early once the monitored
# metric has failed to improve for 2 consecutive evaluations.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the held-out split and report perplexity, i.e. exp(eval loss).

eval_results = trainer.evaluate()
print("Perplexity: {:.2f}".format(math.exp(eval_results["eval_loss"])))

In [None]:
# save model
# No path is passed, so the Trainer writes to the output_dir set in training_args.
trainer.save_model()

In [None]:
# Save the tokenizer as well, into the same folder as the model, so the
# folder can later be loaded by path.
tokenizer.save_pretrained("/kaggle/working/finetuned_gpt2")

In [None]:
# Archive the fine-tuned model folder into baseline_gpt2.zip in the current
# working directory.
# NOTE(review): -r recurses over everything under the folder, so any
# checkpoint sub-folders written during training are included — confirm that
# is intended.
!zip -r baseline_gpt2.zip /kaggle/working/finetuned_gpt2

In [None]:
# Prompt for the generation test: an instruction prefix followed by three
# lyric lines from Olivia Rodrigo (similar artist).
test_prompt = "Generate a song and its title:Well, good for you, I guess you moved on really easily\nYou found a new girl and it only took a couple weeks\nRemember when you said that you wanted to give me the world?"

In [None]:
# Baseline: text-generation pipeline on the original (not fine-tuned) MODEL
# checkpoint, placed on GPU 0.
generator = pipeline('text-generation', model= MODEL, device="cuda:0")

# Generate a continuation of the test prompt (up to 400 new tokens) and print it.
# NOTE(review): the fine-tuned run uses max_new_tokens = 700, so the two
# outputs are not the same length budget.
result = generator(test_prompt, top_k=5, max_new_tokens = 400)

print(result[0]["generated_text"])

In [None]:
# Fine-tuned model: text-generation pipeline loaded from the saved output
# folder, placed on GPU 0. This rebinds `generator` and `result`.
generator = pipeline('text-generation', model="/kaggle/working/finetuned_gpt2", device="cuda:0")

# Generate a continuation of the same test prompt (up to 700 new tokens) and print it.
result = generator(test_prompt,top_k=5,max_new_tokens = 700)

print(result[0]["generated_text"])