In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

### Prepare the Dataset

In [11]:
!unzip archive.zip

Archive:  archive.zip
  inflating: 01-taylor_swift.csv     
  inflating: 02-fearless_taylors_version.csv  
  inflating: 03-speak_now_deluxe_package.csv  
  inflating: 04-red_deluxe_edition.csv  
  inflating: 05-1989_deluxe.csv      
  inflating: 06-reputation.csv       
  inflating: 07-lover.csv            
  inflating: 08-folklore_deluxe_version.csv  
  inflating: 09-evermore_deluxe_version.csv  


In [16]:
# Importing the dataset
import glob
import pandas as pd
path = '.'
csv_files = glob.glob(path + "/*.csv")
df_list = (pd.read_csv(i) for i in csv_files)
df = pd.concat(df_list, ignore_index=True)
lyrics = '\n'.join(df.loc[:,'lyric'])
print(lyrics[:200])

Knew he was a killer first time that I saw him
Wondered how many girls he had loved and left haunted
But if he's a ghost, then I can be a phantom
Holdin' him for ransom, some
Some boys are tryin' too 


In [17]:
# List of all unique characters
print(' '.join(sorted(set(lyrics))))


   ! " & ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z |   é í ï ó е   ​ – — ‘ ’ ” …  


In [19]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import Dataset

In [20]:

# Cleaning the file by removing/replacing unnecessary characters and removing sections
# that are not lyrics
replace_with_space = ['\u2005', '\u200b', '\u205f', '\xa0', '-']
replace_letters = {'í':'i', 'é':'e', 'ï':'i', 'ó':'o', ';':',', '‘':'\'', '’':'\'', ':':',', 'е':'e'}
remove_list = ['\)', '\(', '–','"','”', '"', '\[.*\]', '.*\|.*', '—']

cleaned_lyrics = lyrics

for old, new in replace_letters.items():
    cleaned_lyrics = cleaned_lyrics.replace(old, new)
for string in remove_list:
    cleaned_lyrics = re.sub(string,'',cleaned_lyrics)
for string in replace_with_space:
    cleaned_lyrics = re.sub(string,' ',cleaned_lyrics)
print(''.join(sorted(set(cleaned_lyrics))))


 !',.0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxyz…


In [21]:
# Setting aside a portion for training the model and a portion for testing the data to prevent
# the model from overfitting to the data it is tested on
split_point = int(len(cleaned_lyrics)*0.95)
train_data = cleaned_lyrics[:split_point]
test_data = cleaned_lyrics[split_point:]
train_data_seg = []
for i in range(0, len(train_data), 500):
        text = train_data[i:min(i+500, len(train_data))]
        train_data_seg.append(text)
train_data_seg = Dataset.from_dict({'text':train_data_seg})
print(len(train_data_seg))

557


In [None]:
# Loading the model with double quantization
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
# Creating tokenizer and defining the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [24]:
# Generating lyrics with the base model. The repetition penalty in the generation config prevents the model from continually repeating the same string.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def generate_lyrics(query, model):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=250, pad_token_id = tokenizer.eos_token_id,repetition_penalty=1.3, eos_token_id = tokenizer.eos_token_id)
    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)
    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])
generate_lyrics(test_data[200:700], model)

INPUT
 urself, talk to the tears
Talk to the man who put you here
And don't wait for the sky to clear
I'll leave my window open
'Cause I'm too tired at night to call your name, oh
Just know I'm right here hoping
That you'll come in with the rain
I've watched you so long, screamed your name
I don't know what else I can say
I'll leave my window open
'Cause I'm too tired at night for all these games
Just know I'm right here hoping
That you'll come in with the rain
I could go back to every laugh
But I don' 

OUTPUT
 t want that anymore
So let me just be a little sadder tonight
Let it sink into my heart and mind
For now, I won't cry out loud
Because there are other things worth feeling


In [25]:
# Setting arguments for low-rank adaptation

model = prepare_model_for_kbit_training(model)

lora_alpha = 32 # The weight matrix is scaled by lora_alpha/lora_rank, so I set lora_alpha = lora_rank to remove scaling
lora_dropout = 0.05
lora_rank = 32

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  # setting to 'none' for only training weight params instead of biases
    task_type="CAUSAL_LM")

peft_model = get_peft_model(model, peft_config)

In [26]:
# Setting training arguments

output_dir = "epsil/tinyllama_songs" # Model repo on your hugging face account where you want to save your model
per_device_train_batch_size = 3
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy="steps"
save_steps = 10
logging_steps = 10
learning_rate = 2e-3
max_grad_norm = 0.3 # Sets limit for gradient clipping
max_steps = 200     # Number of training steps
warmup_ratio = 0.03 # Portion of steps used for learning_rate to warmup from 0
lr_scheduler_type = "cosine" # I chose cosine to avoid learning plateaus

In [27]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    report_to='none'
)

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding


In [None]:
!huggingface-cli login

In [None]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_data_seg,
    peft_config=peft_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
peft_model.config.use_cache = False

In [None]:
trainer.train()

In [34]:
# Generating lyrics with fine-tuned model
generate_lyrics(test_data[200:700], model)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


INPUT
 urself, talk to the tears
Talk to the man who put you here
And don't wait for the sky to clear
I'll leave my window open
'Cause I'm too tired at night to call your name, oh
Just know I'm right here hoping
That you'll come in with the rain
I've watched you so long, screamed your name
I don't know what else I can say
I'll leave my window open
'Cause I'm too tired at night for all these games
Just know I'm right here hoping
That you'll come in with the rain
I could go back to every laugh
But I don' 

OUTPUT
 t think it matters now
You left me a note on the kitchen table
Said that we were both just kids when this started
So please tell me why?
Please tell me how much of this is real
How many times have I said sorry already
Why do people always fall into love like they did us?
It was such an easy thing to forget
We had everything and nothing seemed new or fresh
Now there are days where I wake up feeling empty
Feelings that no one ever felt before
No matter how hard I try
To find some

References :

https://github.com/mlabonne/llm-course/blob/main/Fine_tune_Llama_2_in_Google_Colab.ipynb

https://huggingface.co/TinyLlama

https://www.kaggle.com/code/tommyadams/fine-tuning-tinyllama