<a href="https://colab.research.google.com/github/MMBazel/LO_GenAI_Workshops/blob/main/HelloTaylorSwift_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

HellowWorld based on this tutorial:

*   https://www.kaggle.com/code/tommyadams/fine-tuning-tinyllama
*   Model: https://huggingface.co/huggingartists/taylor-swift
*   Dataset: https://huggingface.co/datasets/huggingartists/taylor-swift
*   https://www.youtube.com/watch?v=OVqe6GTrDFM


*   https://dev.to/_ken0x/tinyllama-llm-a-step-by-step-guide-to-implementing-the-11b-model-on-google-colab-1pjh
*   https://www.youtube.com/watch?v=6XeTk8cZUsM
*   https://github.com/uygarkurt/SFT-TinyLlama/tree/main









# Install necessary packages

In [1]:
!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install -i https://pypi.org/simple/ bitsandbytes -qqq
!pip install einops wandb -Uqqq

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.0/102.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.0 MB/

In [2]:
!pip freeze > requirements.txt

In [3]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import Dataset, DatasetDict

### Login With Credentials

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Data

### Load small data subset

In [29]:
from datasets import load_dataset

dataset = load_dataset("mmbazel/Taylor-Swift-Example")

In [30]:
dataset

DatasetDict({
    train: Dataset({
        features: ['album_name', 'track_title', 'track_n', 'lyric', 'line'],
        num_rows: 8358
    })
})

In [31]:
# Extracting the lyrics and meta-data from the dataset
train_data = dataset["train"]
lyrics = train_data["lyric"]
album_name = train_data["album_name"]
track_title = train_data["track_title"]
track_n = train_data["track_n"]
line = train_data["line"]

In [32]:
# Cleaning the lyrics
replace_with_space = ['\u2005', '\u200b', '\u205f', '\xa0', '-']
replace_letters = {'í':'i', 'é':'e', 'ï':'i', 'ó':'o', ';':',', ''':'\'', ''':'\'', ':':',', 'е':'e'}
remove_list = ['\)', '\(', '–','"','"', '"', '\[.*\]', '.*\|.*', '—']

In [33]:
cleaned_lyrics = []
for lyric in lyrics:
    cleaned_lyric = lyric
    for old, new in replace_letters.items():
        cleaned_lyric = cleaned_lyric.replace(old, new)
    for string in remove_list:
        cleaned_lyric = re.sub(string,'',cleaned_lyric)
    for string in replace_with_space:
        cleaned_lyric = re.sub(string,' ',cleaned_lyric)
    cleaned_lyrics.append(cleaned_lyric)

### Split train-test set

In [34]:
# Splitting the dataset into training, validation, and testing sets
train_percentage = 0.9
validation_percentage = 0.07
test_percentage = 0.03

In [35]:
# Calculate split indices
train_index = int(len(cleaned_lyrics) * train_percentage)
validation_index = int(len(cleaned_lyrics) * (train_percentage + validation_percentage))

In [36]:
# Splitting cleaned_lyrics into training, validation, and test sets
train_lyrics, validation_lyrics, test_lyrics = np.split(cleaned_lyrics, [train_index, validation_index])

In [37]:
# Assuming the similar splitting logic applies to album_name, track_title, track_n, and line
# If these arrays/lists are parallel to cleaned_lyrics, they can be split in the same manner
train_album_name, validation_album_name, test_album_name = np.split(album_name, [train_index, validation_index])
train_track_title, validation_track_title, test_track_title = np.split(track_title, [train_index, validation_index])
train_track_n, validation_track_n, test_track_n = np.split(track_n, [train_index, validation_index])
train_line, validation_line, test_line = np.split(line, [train_index, validation_index])

In [38]:
datasets = DatasetDict(
    {
        'train': Dataset.from_dict({
            'lyric': list(train_lyrics),
            'album_name': list(train_album_name),
            'track_title': list(train_track_title),
            'track_n': list(train_track_n),
            'line': list(train_line)
        }),
        'validation': Dataset.from_dict({
            'lyric': list(validation_lyrics),
            'album_name': list(validation_album_name),
            'track_title': list(validation_track_title),
            'track_n': list(validation_track_n),
            'line': list(validation_line)
        }),
        'test': Dataset.from_dict({
            'lyric': list(test_lyrics),
            'album_name': list(test_album_name),
            'track_title': list(test_track_title),
            'track_n': list(test_track_n),
            'line': list(test_line)
        })
    }
)

In [39]:
datasets

DatasetDict({
    train: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 7522
    })
    validation: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 585
    })
    test: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 251
    })
})

In [40]:
train_dataset = datasets['train']

# Create a new dataset with only the 'lyric' column for training
train_lyrics_dataset = Dataset.from_dict({'text': train_dataset['lyric']})

## Load Models 1

In [41]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch

# Loading the model
model_name = "PY007/TinyLlama-1.1B-step-50K-105b"
model = AutoModelForCausalLM.from_pretrained(model_name)

In [42]:
# Creating tokenizer and defining the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # Move the model to the appropriate device

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [44]:
def generate_lyrics(query, model, tokenizer):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=250, pad_token_id=tokenizer.eos_token_id, repetition_penalty=1.3, eos_token_id=tokenizer.eos_token_id)
    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])

In [45]:
lyric_segment = ' '.join(test_lyrics[200:700])
generate_lyrics(lyric_segment, model, tokenizer)

INPUT
 Look at each other like that When the words of a sister come back in whispers That prove she was not in fact what she seemed Not a twin from your dreams She's a crook who was caught That old familiar body ache The snaps from the same little breaks in your soul You know when it's time to go Twenty years at your job Then the son of the boss gets the spot that was yours Or trying to stay for the kids When keeping it how it is will only break their hearts worse That old familiar body ache The snaps from the same little breaks in your soul You know when it's time to go Sometimes giving up is the strong thing Sometimes to run is the brave thing Sometimes walking out is the one thing That will find you the right thing Sometimes giving up is the strong thing Sometimes to run is the brave thing Sometimes walking out is the one thing That will find you the right thing Fifteen years, fifteen million tears Begging 'til my knees bled I gave it my all, he gave me nothing at all Then wondered 

In [46]:
# Preparing the model for low-rank adaptation (e.g., LoRA)
# Assuming prepare_model_for_kbit_training is compatible with your model architecture
# and does not specifically require a quantized model
prepared_model = prepare_model_for_kbit_training(model)

lora_alpha = 32
lora_dropout = 0.05
lora_rank = 32

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  # setting to 'none' for only training weight params instead of biases
    task_type="CAUSAL_LM")

# Assuming get_peft_model is designed to modify the model in-place or return a modified model
# and is compatible with the unquantized model architecture
peft_model = get_peft_model(prepared_model, peft_config)

In [47]:
# Setting training arguments

output_dir = "mmbazel/tinyllama_tayswifty" # Model repo on your hugging face account where you want to save your model
per_device_train_batch_size = 3
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy="steps"
save_steps = 10
logging_steps = 10
learning_rate = 2e-3
max_grad_norm = 0.3 # Sets limit for gradient clipping
max_steps = 200     # Number of training steps
warmup_ratio = 0.03 # Portion of steps used for learning_rate to warmup from 0
lr_scheduler_type = "cosine" # I chose cosine to avoid learning plateaus

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    report_to='none'
)

In [48]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_lyrics_dataset,
    peft_config=peft_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
peft_model.config.use_cache = False

Map:   0%|          | 0/7522 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [49]:
trainer.train()


Step,Training Loss
10,4.7909
20,3.4515
30,3.4947
40,3.3161
50,3.2292
60,3.181
70,3.0821
80,3.2482
90,3.0432
100,3.2312


TrainOutput(global_step=200, training_loss=3.254313144683838, metrics={'train_runtime': 50.4848, 'train_samples_per_second': 23.77, 'train_steps_per_second': 3.962, 'total_flos': 103816598925312.0, 'train_loss': 3.254313144683838, 'epoch': 0.16})

In [55]:
# Access the 'test' split of the dataset
test_dataset = datasets['test']

# Create a new dataset with only the 'lyric' column for testing
test_lyrics_dataset = Dataset.from_dict({'text': test_dataset['lyric']})

# Extract a portion of the test lyrics dataset
test_data = test_lyrics_dataset[300:400]

# Extract the text data from test_data
test_lyrics = test_data['text']



In [56]:
lyric_segment = ' '.join(test_lyrics[50:200])
generate_lyrics(lyric_segment, model, tokenizer)

INPUT
  

OUTPUT
 And I'm not sure if you know this, but it was the first time we met and that's when everything changed for me.
And then there were times where I thought he looked like a little kid in his pajamas, or maybe even more than that? But now I see him as an adult who has grown up so fast! He is my new best friend, my new best friend, my new best friend! My new best friends are always with us! They say they love you back to your face again! They say they love you back to their faces once again! Their eyes light up just looking at you! Oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh oh
