<a href="https://colab.research.google.com/github/MMBazel/LO_GenAI_Workshops/blob/main/HelloTaylorSwift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

HellowWorld based on this tutorial:

*   https://www.kaggle.com/code/tommyadams/fine-tuning-tinyllama
*   Model: https://huggingface.co/huggingartists/taylor-swift
*   Dataset: https://huggingface.co/datasets/huggingartists/taylor-swift
*   https://www.youtube.com/watch?v=OVqe6GTrDFM






# Install necessary packages

In [1]:
!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install -i https://pypi.org/simple/ bitsandbytes -qqq
!pip install einops wandb -Uqqq

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.0/102.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.6 MB/s

In [2]:
!pip freeze > requirements.txt

In [3]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import Dataset, DatasetDict

### Login With Credentials

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Data

### Load small data subset

In [5]:
from datasets import load_dataset

dataset = load_dataset("mmbazel/Taylor-Swift-Example")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.3k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['album_name', 'track_title', 'track_n', 'lyric', 'line'],
        num_rows: 8358
    })
})

In [7]:
# Extracting the lyrics and meta-data from the dataset
train_data = dataset["train"]
lyrics = train_data["lyric"]
album_name = train_data["album_name"]
track_title = train_data["track_title"]
track_n = train_data["track_n"]
line = train_data["line"]

In [8]:
# Cleaning the lyrics
replace_with_space = ['\u2005', '\u200b', '\u205f', '\xa0', '-']
replace_letters = {'í':'i', 'é':'e', 'ï':'i', 'ó':'o', ';':',', ''':'\'', ''':'\'', ':':',', 'е':'e'}
remove_list = ['\)', '\(', '–','"','"', '"', '\[.*\]', '.*\|.*', '—']

In [9]:
cleaned_lyrics = []
for lyric in lyrics:
    cleaned_lyric = lyric
    for old, new in replace_letters.items():
        cleaned_lyric = cleaned_lyric.replace(old, new)
    for string in remove_list:
        cleaned_lyric = re.sub(string,'',cleaned_lyric)
    for string in replace_with_space:
        cleaned_lyric = re.sub(string,' ',cleaned_lyric)
    cleaned_lyrics.append(cleaned_lyric)

### Split train-test set

In [10]:
# Splitting the dataset into training, validation, and testing sets
train_percentage = 0.9
validation_percentage = 0.07
test_percentage = 0.03

In [11]:
# Calculate split indices
train_index = int(len(cleaned_lyrics) * train_percentage)
validation_index = int(len(cleaned_lyrics) * (train_percentage + validation_percentage))

In [12]:
# Splitting cleaned_lyrics into training, validation, and test sets
train_lyrics, validation_lyrics, test_lyrics = np.split(cleaned_lyrics, [train_index, validation_index])

In [13]:
# Assuming the similar splitting logic applies to album_name, track_title, track_n, and line
# If these arrays/lists are parallel to cleaned_lyrics, they can be split in the same manner
train_album_name, validation_album_name, test_album_name = np.split(album_name, [train_index, validation_index])
train_track_title, validation_track_title, test_track_title = np.split(track_title, [train_index, validation_index])
train_track_n, validation_track_n, test_track_n = np.split(track_n, [train_index, validation_index])
train_line, validation_line, test_line = np.split(line, [train_index, validation_index])

In [54]:
datasets = DatasetDict(
    {
        'train': Dataset.from_dict({
            'lyric': list(train_lyrics),
            'album_name': list(train_album_name),
            'track_title': list(train_track_title),
            'track_n': list(train_track_n),
            'line': list(train_line)
        }),
        'validation': Dataset.from_dict({
            'lyric': list(validation_lyrics),
            'album_name': list(validation_album_name),
            'track_title': list(validation_track_title),
            'track_n': list(validation_track_n),
            'line': list(validation_line)
        }),
        'test': Dataset.from_dict({
            'lyric': list(test_lyrics),
            'album_name': list(test_album_name),
            'track_title': list(test_track_title),
            'track_n': list(test_track_n),
            'line': list(test_line)
        })
    }
)

In [55]:
datasets

DatasetDict({
    train: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 7522
    })
    validation: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 585
    })
    test: Dataset({
        features: ['lyric', 'album_name', 'track_title', 'track_n', 'line'],
        num_rows: 251
    })
})

In [59]:
train_dataset = datasets['train']

# Create a new dataset with only the 'lyric' column for training
train_lyrics_dataset = Dataset.from_dict({'text': train_dataset['lyric']})

## Load Models 1

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch

# Loading the model
model_name = "PY007/TinyLlama-1.1B-step-50K-105b"
model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [18]:
# Creating tokenizer and defining the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # Move the model to the appropriate device

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [20]:
def generate_lyrics(query, model):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=250, pad_token_id = tokenizer.eos_token_id,repetition_penalty=1.3, eos_token_id = tokenizer.eos_token_id)
    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)
    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])


In [21]:
# Assuming test_lyrics is defined elsewhere and is a list of words
lyric_segment = ' '.join(test_lyrics[200:700])

generate_lyrics(lyric_segment, model)

INPUT
 Look at each other like that When the words of a sister come back in whispers That prove she was not in fact what she seemed Not a twin from your dreams She's a crook who was caught That old familiar body ache The snaps from the same little breaks in your soul You know when it's time to go Twenty years at your job Then the son of the boss gets the spot that was yours Or trying to stay for the kids When keeping it how it is will only break their hearts worse That old familiar body ache The snaps from the same little breaks in your soul You know when it's time to go Sometimes giving up is the strong thing Sometimes to run is the brave thing Sometimes walking out is the one thing That will find you the right thing Sometimes giving up is the strong thing Sometimes to run is the brave thing Sometimes walking out is the one thing That will find you the right thing Fifteen years, fifteen million tears Begging 'til my knees bled I gave it my all, he gave me nothing at all Then wondered 

In [22]:
# Preparing the model for low-rank adaptation (e.g., LoRA)
# Assuming prepare_model_for_kbit_training is compatible with your model architecture
# and does not specifically require a quantized model
prepared_model = prepare_model_for_kbit_training(model)

lora_alpha = 32
lora_dropout = 0.05
lora_rank = 32

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  # setting to 'none' for only training weight params instead of biases
    task_type="CAUSAL_LM")

# Assuming get_peft_model is designed to modify the model in-place or return a modified model
# and is compatible with the unquantized model architecture
peft_model = get_peft_model(prepared_model, peft_config)

In [23]:
# Setting training arguments

output_dir = "mmbazel/tinyllama_tayswifty" # Model repo on your hugging face account where you want to save your model
per_device_train_batch_size = 3
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy="steps"
save_steps = 10
logging_steps = 10
learning_rate = 2e-3
max_grad_norm = 0.3 # Sets limit for gradient clipping
max_steps = 200     # Number of training steps
warmup_ratio = 0.03 # Portion of steps used for learning_rate to warmup from 0
lr_scheduler_type = "cosine" # I chose cosine to avoid learning plateaus

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    report_to='none'
)

In [60]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_lyrics_dataset,
    peft_config=peft_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
peft_model.config.use_cache = False

Map:   0%|          | 0/7522 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [61]:
trainer.train()


Step,Training Loss
10,4.824
20,3.3502
30,3.6264
40,3.8171
50,3.5637
60,3.17
70,3.1159
80,3.2664
90,3.0475
100,3.1706


TrainOutput(global_step=200, training_loss=3.298898162841797, metrics={'train_runtime': 48.8561, 'train_samples_per_second': 24.562, 'train_steps_per_second': 4.094, 'total_flos': 103816598925312.0, 'train_loss': 3.298898162841797, 'epoch': 0.16})

In [66]:
# Access the 'test' split of the dataset
test_dataset = datasets['test']

# Create a new dataset with only the 'lyric' column for testing
test_lyrics_dataset = Dataset.from_dict({'text': test_dataset['lyric']})

# Extract a portion of the test lyrics dataset
test_data = test_lyrics_dataset[200:700]

# Extract the text data from test_data
test_lyrics = test_data['text']

# Call the generate_lyrics function with the test lyrics and model
generate_lyrics(test_lyrics, model)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).