<a href="https://colab.research.google.com/github/NiclasFenton-Wiegleb/schlager-lyrics-bot/blob/main/Basic_Schlager_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Training Basic Model*

*Prepare Dataset*

In [None]:
# Install the libraries used by this notebook in a single pip call; only transformers carries a version pin.
# NOTE(review): the training section imports BitsAndBytesConfig from transformers - confirm that the
# "4.3" pin actually provides it (the pin may be a typo for a later release).
!pip install "transformers==4.3" "datasets" "peft" "accelerate" "bitsandbytes" "trl" "safetensors" "huggingface_hub"  --upgrade

In [6]:
# Install the same eight packages as the cell above, one pip command per package.
# NOTE(review): this looks redundant with the single-command install - confirm which cell is intended
# to be run, and whether the transformers pin survives the later installs.
!pip install transformers==4.3
!pip install datasets
!pip install peft
!pip install accelerate
!pip install bitsandbytes
!pip install trl
!pip install safetensors
!pip install huggingface_hub

In [7]:
# Report whether CUDA is usable and whether the device supports bf16 (required)

import torch

cuda_available = torch.cuda.is_available()
bf16_supported = torch.cuda.is_bf16_supported()
print(cuda_available, bf16_supported)

True False


In [8]:
#Import dependencies

from random import randrange
import pandas as pd
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

In [None]:
# Load the songs CSV and display the lyrics of the row labelled 10 as a quick look at the data
df_lyrics = pd.read_csv('schlager_songs_v2.csv')

df_lyrics.loc[10, 'lyrics']

In [None]:
#Clean and prepare data to be turned into dataset
#
# A row of df_lyrics is kept only when
#   * verse_1 is a string (pandas reads missing cells as float NaN), and
#   * lyrics is a string that is not the error payload below.
# Rows are selected with a boolean mask, so the result does not depend on the
# frame having a default 0..n-1 index, and the two output lists can never get
# out of step with each other.

# Error payload stored in place of the lyrics for some rows
# (apparently a failed lyrics-API response).
FAILED_LYRICS_PAYLOAD = '{"status"false,"reason""Unexpected error occurred (no quota cost) Please try again later"}'

has_verse = df_lyrics["verse_1"].map(lambda value: isinstance(value, str)).astype(bool)
# Missing lyrics would otherwise be stringified to "nan" and used as a training target.
has_lyrics = df_lyrics["lyrics"].map(lambda value: isinstance(value, str)).astype(bool)
not_failed = df_lyrics["lyrics"] != FAILED_LYRICS_PAYLOAD

valid_rows = df_lyrics.loc[has_verse & has_lyrics & not_failed]

# The first verse is the model input, the full lyrics are the expected continuation.
instruction = valid_rows["verse_1"].tolist()
context = valid_rows["lyrics"].tolist()

df_dataset = pd.DataFrame({"instruction": instruction, "context": context})

df_dataset.head(15)

In [None]:
# Positional split: the first 946 rows form the training set, the remainder is held out
train_df = df_dataset.iloc[:946]
test_df = df_dataset.iloc[946:]

In [None]:
# Convert both pandas splits to datasets and bundle them under "train" / "test"
train_split = Dataset.from_pandas(train_df)
test_split = Dataset.from_pandas(test_df)
datasets_train_test = DatasetDict({"train": train_split, "test": test_split})

datasets_train_test

*Train Model*

In [9]:
# Import dependencies

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, default_data_collator, get_linear_schedule_with_warmup
from trl import SFTTrainer
import accelerate
from torch.utils.data import DataLoader
from tqdm import tqdm
import os
#SFTTrainer supports a native integration with peft, which makes it easy to instruction tune LLMs

from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType



In [10]:
# Authenticate this notebook session with the Hugging Face Hub.
# notebook_login() renders an interactive login widget (see the cell output).
# NOTE(review): presumably required for the push_to_hub calls further down - confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#Parameters
device = "cuda"
model_id = "malteos/bloom-1b5-clp-german"

# Prompt-tuning configuration: 8 virtual tokens, initialised from the German
# instruction text below using the base model's tokenizer.
peft_config = PromptTuningConfig(
            task_type="CAUSAL_LM",
            prompt_tuning_init=PromptTuningInit.TEXT,
            num_virtual_tokens=8,
            prompt_tuning_init_text="Benuzte den gegebenen Input um ein Schlager Lied zu schreiben.",
            tokenizer_name_or_path=model_id
)

dataset_name = "schlager_lyrics"
# File-name-safe identifier built from dataset, model and adapter type ("/" -> "_")
checkpoint_name = f"{dataset_name}_{model_id}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
# These must name columns that exist in the dataset built in the data
# preparation section, whose columns are 'instruction' (first verse) and
# 'context' (full lyrics). The former values "input"/"output" are not columns
# of that dataset and made preprocess_function fail with a KeyError.
text_column = "instruction"
label_column = "context"
max_length = 300 #optimize
lr = 6e-2
num_epochs = 30
batch_size = 3

Preprocess dataset

In [None]:
# Load the tokenizer that belongs to the base model

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Padding needs a pad token; reuse EOS when the tokenizer does not define one
needs_pad_token = tokenizer.pad_token_id is None
if needs_pad_token:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
#Preprocess Dataset

def preprocess_function(examples):
    """Tokenize a batch into fixed-length, left-padded causal-LM features.

    For every row, the prompt (``examples[text_column]``) and the target
    (``examples[label_column]``) are tokenized separately and concatenated,
    with ``tokenizer.pad_token_id`` appended after the target. Label positions
    that cover the prompt or the padding are set to -100 (the value the loss
    is expected to ignore). Each sequence is left-padded up to ``max_length``;
    anything beyond ``max_length`` is cut off from the right.

    Reads the module-level ``tokenizer``, ``text_column``, ``label_column``
    and ``max_length``.

    Returns:
        The tokenizer output for the prompts, updated in place so that
        ``input_ids``, ``attention_mask`` and ``labels`` each hold one
        tensor per row.
    """
    prompts = [f"{x}" for x in examples[text_column]]
    completions = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    labels = tokenizer(completions)

    for row in range(len(prompts)):
        prompt_ids = model_inputs["input_ids"][row]
        completion_ids = labels["input_ids"][row] + [tokenizer.pad_token_id]

        # Full sequence = prompt followed by target; only the target is supervised.
        token_ids = prompt_ids + completion_ids
        label_ids = [-100] * len(prompt_ids) + completion_ids

        # A negative count (sequence longer than max_length) yields no padding.
        n_pad = max_length - len(token_ids)
        padded_ids = [tokenizer.pad_token_id] * n_pad + token_ids
        padded_mask = [0] * n_pad + [1] * len(token_ids)
        padded_labels = [-100] * n_pad + label_ids

        model_inputs["input_ids"][row] = torch.tensor(padded_ids[:max_length])
        model_inputs["attention_mask"][row] = torch.tensor(padded_mask[:max_length])
        labels["input_ids"][row] = torch.tensor(padded_labels[:max_length])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Try the preprocessing on the first ten training rows before mapping the whole dataset
preprocess_function(datasets_train_test["train"][:10])

In [None]:
# Tokenize both splits in batches; the original columns are removed so that
# only the features returned by preprocess_function remain.
raw_columns = datasets_train_test["train"].column_names

processed_datasets = datasets_train_test.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=raw_columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

In [None]:
# Create DataLoaders

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["test"]

# Settings shared by both loaders
loader_kwargs = dict(
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

# Only the training loader shuffles; evaluation keeps the dataset order
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)

Train the Model

In [None]:
#Load base model

model = AutoModelForCausalLM.from_pretrained(model_id)
# Wrap the base model with the adapter described by peft_config
model = get_peft_model(model, peft_config)
# print_trainable_parameters() writes its report itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
model.print_trainable_parameters()

In [None]:
#Set optimizer

# The schedule covers every batch of every epoch
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear schedule over the whole run, with no warm-up steps
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Configure PyTorch's CUDA allocator through its environment variable
# (max_split_size_mb set to 1024), intended to leave more room for loading the model on the GPU
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:1024"})

In [None]:
#Train the model

model = model.to(device)


def _batch_to_device(batch):
    """Return a new dict with every value of ``batch`` moved onto ``device``."""
    return {name: tensor.to(device) for name, tensor in batch.items()}


for epoch in range(num_epochs):
    # ---- training pass: one optimizer and one scheduler step per batch ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = _batch_to_device(batch)
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass: no gradients; greedy (argmax) predictions are decoded ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for batch in tqdm(eval_dataloader):
        batch = _batch_to_device(batch)
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        predicted_ids = torch.argmax(outputs.logits, -1).detach().cpu().numpy()
        eval_preds.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))

    # Mean loss per batch and the corresponding perplexity, for both splits.
    # The variable names below appear verbatim in the printed report.
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

*Merge the model and share it on the Hugging Face Hub*

In [13]:
#Load base model
model_id = "malteos/bloom-1b5-clp-german"
peft_model_id = "niclasfw/bloom-1b5-schlager-bot-004"
# model.push_to_hub(peft_model_id, use_auth_token=True)

# 4-bit quantisation is requested through an explicit BitsAndBytesConfig;
# passing load_in_4bit=True directly to from_pretrained is deprecated.
# NOTE(review): `model` is rebound by the adapter-loading cell below before it
# is used anywhere - confirm that this base-model load is still needed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

(…)-1b5-clp-german/resolve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

(…)erman/resolve/main/tokenizer_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

(…)m-1b5-clp-german/resolve/main/vocab.json:   0%|          | 0.00/850k [00:00<?, ?B/s]

(…)m-1b5-clp-german/resolve/main/merges.txt:   0%|          | 0.00/508k [00:00<?, ?B/s]

(…)5-clp-german/resolve/main/tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

(…)man/resolve/main/special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [15]:
from peft import AutoPeftModelForCausalLM

In [None]:
#Load adapter layer, merge model and push to huggingface hub

# Load the adapter repo peft_model_id through PEFT's auto class
# (presumably this also loads the base model named in the adapter config - confirm).
model = AutoPeftModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=peft_model_id,
    low_cpu_mem_usage=True,
)

# Merge the adapter into the base model and drop the PEFT wrapper.
# NOTE(review): merge_and_unload() is documented for LoRA-style adapters, while the
# training section of this notebook builds a PromptTuningConfig adapter - confirm
# that the adapter stored under peft_model_id can actually be merged.
merged_model = model.merge_and_unload()

# Save the merged model (safetensors serialization) and the tokenizer to ./merged_model
# NOTE(review): `tokenizer` comes from the previous cell and was loaded for model_id;
# verify that it matches the base model of this adapter.
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")

# Push the merged model and the tokenizer to the hub, under the same repo id as the adapter
merged_model.push_to_hub(peft_model_id)
tokenizer.push_to_hub(peft_model_id)

(…)bot-004/resolve/main/adapter_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

(…)eo-hessianai-7b/resolve/main/config.json:   0%|          | 0.00/708 [00:00<?, ?B/s]

(…)esolve/main/pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)i-7b/resolve/main/generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131k [00:00<?, ?B/s]

*Inference*

In [None]:
from peft import PeftModel, PeftConfig

peft_model_id = "niclasfw/bloom-1b5-schlager-bot-004"
device = "cuda"

# The adapter config records which base model the adapter was trained on
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)

# Load the tokenizer of that same base model here, instead of relying on a
# `tokenizer` variable left over from another section (which was loaded for
# model_id and is not guaranteed to match this adapter's base model).
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [None]:
#Test the model

model = model.to(device)
sample = """Ich weiss ein Geheimniss!\n
Dieter Bohlen hat keine Haare, sondern ein Toupee!\n
Wir trinken auf Dieter Bohlens Toupee\n
"""

prompt = f"""### Instruction:
          Benuzte den gegebenen Input um ein Schlager Lied zu schreiben.

### Input:
{sample}

"""

# Tokenize once: the prompt is identical for every generated version.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
prompt_length = inputs["input_ids"].shape[1]

for n in range(5):
    # Sampling (top-p / temperature) makes each of the five versions different.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=300,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
    )

    # Keep only the newly generated tokens. Cutting the decoded text by
    # len(prompt) characters is unreliable, because decoding the prompt tokens
    # is not guaranteed to reproduce the prompt string character for character.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    print(f"Version {n+1}")
    print(f"Prompt:\n{sample}\n")
    print(f"Generated output:\n{completion}")