# Hands-On NLP
## Project: Text Generation and Chatbot for Song Lyrics

<span style="color:magenta">Group Names:</span>

* Petko Petkov
* Manda ANDRIAMAROMANANA
* Ilyes SAIS

Install required libraries:

In [None]:
!pip install torch datasets pandas tqdm transformers trl

Import required libraries:

In [1]:
import torch
from datasets import load_dataset
import pandas as pd
import os
import json
from tqdm import tqdm
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForSeq2Seq, pipeline
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


Set Hugging Face token as environment variable:

In [15]:
# Never hard-code an access token in a notebook: anything committed here is
# leaked and must be revoked. Use the token already exported in the
# environment, otherwise prompt for it without echoing it to the output.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

**1. Generate descriptions (i.e. the instructions we will use to train our model) for the song lyrics in the dataset. Ideally this would be done with a more capable model such as ChatGPT, Claude, or Gemini; the cell below uses `google/gemma-2-2b-it`.**

In [None]:
# Source dataset, destination dataset and local checkpoint location.
INPUT_DATASET_REPO = "vishnupriyavr/spotify-million-song-dataset"
OUTPUT_DATASET_REPO = "petkopetkov/spotify-million-song-dataset-descriptions"
HF_DATASET_SPLIT = "train"
CHECKPOINT_FILE = "songs_descriptions_checkpoint.json"

dataset = load_dataset(INPUT_DATASET_REPO, split=HF_DATASET_SPLIT)

# Resume support: the checkpoint maps a stringified row index to the
# description already generated for that row. Start empty when no
# checkpoint file exists yet.
checkpoint_data = {}
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as checkpoint_fh:
        checkpoint_data = json.load(checkpoint_fh)

completed_indices = set(checkpoint_data)

# Chat-capable text-generation pipeline used to write the descriptions.
pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device="cuda",
)

def generate_description(artist, song, lyrics):
    """Ask the text-generation pipeline to describe a song from its lyrics.

    Args:
        artist: Artist name inserted into the prompt.
        song: Song title inserted into the prompt.
        lyrics: Song lyrics; only the first 2000 characters are sent.

    Returns:
        The stripped content of the last message returned by the pipeline,
        or ``None`` if calling the pipeline or reading its output raised.
    """
    prompt = f"""Describe the following song based on the lyrics in a **comma-separated list** of adjectives and stylistic traits (can be more complex expressions or just simple words that a person would use to describe the song). 
    The description should include **mood, atmosphere, style, lyrical structure, and the artist's name**.
    
    Artist: {artist}
    Song: {song}
    Lyrics: {lyrics[:2000]}
    
    Description:"""

    # Single-turn chat: the whole prompt is one user message.
    chat = [{"role": "user", "content": prompt}]

    try:
        result = pipe(chat, max_new_tokens=100, batch_size=32)
        # The reply is the last message of the returned conversation.
        reply = result[0]["generated_text"][-1]["content"]
        return reply.strip()
    except Exception as err:
        # Best effort: report the failure and let the caller skip this row.
        print(f"Error generating description: {err}")
        return None

def _save_checkpoint(data, path):
    """Write *data* as JSON to *path* via a temp file and an atomic rename.

    Writing in place would leave a truncated, unreadable checkpoint if the
    run is interrupted in the middle of the dump.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f)
    os.replace(tmp_path, path)


# Number of newly generated descriptions between two checkpoint writes.
CHECKPOINT_EVERY = 10

# Not used by this cell; kept so any other cell that reads them still works.
requests_made = 0
start_time = time.time()

if "description" not in dataset.column_names:
    dataset = dataset.add_column("description", [""] * len(dataset))

# Materialise the column as a plain list so entries can be assigned by index.
new_descriptions = list(dataset["description"])

# Restore everything generated by previous runs.
for i, desc in checkpoint_data.items():
    new_descriptions[int(i)] = desc

unsaved = 0
# Index-based loop on purpose: rows already completed are skipped without
# being decoded from the dataset.
for i in tqdm(range(len(dataset)), desc="Generating descriptions"):
    key = str(i)
    if key in completed_indices:
        continue

    row = dataset[i]  # decode the row once instead of once per field
    description = generate_description(row["artist"], row["song"], row["text"])

    if description:
        new_descriptions[i] = description
        checkpoint_data[key] = description
        completed_indices.add(key)
        unsaved += 1

    if unsaved >= CHECKPOINT_EVERY:
        _save_checkpoint(checkpoint_data, CHECKPOINT_FILE)
        unsaved = 0

# Flush the tail: without this, descriptions generated after the last
# periodic save would be lost if the upload below fails.
if unsaved:
    _save_checkpoint(checkpoint_data, CHECKPOINT_FILE)

dataset = dataset.remove_columns("description")
dataset = dataset.add_column("description", new_descriptions)

dataset.push_to_hub(OUTPUT_DATASET_REPO)

print("Dataset successfully updated and pushed to Hugging Face!")


# Dataset exploration

In [20]:
DATASET_NAME = "petkopetkov/spotify-million-song-dataset-descriptions"

# Load the "train" split and wrap it in a pandas DataFrame for exploration.
dataset = pd.DataFrame(load_dataset(DATASET_NAME, split="train"))

In [21]:
# `dataset` is already a DataFrame; evaluating it displays the table.
dataset

Unnamed: 0,artist,song,link,text,description
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...","romantic, nostalgic, upbeat, sentimental, warm..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...","romantic, gentle, soothing, tender, intimate, ..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,"upbeat, optimistic, reflective, romantic, live..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"upbeat, joyful, playful, optimistic, catchy, r..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"uplifting, cheerful, energetic, catchy, playfu..."
...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...,"Uplifting, hopeful, nostalgic, spiritual, regg..."
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...,"protestant, angry, socially conscious, rhyt..."
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...,"Dreamy, yearning, desperate, hopeful, romantic..."
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...,"Melancholic, introspective, ethereal, atmosphe..."


# Finetuning models on the dataset

In [None]:
# Base checkpoints to fine-tune, one training run per entry.
MODEL_NAMES = [
    "Qwen/Qwen2.5-0.5B",
    "HuggingFaceTB/SmolLM2-135M",
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    "HuggingFaceTB/SmolLM2-360M",
    "HuggingFaceTB/SmolLM2-360M-Instruct",
]

# `dataset` was converted to a pandas DataFrame in the exploration cell and
# was loaded with split="train", so `dataset["train"]` raises KeyError and
# has no `train_test_split`. Reload the split as a `datasets.Dataset`
# (served from the local cache) and split that instead.
raw_dataset = load_dataset(DATASET_NAME, split="train")

# 90/10 train/validation split with a fixed seed for reproducibility.
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_data = split_dataset["train"]
val_data = split_dataset["test"]

# Tokenization function applied to dataset batches with `Dataset.map`
def tokenize_function(examples):
    """Build "description -> lyrics" training texts and tokenize them.

    Uses the module-level ``tokenizer`` of the model currently being trained.

    Args:
        examples: Batch mapping with parallel "description" and "text" lists.

    Returns:
        The tokenizer output (padded/truncated to 256 tokens) with an added
        "labels" entry: a copy of ``input_ids`` in which padding positions
        are replaced by -100 so they do not contribute to the loss.
    """
    input_texts = [
        f"Generate song lyrics based on the description: {desc}\nSong lyrics: {text}"
        for desc, text in zip(examples["description"], examples["text"])
    ]

    tokenized_inputs = tokenizer(
        input_texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_attention_mask=True,
    )

    # Copying input_ids verbatim would train the model to predict padding
    # (the pad token may be the EOS token). Mask every position the
    # attention mask marks as padding with the ignore index -100.
    tokenized_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
        )
    ]
    return tokenized_inputs

import gc

# Pick one precision mode and use it for both the weights and the trainer:
#   * bf16 when the GPU supports it,
#   * fp16 mixed precision (fp32 weights) on other GPUs,
#   * plain fp32 on CPU, where fp16 training is not available.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16
model_dtype = torch.bfloat16 if use_bf16 else torch.float32

model = trainer = data_collator = None

for model_name in MODEL_NAMES:
    print(f"=== Training model: {model_name} ===")

    # Drop the previous run's model/optimizer state before loading the next
    # checkpoint so two models are never resident at once. The objects of
    # the last run stay available after the loop.
    model = trainer = data_collator = None
    gc.collect()
    if cuda_available:
        torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Padding to max_length requires a pad token; reuse EOS if none is set.
        tokenizer.pad_token = tokenizer.eos_token

    # Remove every original column so only the tokenizer outputs and labels
    # remain; leftover string columns cannot be collated into tensors.
    tokenized_train = train_data.map(
        tokenize_function,
        batched=True,
        remove_columns=train_data.column_names,
    )
    tokenized_val = val_data.map(
        tokenize_function,
        batched=True,
        remove_columns=val_data.column_names,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=model_dtype,
    )

    # Hub repo ids cannot contain the "/" of the base model's namespace.
    new_model_name = f"{model_name.replace('/', '_')}-song-lyrics-generation"

    training_args = TrainingArguments(
        output_dir=new_model_name,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,  # effective batch of 64 per device
        learning_rate=5e-4,
        num_train_epochs=3,
        logging_steps=10,
        # Checkpoints are written once per epoch; the former save_steps=200
        # had no effect under this strategy and was dropped.
        save_strategy="epoch",
        save_total_limit=2,
        push_to_hub=True,
        fp16=use_fp16,
        bf16=use_bf16,
        report_to="wandb",
        optim="adamw_torch",
        lr_scheduler_type="cosine",
        seed=3407,
        warmup_steps=30,
        eval_strategy="steps",
        logging_dir='./logs',
        eval_steps=100,
        do_eval=True,
        weight_decay=0.1,
        hub_model_id=new_model_name,
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True
    )

    # NOTE(review): newer trl releases take `processing_class` instead of
    # `tokenizer` - confirm against the installed trl version.
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        args=training_args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.push_to_hub()

# Models comparison