# Hands-On NLP
## Project: Text Generation and Chatbot for Song Lyrics

<span style="color:magenta">Group Names:</span>

* Petko Petkov
* Manda ANDRIAMAROMANANA
* Ilyes SAIS

**1. Generate descriptions (i.e. the complete instructions we will use to train our model) for the song lyrics in the dataset. For this step we should use another, more capable model such as ChatGPT, Claude, or Gemini.**

In [None]:
import torch
from datasets import load_dataset
import os
import json
from tqdm import tqdm
import time
from transformers import pipeline

# Hub repositories: the raw songs are read from the first one and the
# annotated copy is pushed to the second one at the end of the run.
INPUT_DATASET_REPO = "vishnupriyavr/spotify-million-song-dataset"
OUTPUT_DATASET_REPO = "petkopetkov/spotify-million-song-dataset-descriptions"
HF_DATASET_SPLIT = "train"
# Local JSON file mapping row index (as a string) -> generated description.
CHECKPOINT_FILE = "songs_descriptions_checkpoint.json"

dataset = load_dataset(INPUT_DATASET_REPO, split=HF_DATASET_SPLIT)

# Resume from a previous run when a checkpoint exists; otherwise start empty.
checkpoint_data = {}
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        checkpoint_data = json.load(f)

# Row indices (string keys) that already have a description and can be skipped.
completed_indices = set(checkpoint_data.keys())

# Instruction-tuned Gemma model used as the description generator.
pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device="cuda",
)

def generate_description(artist, song, lyrics):
    """Ask the text-generation pipeline to describe a song from its lyrics.

    Args:
        artist: Artist name interpolated into the prompt.
        song: Song title interpolated into the prompt.
        lyrics: Lyrics text; only ``lyrics[:2000]`` is placed in the prompt.

    Returns:
        The stripped ``content`` of the last message in the first pipeline
        output, or ``None`` if the pipeline call or the output lookup raised
        (the error is printed, not propagated).
    """
    prompt = f"""Describe the following song based on the lyrics in a **comma-separated list** of adjectives and stylistic traits (can be more complex expressions or just simple words that a person would use to describe the song). 
    The description should include **mood, atmosphere, style, lyrical structure, and the artist's name**.
    
    Artist: {artist}
    Song: {song}
    Lyrics: {lyrics[:2000]}
    
    Description:"""

    # Single-turn chat: the whole prompt is sent as one user message.
    chat = [{"role": "user", "content": prompt}]

    try:
        results = pipe(chat, max_new_tokens=100, batch_size=32)
        # The reply is the last message of the returned conversation.
        reply = results[0]["generated_text"][-1]["content"]
        return reply.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller skip this row.
        print(f"Error generating description: {e}")
        return None

# NOTE(review): `requests_made` and `start_time` are not read anywhere in this
# cell; kept in case another cell relies on them.
requests_made = 0
start_time = time.time()


def _save_checkpoint(data, path=CHECKPOINT_FILE):
    """Persist the checkpoint dict as JSON using write-then-rename.

    Writing to a temporary file first means an interruption during the write
    cannot leave a truncated checkpoint that would break `json.load` on the
    next run.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f)
    os.replace(tmp_path, path)


if "description" not in dataset.column_names:
    dataset = dataset.add_column("description", [""] * len(dataset))

# Copy the column into a plain list so that item assignment below is safe
# regardless of what type `dataset[...]` returns for a column
# (assumption: newer `datasets` releases may return a lazy column object).
new_descriptions = list(dataset["description"])

# Restore descriptions produced by a previous run.
for idx, desc in checkpoint_data.items():
    new_descriptions[int(idx)] = desc

for i in tqdm(range(len(dataset)), desc="Generating descriptions"):
    if str(i) in completed_indices:
        continue

    # Fetch the row once instead of decoding it separately for each field.
    row = dataset[i]
    artist, song, lyrics = row["artist"], row["song"], row["text"]

    description = generate_description(artist, song, lyrics)

    # A falsy result (None on error, or an empty string) leaves the row
    # unmarked so that it is retried on the next run.
    if description:
        new_descriptions[i] = description
        checkpoint_data[str(i)] = description
        completed_indices.add(str(i))

    # Periodic save to bound the amount of work lost on interruption.
    if i % 10 == 0:
        _save_checkpoint(checkpoint_data)

# Final save: without it, descriptions generated after the last index that is
# a multiple of 10 would never reach the checkpoint file.
_save_checkpoint(checkpoint_data)

# Replace the placeholder column with the generated descriptions.
dataset = dataset.remove_columns("description")
dataset = dataset.add_column("description", new_descriptions)

dataset.push_to_hub(OUTPUT_DATASET_REPO)

print("Dataset successfully updated and pushed to Hugging Face Hub!")


## Load Data

In [1]:
import pandas as pd

# Read the published parquet shard straight from the Hugging Face Hub.
# Authenticate first (e.g. `huggingface-cli login`) to access this dataset.
df = pd.read_parquet(
    "hf://datasets/petkopetkov/spotify-million-song-dataset-descriptions"
    "/data/train-00000-of-00001.parquet"
)

In [2]:
# Preview the first rows to check the columns (artist, song, link, text, description).
df.head()

Unnamed: 0,artist,song,link,text,description
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...","romantic, nostalgic, upbeat, sentimental, warm..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...","romantic, gentle, soothing, tender, intimate, ..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,"upbeat, optimistic, reflective, romantic, live..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"upbeat, joyful, playful, optimistic, catchy, r..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"uplifting, cheerful, energetic, catchy, playfu..."


# Fine-tuning a first model: __SmolLM__

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer
from peft import LoraConfig

# Base checkpoint to fine-tune (a small SmolLM2 variant) and the training corpus.
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
DATASET_NAME = "petkopetkov/spotify-million-song-dataset-descriptions"

# Download the dataset and keep its training split.
dataset = load_dataset(DATASET_NAME)
train_data = dataset["train"]

# Tokenizer: the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in bfloat16 when CUDA is available, float32 otherwise.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.bfloat16,
)

# LoRA adapter settings: rank-8 adapters on the query and value projections,
# so only a small set of extra weights is trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of (description, lyrics) pairs for causal-LM fine-tuning.

    Each example is rendered as ``"Prompt: <description>\\nOutput: <lyrics>"``
    followed by the EOS token, then padded or truncated to 256 tokens.

    Args:
        examples: A batch mapping with ``"description"`` and ``"text"`` lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` where ``attention_mask`` is 1 and ``-100`` elsewhere.
    """
    # Append EOS explicitly so that examples shorter than the limit still end
    # with one real (attended) end-of-sequence token. Padding reuses the EOS id,
    # and all padding positions are masked out of the loss below.
    input_texts = [
        f"Prompt: {desc}\nOutput: {text}{tokenizer.eos_token}"
        for desc, text in zip(examples["description"], examples["text"])
    ]
    tokenized_inputs = tokenizer(input_texts, padding="max_length", truncation=True, max_length=256)
    # Labels are NOT shifted here: the causal-LM head shifts them internally.
    # Padding positions get -100 so the loss ignores them. A plain copy of
    # input_ids would train the model on every padding token.
    tokenized_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs

# Tokenize the training split in batches; the raw description and lyrics
# columns are dropped once they have been turned into token ids.
tokenized_datasets = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=["description", "text"],
)

# Collator that pads each batch (inputs and labels) at collation time.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Training hyper-parameters: 8 sequences per device with 4 accumulation steps,
# a checkpoint at the end of every epoch (only the 2 most recent are kept),
# and the result pushed to the Hub.
# NOTE(review): the model is loaded in bfloat16 on CUDA while this enables
# fp16 mixed precision. Confirm this combination is intended.
training_args = TrainingArguments(
    output_dir="./smollm-spotify-ft",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    push_to_hub=True,
)

# Supervised fine-tuning trainer; LoRA adapters are attached via `peft_config`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    peft_config=lora_config,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

# Publish the trained model, then keep a local copy of the tokenizer next to
# the checkpoints.
trainer.push_to_hub()
tokenizer.save_pretrained("./smollm-spotify-ft")