## Features Allowed

1. Get the lyrics from a song after giving its name 
2. Return the song list from an album name
3. Get a match with a song for a specific query/request
4. Get all the albums in which a single song is contained
5. Classify mood of a song
6. Get all the albums that have a song similar to the request
7. Show all the songs similar to the one from the query (taking into consideration remixes and Taylor's Version (TV) releases)

# Imports

In [1]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.vectorstores import Chroma
import gradio as gr
import os
import re
import json

##### GENAI
from dotenv import load_dotenv
import google.generativeai as gen

## LLM for the selfquery
from langchain_google_genai import ChatGoogleGenerativeAI

#### separate eng words without spacing
import wordninja
from IPython.display import Markdown, display

#Matching between str
from rapidfuzz import fuzz


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)
  from .autonotebook import tqdm as notebook_tqdm


# Environment / LLM

In [2]:
def get_google_api_key():
    """Return the Google API key found in the environment.

    Calls ``load_dotenv()`` first, then reads the ``GOOGLE_API_KEY``
    environment variable.

    Raises:
        ValueError: if ``GOOGLE_API_KEY`` is unset or empty.
    """
    load_dotenv()
    key = os.getenv("GOOGLE_API_KEY")
    if key:
        return key
    raise ValueError("Missing GOOGLE_API_KEY in .env")

# Configure the google.generativeai client once, when this cell/module runs,
# using the key read from the environment.
api_key = get_google_api_key()
gen.configure(api_key=api_key)


def get_llm(model_name="gemini-1.5-flash"):
    """Build a ``ChatGoogleGenerativeAI`` chat model.

    Args:
        model_name: Gemini model identifier passed as ``model``.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance created with the API key from
        ``get_google_api_key()`` and a temperature of 0.7.
    """
    llm_settings = {
        "model": model_name,
        "google_api_key": get_google_api_key(),
        "temperature": 0.7,
    }
    return ChatGoogleGenerativeAI(**llm_settings)


# Pre-processing Functions

In [3]:
def normalize_lyrics(text):
    """Collapse bracketed section tags to one canonical label each.

    Every ``[...]`` tag (on a single line) whose text contains a known section
    keyword, case-insensitively, is replaced by the canonical label, e.g.
    ``[Verse 1: Taylor Swift]`` -> ``[Verse]``. Tags without a known keyword
    are left untouched.

    Each tag is examined on its own in a single pass, so:
      * a post-chorus tag stays ``[Post-Chorus]`` instead of being rewritten
        to ``[Chorus]`` by a later substitution;
      * lyric text sitting between two tags on the same line is never
        swallowed by a match that starts at the earlier tag;
      * intro tags are recognised (keyword ``intro``).

    Args:
        text: Raw lyrics text.

    Returns:
        The text with recognised section tags normalised.
    """
    # Checked in order; the first keyword contained in the tag wins, so
    # "post-chorus" must come before the more general "chorus".
    section_labels = (
        ("post-chorus", "[Post-Chorus]"),
        ("chorus", "[Chorus]"),
        ("verse", "[Verse]"),
        ("bridge", "[Bridge]"),
        ("intro", "[Intro]"),
        ("interlude", "[Interlude]"),
        ("outro", "[Outro]"),
    )

    def _canonical_tag(match):
        tag = match.group(0)
        lowered = tag.lower()
        for keyword, label in section_labels:
            if keyword in lowered:
                return label
        return tag

    # A tag is one bracket pair with no nested brackets and no line break.
    return re.sub(r"\[[^\[\]\n]*\]", _canonical_tag, text)

def clean_text(text):
    """Replace Unicode space variants with a plain space, then normalise section tags.

    Args:
        text: Raw lyrics text.

    Returns:
        The text after whitespace replacement and ``normalize_lyrics``.
    """
    unicode_spaces = r'[\u2000-\u200A\u202F\u205F\u3000]'
    return normalize_lyrics(re.sub(unicode_spaces, ' ', text))

def remove_symbols(s):
    """Return `s` with every ``-``, ``?``, ``_``, ``,`` and ``.`` character removed."""
    symbol_pattern = re.compile(r"[-?_,.]")
    return symbol_pattern.sub("", s)

def split_by_capitals(s):
    """Return the lower-cased keywords that make up a song or album name.

    Symbols are stripped with ``remove_symbols`` first. If the whole remaining
    string is a single capitalised chunk (one leading capital letter followed
    by no further capitals) it is returned as the only keyword; otherwise the
    string is handed to ``wordninja.split`` and the resulting words are
    lower-cased.

    The single-chunk shortcut applies only when that chunk is the entire
    string, so text preceding the one capital letter (e.g. the "1989 " in
    "1989 Deluxe") is not discarded.

    Args:
        s: Song or album name, with or without spaces.

    Returns:
        list[str]: lower-cased keywords.
    """
    s = remove_symbols(s)
    if re.fullmatch(r'[A-Z][^A-Z]*', s):
        return [s.lower()]
    return [word.lower() for word in wordninja.split(s)]

def space_song_names(s):
    """Return the keywords of `s` (from ``split_by_capitals``) joined by single spaces.

    Intended to turn a stored name such as "Anti-Hero" into "anti hero".
    """
    words = split_by_capitals(s)
    return " ".join(words)



# Load Chroma DB and json album_songs_summary

In [4]:
def get_collection():
    """Open the Chroma collection ``taylor_songs_collection`` stored in ``chroma_db``.

    Returns:
        A ``Chroma`` vector store that embeds with the
        ``sentence-transformers/all-mpnet-base-v2`` HuggingFace model.
    """
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    return Chroma(
        persist_directory="chroma_db",
        collection_name="taylor_songs_collection",
        embedding_function=embedder,
    )

# Module-level vector store; the retrieval functions in this file read it as a global.
vectordb = get_collection()

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [5]:
# Load the album -> song-names summary used by the name-matching functions.
# The encoding is given explicitly so that non-ASCII characters in titles do
# not depend on the platform's default text encoding.
with open('album_songs_summary', 'r', encoding='utf-8') as f:
    album_songs_summary = json.load(f)

# Function Tools for generating content

## Stored Info

In [6]:
def get_database_info():
    """Return the complete album -> song-names summary loaded from ``album_songs_summary``."""
    database_summary = album_songs_summary
    return database_summary

## get_complete_lyrics

In [7]:
def get_complete_lyrics(song_name : str, album_name: str = "", threshold : int = 87) -> dict:
    """Return the stored lyrics of every song whose name fuzzily matches `song_name`.

    Args:
        song_name: Song title as typed by the user; approximate spelling is fine.
        album_name: Optional album title. When given, the matched songs are
            narrowed to those whose album fuzzily matches it. If no song
            survives the narrowing, all song matches are returned, so a
            misspelt album name never hides a valid song.
        threshold: Minimum fuzzy score for a name to count as a match.

    Returns:
        dict mapping each matched stored song name to the list of lyric
        documents stored in the vector database for that song/album pair.
    """
    _, song_album = get_best_match_name(song_name, threshold)

    if album_name and album_name.strip():
        wanted_album = space_song_names(album_name)
        narrowed = {
            song: album
            for song, album in song_album.items()
            if fuzz.partial_ratio(wanted_album, space_song_names(album)) >= threshold
        }
        if narrowed:
            song_album = narrowed

    songs = {}
    for song, album in song_album.items():
        # Both metadata fields must match so that same-named songs on other
        # albums are not mixed in.
        results = vectordb._collection.get(
            where={
                "$and": [
                    {"song": {"$eq": song}},
                    {"album": {"$eq": album}}
                ]
            }
        )
        songs[song] = results['documents']
    return songs
#get_complete_lyrics('all to well 10 mn')

## get_album_songs

In [8]:
def get_album_songs(album_name: str, threshold : int = 90):
    """Return the track lists of the stored albums that fuzzily match `album_name`.

    An album is a candidate when the partial-ratio score between its spaced
    name and the spaced query reaches `threshold`. For a one-word query the
    query must additionally equal one whole keyword of the album name, which
    filters out albums that merely contain the word as a substring.

    Args:
        album_name: Album title as typed by the user.
        threshold: Minimum fuzzy score for an album to be considered.

    Returns:
        dict mapping each matching album's spaced name to the list of its
        spaced song names, in the order the albums appear in the summary.
    """
    # Loop-invariant: normalise the query once rather than once per album.
    requested = space_song_names(album_name)
    single_word_query = len(album_name.split(' ')) == 1

    album_tracks = {}
    for stored_album, stored_songs in album_songs_summary.items():
        if fuzz.partial_ratio(space_song_names(stored_album), requested) < threshold:
            continue
        if single_word_query and requested.lower() not in split_by_capitals(stored_album):
            continue
        album_tracks[space_song_names(stored_album)] = [space_song_names(i) for i in stored_songs]

    return album_tracks

# album = 'midnights 3am'
# tracks = get_album_songs_v2(album, 85)
# print(tracks.keys())


## get_song_match

In [9]:
def get_song_match(query : str, n_results : int = 10):
    """Finds the most relevant lyrics song based on the query.

    Retrieves up to `n_results` lyric chunks from the vector store using the
    retriever's "mmr" search type and groups them by song.

    Args:
        query: Free-text request (a feeling, situation, or lyric fragment).
        n_results: Number of chunks to retrieve (passed as ``k``).

    Returns:
        dict keyed by song name; each value is a dict with ``song_name``,
        ``lyrics`` (the retrieved chunks of that song joined by newlines) and
        ``album``.
    """
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": n_results})
    docs = retriever.invoke(query)

    songs_info = {}
    for doc in docs:
        song_name = doc.metadata["song"]
        entry = songs_info.get(song_name)
        if entry is None:
            songs_info[song_name] = {
                "song_name": song_name,
                "lyrics": doc.page_content,
                "album": doc.metadata["album"],
            }
        else:
            # Another chunk of a song already seen: append its text.
            entry["lyrics"] += "\n" + doc.page_content
    return songs_info

# songs_info = get_song_match(query="I want a song from albumMidnights to cry", n_results=10)
# songs_info

## query_collection

In [10]:
def query_collection(matadata_label, value, return_lyrics=False):
    """Fetch the entries of the collection whose metadata field equals `value`.

    Args:
        matadata_label: Name of the metadata field to filter on.
        value: Value the field must equal.
        return_lyrics: When true, return only the ``documents`` entry of the
            result; otherwise return the whole result.
    """
    where_clause = {matadata_label: value}
    found = vectordb._collection.get(where=where_clause)
    return found['documents'] if return_lyrics else found
    
#query_collection("album", "Lover", True)

## classify_mood

In [11]:
def classify_mood(query: str) -> str:
    """Ask a Gemini model to label the mood expressed by `query`.

    A fresh chat with a mood-classification system prompt is started on every
    call, and the text of the model's reply is returned as-is.

    Args:
        query: User request or song text to classify.

    Returns:
        The raw text of the model's response.
    """
    system_prompt = (
    "You are an intelligent query router for a chatbot. "
    "Your task is to classify user queries or song text based on the sentiment or mood expressed. "
    "Possible categories include: 'sad', 'happy', 'melancholy', 'dance', 'revenge', 'mad', 'weepy', 'depressed', 'charmed', 'joyful', 'celebrate', 'in love', 'angry', 'joy', 'gratitude', 'serenity', 'anxiety', 'resentment', 'despair'.\n\n"
    "Use the following rules:\n"
    "1. Analyze the emotional tone of the text (user query or lyrics).\n"
    "2. Select only **one or more** mood label that best captures the primaries emotional intents.\n"
    "3. If the text expresses multiple emotions, choose the most dominant or consistent ones.\n"
    "4. Use your understanding of natural language and human emotions to infer implicit mood where it's not obvious.\n"
    "5. Respond with the label only — do not include explanations or extra commentary.\n\n"
    "Examples:\n"
    "Input: 'Why did you leave me? Everything reminds me of you.' → Output: sad, angry, melancholy \n"
    "Input: 'I just met someone new and I can’t stop smiling!' → Output: in love, happy, joyful \n"
    "Input: 'This beat makes me want to dance all night!' → Output: dance\n"
    "Input: 'We’re gonna burn it all down, no mercy!' → Output: angry\n"
    "Input: 'I won, and they all doubted me.' → Output: revenge, resentment\n"
    "Input: 'Just got a promotion, let’s celebrate!' → Output: celebrate, dance, joyful \n"
    "Input: 'Walking alone in the rain, thinking of old times.' → Output: melancholy, sad, anxiety, despair \n"
)

    mood_model = gen.GenerativeModel(
        model_name="gemini-2.0-flash-001",
        system_instruction=system_prompt,
        tools=[],
    )
    mood_chat = mood_model.start_chat(history=[], enable_automatic_function_calling=True)
    return mood_chat.send_message(query).text

#classify_mood('I love revenge')

## get_best_match_name

In [12]:
def get_best_match_name(name : str, threshold : int = 87) -> tuple:
    """Find the stored song names that fuzzily match `name`.

    Every stored song is compared with the spaced form of `name` using
    ``fuzz.partial_ratio``; songs scoring at least `threshold` are kept.

    Args:
        name: Song name as typed by the user.
        threshold: Minimum fuzzy score for a stored song to count as a match.

    Returns:
        A 2-tuple ``(matches, song_album)``: ``matches`` is the list of
        matching stored song names and ``song_album`` maps each of them to its
        album. If the same stored name occurs under several albums, the
        mapping keeps the last album encountered.
    """
    song_name = space_song_names(name)
    matches = []
    song_album = {}
    for album, stored_songs in album_songs_summary.items():
        for stored_song in stored_songs:
            if fuzz.partial_ratio(song_name, space_song_names(stored_song)) >= threshold:
                matches.append(stored_song)
                song_album[stored_song] = album
    return matches, song_album
#print(get_best_match_name('style'))
#matches, song_album = get_best_match_name('red')

# Create Chat

## system_instructions

In [13]:
# System prompt for the chat model. Every tool named in it must match a
# function that is actually registered in the tool list; the lyrics guideline
# therefore points to `get_complete_lyrics()`.
system_instructions = """
You are a smart assistant that helps users explore and understand Taylor Swift’s songs and albums using a local database.
You have access to tools that let you retrieve lyrics, match songs to moods, or identify albums. Choose the most relevant tool to satisfy the user query.



You have access to the following functions:

1. get_complete_lyrics(song_name : str, album_name: str = "", threshold : int = 87):
   - Use when the user asks for the lyrics of a specific song, can include the album name. Returns a dictionary with the possible song matches and their lyrics text.

2. get_album_songs(album_name):
   - Use when the user wants to know the list of songs from a specific album.

3. get_song_match(query, n_results=10):
   - Use to recommend songs based on a user's emotion, feeling, or situation.
   - Also use this to interpret the meaning of a query and find matching songs or lyrics.

4. query_collection(metadata_label, value, return_lyrics=False):
   - Use when the user wants to find out what album a song belongs to or to filter songs based on metadata.

5. classify_mood(query):
   - Use to classify the emotional mood of a query and which song can match. Useful when a user is expressing feelings (e.g., “I feel lonely”).

6. get_best_match_name(name: str, threshold: int = 87)
- Use to find the best match between the user requested song or album name with the information saved in the database.

7. get_database_info()
- Use to have the complete list of albums and song names stored available to seach.

You can only use songs and albums by Taylor Swift from the local Chroma vector database. Do not reference external artists or sources.

Guidelines:
- First understand the user’s intent (lyrics, album info, mood, meaning, etc.)
- Respond in a friendly, informed tone.
- Always include the song and album title if known.
- If you need to browse the available information in the database use `get_database_info()`.
- If nothing matches exactly use `get_best_match_name()`, offer the closest relevant results.
- If a query expresses a mood or feeling, use `classify_mood()` and then find songs that match using `get_song_match()`.
- If a query expresses a specific song lyrics requiest use  `get_complete_lyrics()`.

Examples:
- "Give me the lyrics to 'All Too Well'" → get_complete_lyrics by song_name
- "What album is 'Enchanted' from?" → get_best_match_name by song name
- "I’m in love and want something romantic" → classify mood + get_song_match
- "Songs from '1989'?" → get_album_songs

Keep your answers emotionally aware and relevant to Taylor Swift’s discography only.
"""

## Tools

In [14]:
# Python callables handed to the Gemini model as tools.
tools = [get_complete_lyrics, get_album_songs, 
         get_song_match, query_collection, classify_mood,
         get_best_match_name, get_database_info]

## Create Chat (For simple gradio interface)

In [15]:
def create_swiftie_chat(instructions, tool_list, model_name="gemini-2.0-flash-001"):
    """Start a new Gemini chat session with automatic function calling enabled.

    Args:
        instructions: System instruction text for the model.
        tool_list: Tools made available to the model.
        model_name: Gemini model identifier.

    Returns:
        The chat session returned by ``start_chat``, with an empty history.
    """
    swiftie_model = gen.GenerativeModel(
        model_name=model_name,
        system_instruction=instructions,
        tools=tool_list,
    )
    return swiftie_model.start_chat(history=[], enable_automatic_function_calling=True)

# Module-level chat session used by the notebook helper `ask_a_swiftie_`.
swiftie_chat = create_swiftie_chat(system_instructions, tools)
def ask_a_swiftie(user_query, swiftie_chat):
    """Send `user_query` through `swiftie_chat` and return the text of the reply."""
    return swiftie_chat.send_message(user_query).text

def ask_a_swiftie_(query):
    """Ask the module-level chat session and render the reply as Markdown.

    Returns:
        Whatever IPython's ``display`` returns for the rendered Markdown.
    """
    answer = ask_a_swiftie(query, swiftie_chat)
    rendered = Markdown(answer)
    return display(rendered)
      

In [16]:
# Example queries for a manual check. Only the LAST assignment to `query`
# takes effect; the earlier ones are alternatives to swap in by hand.
#query="I need a song to motivates me to work, not to cry like happyness"
query="from which song is it the word champagne sea?" 
query="give all the albumns and versions the song All to Well " 

ask_a_swiftie_(query)

"All Too Well" appears in the following albums and versions:

*   **Red:** As "All Too Well"
*   **Red (Taylor's Version):**
    *   "All Too Well (Taylor's Version)"
    *   "All Too Well (10 Minute Version) (Taylor's Version) (From the Vault)"
*   **All Too Well: The Short Film (EP):** As "All Too Well (10 Minute Version)"
*   **Red (Taylor's Version): Could You Be The One Chapter**
*   **Red (Taylor's Version): From The Vault Chapter**
*   **Red (Taylor's Version): She Wrote A Song About Me Chapter**
*   **Red (Taylor's Version): The Slow Motion Chapter**

## Gradio (simple)

In [17]:
# Simple single-turn interface. The callback must RETURN the reply text for
# the output Textbox; `ask_a_swiftie_` renders through IPython's display
# instead of returning the string, so `ask_a_swiftie` is called directly.
demo = gr.Interface(
    fn=lambda query: ask_a_swiftie(query, swiftie_chat),
    inputs=[
        gr.Textbox(label="Input Query", lines=2, placeholder="Please switie, go ahead and ask me whatever you want, you know we talk a secreate langange we cant speak with anyone else")
    ], 
    outputs=[gr.Textbox(label="Swiftie Answer")
    ],
    title="Your Switfie BFF"
)

## Create Chat Interface (gpt like)

In [18]:
# Chat session for the ChatInterface; created on the first message.
chat_session = None


def ask_a_swiftie_with_history(user_message, history):
    """Answer `user_message` using one module-level chat session.

    The session is created on first use and reused afterwards. The `history`
    argument is accepted but not read by this function.

    Args:
        user_message: Latest message typed by the user.
        history: Conversation history supplied by the caller (unused here).

    Returns:
        The text of the model's reply.
    """
    global chat_session
    if chat_session is None:
        chat_session = create_swiftie_chat(system_instructions, tools)
    return chat_session.send_message(user_message).text

In [None]:
# Chat-style UI backed by `ask_a_swiftie_with_history`.
# NOTE(review): the captured cell output echoes the `chatbot=gr.Chatbot()` line,
# which looks like a warning raised for that argument - confirm against the
# installed Gradio version.
demo = gr.ChatInterface(
    fn=ask_a_swiftie_with_history,
    title="Your Swiftie BFF 💖",
    description="Ask me anything about Taylor Swift songs, albums, moods or lyrics!",
    chatbot=gr.Chatbot(),
    textbox=gr.Textbox(placeholder="Ask me something like 'Give me a sad song like illicit affairs'...you know we talk a secret language we can't speak with anyone else"),
    theme="soft", 
    examples=["What album is 'Enchanted' from?", "I’m in love and want something romantic", "Give me the lyrics to 'All Too Well'"]
)

# share=True also publishes the app on a public gradio.live URL (see the cell
# output), so anyone with the link can send queries.
demo.launch(share=True)

  chatbot=gr.Chatbot(),


* Running on local URL:  http://127.0.0.1:7864
* Running on public URL: https://2f0fd905aa364fdce0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


