In [24]:
import json
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import DeepLake

def create_db(dataset_path: str, json_filepath: str) -> DeepLake:
    """Build a Deep Lake vector store from a JSON file of per-movie entries.

    The JSON file is read as a mapping of movie name to a list of entries,
    each providing a "text" and a "name" key. Every entry's "text" is embedded
    with the "text-embedding-ada-002" OpenAI model and stored alongside
    ``{"movie": ..., "name": ...}`` metadata.

    Args:
        dataset_path: Destination handed to ``DeepLake.from_texts``.
        json_filepath: Path of the JSON file to load.

    Returns:
        The store returned by ``DeepLake.from_texts``.
    """
    with open(json_filepath, "r") as json_file:
        entries_by_movie = json.load(json_file)

    # Flatten the movie -> entries mapping into (movie, entry) pairs so texts
    # and metadatas stay aligned by position.
    flat_entries = [
        (movie, entry)
        for movie, movie_entries in entries_by_movie.items()
        for entry in movie_entries
    ]
    texts = [entry["text"] for _, entry in flat_entries]
    metadatas = [
        {"movie": movie, "name": entry["name"]} for movie, entry in flat_entries
    ]

    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    return DeepLake.from_texts(
        texts,
        embeddings,
        metadatas=metadatas,
        dataset_path=dataset_path,
    )




In [9]:
import asyncio
import json
from collections import defaultdict
from itertools import chain
from typing import List, Optional, Tuple, TypedDict

import aiohttp
from bs4 import BeautifulSoup
from IPython.display import clear_output

"""
This file scrapes Disney songs and their lyrics from "https://www.disneyclips.com/lyrics/".
"""

# Address of the lyrics index page (note the trailing slash). The scraper
# functions in this cell fetch it and build movie/song page URLs from it.
URL = "https://www.disneyclips.com/lyrics/"


async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect the songs linked from one movie page.

    The page at ``url`` is fetched and every ``<a>`` inside the
    ``<table class="songs">`` element is turned into one result item.

    Args:
        movie_name: Name of the movie; copied into every returned tuple.
        url: Address of the movie page to fetch.
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(movie_name, link text, song page URL)`` tuples. The list
        is empty when the page has no ``songs`` table. Anchors without an
        ``href`` attribute are skipped.
    """
    # Function-scope import so this definition does not depend on the cell's
    # import block being extended.
    from urllib.parse import urljoin

    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        return []

    names_and_urls = []
    for link in table.find_all("a"):
        href = link.get("href")
        if not href:
            # An <a> without href would otherwise become ".../None".
            continue
        # urljoin avoids the "lyrics//page" double slash produced by plain
        # string concatenation (URL already ends with "/") and leaves
        # absolute hrefs intact.
        names_and_urls.append((movie_name, link.text, urljoin(URL, href)))
    return names_and_urls


async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Fetch one song page and extract its lyric text.

    The lyric is taken from the first ``<p>`` inside
    ``<div id="cnt"> -> <div class="main">``. Each ``<br>`` in that paragraph
    is replaced by ". " and every ``<span>`` is removed before the text is read.

    Args:
        movie_name: Name of the movie; returned unchanged.
        lyric_name: Name of the song; returned unchanged.
        url: Address of the song page to fetch.
        session: Open aiohttp session used for the request.

    Returns:
        A ``(movie_name, lyric_name, lyric text)`` tuple.

    Raises:
        AttributeError: if either ``<div>`` container is missing from the page.
        IndexError: if the main container holds no ``<p>`` element.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main_div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    # first <p> has the lyric
    lyric_paragraph = main_div.find_all("p")[0]
    for line_break in lyric_paragraph.find_all("br"):
        line_break.replace_with(". ")
    for span in lyric_paragraph.find_all("span"):
        span.decompose()

    return (movie_name, lyric_name, lyric_paragraph.text)


async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """List the movies linked from the lyrics index page.

    The page at ``URL`` is fetched and every ``<a>`` inside
    ``<div id="cnt"> -> <div class="main">`` is turned into one result item.

    Args:
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(link text, movie page URL)`` tuples. Anchors without an
        ``href`` attribute are skipped.

    Raises:
        AttributeError: if either ``<div>`` container is missing from the page.
    """
    # Function-scope import so this definition does not depend on the cell's
    # import block being extended.
    from urllib.parse import urljoin

    async with session.get(URL) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main_div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    movie_names_and_urls = []
    for link in main_div.find_all("a"):
        href = link.get("href")
        if not href:
            # An <a> without href would otherwise become ".../None".
            continue
        # urljoin avoids the "lyrics//page" double slash produced by plain
        # string concatenation (URL already ends with "/").
        movie_names_and_urls.append((link.text, urljoin(URL, href)))
    return movie_names_and_urls


async def scrape_disney_lyrics():
    """Scrape every movie's songs and lyrics and save them to "data/lyrics.json".

    Steps:
      1. Fetch the (movie name, movie URL) pairs from the index page.
      2. Fetch every movie page concurrently to get its songs.
      3. Fetch every song page concurrently to get its lyric text.
      4. Group the results as ``{movie: [{"name": ..., "text": ...}, ...]}``
         and dump them as JSON.

    The output directory is created if missing. Any exception raised by one of
    the page fetches propagates out of ``asyncio.gather`` and aborts the run.
    """
    # Function-scope imports: `chain` is rebound to an LLMChain by a later
    # notebook cell, so the module-level name cannot be relied on when this
    # function is re-run; Path is not imported by this cell.
    import itertools
    from pathlib import Path

    output_path = Path("data/lyrics.json")
    # Create the folder before scraping so a missing "data/" directory does
    # not throw away a finished scrape at write time.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)

        songs_per_movie = await asyncio.gather(
            *(
                get_lyrics_names_and_urls_from_movie_url(*movie, session)
                for movie in movies
            )
        )

        lyrics = await asyncio.gather(
            *(
                get_lyric_from_lyric_url(*song, session)
                for song in itertools.chain.from_iterable(songs_per_movie)
            )
        )

    result = defaultdict(list)
    for movie_name, lyric_name, lyric_text in lyrics:
        result[movie_name].append({"name": lyric_name, "text": lyric_text})

    with open(output_path, "w") as f:
        json.dump(result, f)

    clear_output(wait=True)  # Clear previous output for better readability
    print("Scraping completed!")

# Top-level `await` is not valid in a plain Python script; this cell relies on
# the notebook kernel executing it inside a running event loop.
if __name__ == "__main__":
    await scrape_disney_lyrics()

Scraping completed!


In [23]:
from dotenv import load_dotenv

load_dotenv()

import json
from collections import defaultdict
from pathlib import Path

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# Prompt that asks the model to describe one song lyric as a list of emotions.
prompt = PromptTemplate(
    template="""I am building a retrieval system. Given the following song lyric

                {song}

                You are tasked to produce a list of 8 emotions that I will later use to retrieve the song. 

                Please provide only a list of comma separated  emotions,
                """,
    input_variables=["song"],
)

llm = ChatOpenAI(temperature=0.7)
chain = LLMChain(prompt=prompt, llm=llm)

with open("data/lyrics.json", "r") as lyrics_file:
    data = json.load(lyrics_file)

# movie -> [{"name": song name, "text": emotions returned by the chain}]
new_data = defaultdict(list)

for movie, songs in data.items():
    for song in songs:
        # Print the song name as a simple progress indicator.
        print(f"{song['name']}")
        emotions = chain.run(song=song["text"])
        entry = {"name": song["name"], "text": emotions}
        new_data[movie].append(entry)

with open("data/emotions.json", "w") as emotions_file:
    json.dump(new_data, emotions_file)

Cruella De Vil
Dalmatian Plantation
Kanine Krunchies
I See Spots
Thunderbolt Adventure Hour
Try Again
Friend Like Me
Arabian Nights
A Whole New World
A Whole New World (Single)
One Jump Ahead
One Jump Ahead (Reprise)
Prince Ali
Prince Ali (Reprise)
Arabian Nights (2019)
A Whole New World
A Whole New World (End Title)
Friend Like Me
Friend Like Me (End Title)
One Jump Ahead
One Jump Ahead (Reprise)
One Jump Ahead (Reprise 2)
Prince Ali
Speechless (Part 1)
Speechless (Part 2)
Speechless (Full)
Arabian Nights
Forget About Love
I'm Looking out for Me
Nothing in the World (Quite Like a Friend)
You're Only Second Rate
Are You in or Out?
Father and Son
Out of Thin Air
There's a Party Here in Agrabah
Welcome to the Forty Thieves
Alice in Wonderland
All in the Golden Afternoon
I'm Late
In a World of My Own
Old Father William
Painting the Roses Red
Smoke the Blighter Out
The Caucus Race
The Un-birthday Song
The Walrus and the Carpenter
'Twas Brillig
Very Good Advice
Ev'rybody Wants to Be a Cat
S

In [26]:
# Embed the "text" entries of data/emotions.json and store them in the hub dataset.
db = create_db("hub://gustavobarbosa060/songs", "data/emotions.json")

  warn_deprecated(


Your Deep Lake dataset has been successfully created!


Creating 865 embeddings in 2 batches of size 500:: 100%|██████████| 2/2 [00:45<00:00, 22.51s/it]

Dataset(path='hub://gustavobarbosa060/songs', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
   text       text      (865, 1)      str     None   
 metadata     json      (865, 1)      str     None   
 embedding  embedding  (865, 1536)  float32   None   
    id        text      (865, 1)      str     None   





In [76]:
def load_db(dataset_path: str, *args, **kwargs):
    """Open the Deep Lake vector store located at ``dataset_path``.

    Any extra positional or keyword arguments are forwarded unchanged to the
    ``DeepLake`` constructor.
    """
    return DeepLake(dataset_path, *args, **kwargs)

# Open the dataset by path; no embedding function is passed here, so the
# search call supplies one explicitly.
db = load_db("hub://gustavobarbosa060/songs")


Deep Lake Dataset in hub://gustavobarbosa060/songs already exists, loading from the storage


In [77]:
# Free-text description of the user's mood that gets mapped to emotions.
user_input = "I am lovely"

# Few-shot prompt asking the model for 1 to 4 emotions matching the input.
template = """
We have a simple song retrieval system. It accepts eight emotions. You are tasked to suggest between 1 and 4 emotions to match the users' feelings. Suggest more emotions for longer sentences and just one or two for small ones, trying to condense the central theme of the input.

Examples:

Input: "I had a great day!" 
"Joy"
Input: "I am exhausted today and not feeling well."
"Exhaustion, Discomfort, and Fatigue"
Input: "I am in Love"
"Love"

Please, suggest emotions for input = "{user_input}", and reply ONLY with a list of emotions/feelings/vibes.
"""
prompt = PromptTemplate(template=template, input_variables=["user_input"])

llm = ChatOpenAI(temperature=0.3)
chain = LLMChain(prompt=prompt, llm=llm)

emotions = chain.run(user_input=user_input)


In [78]:
# Show the emotions string returned by the chain.
emotions

'Love, Happiness'

In [83]:
# Search the store with the suggested emotions, asking for 20 scored results.
query_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
matches = db.similarity_search_with_score(
    emotions,
    k=20,
    embedding_function=query_embeddings,
)

In [84]:
# NOTE(review): the recorded output is a single (Document, array of 20 scores)
# pair rather than 20 separate (doc, score) pairs - confirm the shape returned
# by the search before comparing `score` against a threshold.
matches

[(Document(page_content='Love, happiness, joy, appreciation, gratitude, affection, devotion, contentment', metadata={'movie': 'Enchanted', 'name': "That's How You Know"}),
  array([0.9171374 , 0.9165297 , 0.9133286 , 0.9092497 , 0.90879774,
         0.90755975, 0.9072787 , 0.907129  , 0.90705067, 0.9065594 ,
         0.9064075 , 0.9052591 , 0.9045871 , 0.9030976 , 0.90124893,
         0.8998287 , 0.89977264, 0.89960986, 0.8983366 , 0.8980478 ],
        dtype=float32))]

In [67]:
def filter_scores(matches, th: float = 0.8):
    """Keep only the (doc, score) pairs whose score is strictly above ``th``.

    Args:
        matches: Iterable of (doc, score) pairs.
        th: Exclusive lower bound on the score.

    Returns:
        A list of the surviving (doc, score) tuples, in input order.
    """
    kept = []
    for doc, score in matches:
        if score > th:
            kept.append((doc, score))
    return kept

# Keep only matches scoring above 0.8. This rebinds `matches`, so the
# unfiltered search result is no longer available after this cell runs.
matches = filter_scores(matches, 0.8)


In [68]:
def normalize_scores_by_sum(matches):
    """Rescale match scores so that they add up to 1.

    Args:
        matches: Iterable of (doc, score) pairs. One-shot iterables such as
            generators are supported.

    Returns:
        A list of ``(doc, score / total)`` tuples in input order, where
        ``total`` is the sum of all input scores. An empty input yields an
        empty list.

    Raises:
        ZeroDivisionError: if the scores are plain Python numbers that sum to
            zero.
    """
    # Materialize first: the pairs are walked twice (sum, then divide), and a
    # generator would already be exhausted on the second pass.
    pairs = list(matches)

    total = sum(score for _, score in pairs)

    return [(doc, score / total) for doc, score in pairs]

In [52]:
# Show the current value of `matches`.
matches

[(Document(page_content='Joy, Happiness, Cheer, Laughter, Friendliness, Glee, Excitement, Contentment', metadata={'movie': 'The Tigger Movie', 'name': 'How to Be a Tigger'}),
  array([0.8688696 , 0.8670324 , 0.865265  , 0.86473536, 0.8646316 ,
         0.86405665, 0.8640162 , 0.86317927, 0.86227465, 0.8605438 ,
         0.8604622 , 0.8582515 , 0.85813713, 0.8580375 , 0.85719115,
         0.85702854, 0.8568917 , 0.8566492 , 0.85611606, 0.85605484,
         0.855812  , 0.8557356 , 0.8557034 , 0.85538286, 0.85522586,
         0.8551453 , 0.85500234, 0.85499215, 0.8542557 , 0.8542142 ,
         0.854042  , 0.85402805, 0.8538721 , 0.8537415 , 0.85345936,
         0.853087  , 0.8529779 , 0.85290104, 0.8527557 , 0.852356  ,
         0.85233456, 0.8521059 , 0.8520455 , 0.8519777 , 0.85186535,
         0.851691  , 0.85157853, 0.85137445, 0.8513469 , 0.85112107,
         0.8508022 , 0.8506134 , 0.85061085, 0.850269  , 0.85020155,
         0.8499129 , 0.84983844, 0.8494013 , 0.8493786 , 0.8493615