In [1]:
import asyncio
import json
from collections import defaultdict
from itertools import chain
from typing import List, Optional, Tuple, TypedDict

import aiohttp
from bs4 import BeautifulSoup

In [2]:
import warnings
# Suppresses every warning category for the rest of the session, so
# deprecation notices raised by later cells will not be shown either.
warnings.filterwarnings("ignore")

In [3]:
import sys
# Adds a local llama.cpp checkout to the module search path.
# The absolute path is specific to the machine this notebook was written on.
sys.path.append('/home/jupyter/self_learning/Langchain/code/llama.cpp')

In [4]:
import torch

# Report the CUDA setup. torch.cuda.current_device() raises when no CUDA
# device can be initialised, so it is only queried when CUDA is available.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    print(torch.cuda.current_device())
else:
    print("No CUDA device available")

True
1
0


In [5]:
import os

# Credentials are expected to come from the process environment or from a
# .env file, never from literals in the notebook. Assigning empty strings here
# would clobber keys that are already exported, and load_dotenv() does not
# override variables that already exist, so the blanks would win.
_EXPECTED_ENV_VARS = (
    "OPENAI_API_KEY",
    "ACTIVELOOP_TOKEN",
    "GOOGLE_API_KEY",
    "GOOGLE_CSE_ID",
    "HUGGINGFACEHUB_API_TOKEN",
    "COHERE_API_KEY",
    "WOLFRAM_ALPHA_APPID",
    "SERPAPI_API_KEY",
)

# Purely informational: a key may still be supplied later by a .env file.
_unset_env_vars = [name for name in _EXPECTED_ENV_VARS if not os.environ.get(name)]
if _unset_env_vars:
    print("Environment variables not set yet:", ", ".join(_unset_env_vars))

In [6]:
"""
This file scrapes disney songs + lyrics from "https://www.disneyclips.com/lyrics/"
"""

# Page the scraper starts from; note that the value already ends with a slash.
URL = "https://www.disneyclips.com/lyrics/"

In [None]:
async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect the song links listed on one movie page.

    Args:
        movie_name: Name of the movie; copied into every returned tuple.
        url: Address of the movie page to download.
        session: aiohttp session used to perform the request.

    Returns:
        A list of ``(movie_name, song_name, song_url)`` tuples, one for each
        anchor that has an ``href`` inside the ``<table class="songs">``
        element. The list is empty when the page contains no such table.
    """
    # Local import keeps the function self-contained within the notebook.
    from urllib.parse import urljoin

    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        return []

    names_and_urls = []
    for link in table.find_all("a"):
        href = link.get("href")
        if not href:
            # An anchor without a target would otherwise turn into ".../None".
            continue
        # URL already ends with "/", so urljoin avoids the "lyrics//song" form
        # that plain string concatenation produced.
        names_and_urls.append((movie_name, link.text, urljoin(URL, href)))
    return names_and_urls


In [None]:
async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Download one song page and extract its lyric text.

    Args:
        movie_name: Name of the movie; returned unchanged.
        lyric_name: Name of the song; returned unchanged.
        url: Address of the song page to download.
        session: aiohttp session used to perform the request.

    Returns:
        A ``(movie_name, lyric_name, text)`` tuple, where ``text`` is the
        content of the first ``<p>`` inside ``div#cnt > div.main`` with every
        ``<br>`` replaced by ``". "`` and every ``<span>`` removed.

    Raises:
        AttributeError: If the ``div#cnt`` or ``div.main`` container is missing.
        IndexError: If the container holds no ``<p>`` element.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main_div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    # The first <p> has the lyric.
    lyric_paragraph = main_div.find_all("p")[0]
    for br in lyric_paragraph.find_all("br"):
        # Line breaks become sentence separators in the flattened text.
        br.replace_with(". ")
    for span in lyric_paragraph.find_all("span"):
        span.decompose()

    return (movie_name, lyric_name, lyric_paragraph.text)

In [None]:
async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """Read the movie links from the page at ``URL``.

    Args:
        session: aiohttp session used to perform the request.

    Returns:
        A list of ``(movie_name, movie_url)`` tuples, one for each anchor that
        has an ``href`` inside ``div#cnt > div.main``.

    Raises:
        AttributeError: If the ``div#cnt`` or ``div.main`` container is missing.
    """
    # Local import keeps the function self-contained within the notebook.
    from urllib.parse import urljoin

    async with session.get(URL) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main_div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    movie_names_and_urls = []
    for link in main_div.find_all("a"):
        href = link.get("href")
        if not href:
            # An anchor without a target would otherwise turn into ".../None".
            continue
        # URL already ends with "/", so urljoin avoids a doubled slash.
        movie_names_and_urls.append((link.text, urljoin(URL, href)))
    return movie_names_and_urls


In [None]:
async def scrape_disney_lyrics(
    output_path: str = "/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics.json",
):
    """Scrape every movie and song page and dump the lyrics to a JSON file.

    The JSON maps each movie name to a list of ``{"name": ..., "text": ...}``
    dictionaries, one per song.

    Args:
        output_path: File the JSON is written to. Defaults to the location
            this notebook has always used.
    """
    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)

        # One request per movie page; gather() schedules the coroutines itself.
        songs_per_movie = await asyncio.gather(
            *(
                get_lyrics_names_and_urls_from_movie_url(movie_name, movie_url, session)
                for movie_name, movie_url in movies
            )
        )

        # Flatten with a comprehension rather than itertools.chain: another
        # cell of this notebook rebinds the global name `chain`, which would
        # break this function if it were run afterwards.
        songs = [song for movie_songs in songs_per_movie for song in movie_songs]

        # One request per song page.
        lyrics = await asyncio.gather(
            *(get_lyric_from_lyric_url(*song, session) for song in songs)
        )

    result = defaultdict(list)
    for movie_name, lyric_name, lyric_text in lyrics:
        result[movie_name].append({"name": lyric_name, "text": lyric_text})

    with open(output_path, "w") as f:
        json.dump(result, f)


In [None]:
import nest_asyncio
# nest_asyncio patches asyncio so run_until_complete() can be called while a
# loop is already running — presumably needed because the notebook kernel
# owns a running event loop (TODO confirm for the kernel in use).
nest_asyncio.apply()
loop = asyncio.get_event_loop()
# Runs the full scrape; the JSON file is written as a side effect.
loop.run_until_complete(scrape_disney_lyrics())
# asyncio.run(scrape_disney_lyrics())

In [None]:
from dotenv import load_dotenv

# Loads variables from a .env file. With its default settings load_dotenv()
# does not override variables that are already present in os.environ.
load_dotenv()
import json
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
import chromadb
from langchain.embeddings import CohereEmbeddings
# MODEL_ID = "text-embedding-ada-002"
# Embedding model shared by the cells below (read by create_db and by the
# collection check). Presumably needs COHERE_API_KEY to be set — TODO confirm.
EMBED_MODEL = CohereEmbeddings(model="embed-multilingual-v2.0")
# Name of the Chroma collection the lyrics are stored in.
COLLECTION_NAME = "disney-lyrics"
# DATASET_ID = "disney-lyrics-emotions"

In [None]:
def create_db(chromadb_collection_name: str, json_filepath: str):
    """Embed every lyric of a JSON file into a Chroma collection.

    Args:
        chromadb_collection_name: Name of the Chroma collection to fill.
        json_filepath: JSON file mapping a movie name to a list of song
            dictionaries. Each song needs ``"name"`` and ``"text"`` keys and
            may carry an ``"embed_url"`` key.

    Returns:
        The ``Chroma`` vector store built from the texts, embedded with
        ``EMBED_MODEL``.
    """
    with open(json_filepath, "r") as f:
        data = json.load(f)

    texts = []
    metadatas = []
    for movie, lyrics in data.items():
        for lyric in lyrics:
            texts.append(lyric["text"])
            metadatas.append(
                {
                    "movie": movie,
                    "name": lyric["name"],
                    # Use the real embed URL when the record has one. The
                    # fallback keeps the previous behaviour (lyric text) for
                    # files without that key, such as the raw scrape output.
                    "embed_url": lyric.get("embed_url", lyric["text"]),
                }
            )

    client = chromadb.Client()
    db = Chroma.from_texts(
        texts,
        EMBED_MODEL,
        client=client,
        metadatas=metadatas,
        collection_name=chromadb_collection_name,
    )
    print("There are", db._collection.count(), "in the collection")
    return db

In [None]:
# def load_db(dataset_path: str, *args, **kwargs) -> DeepLake:
#     db = DeepLake(dataset_path, *args, **kwargs)
#     return db

In [None]:
if __name__ == "__main__":
#     dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_ID']}/{DATASET_ID}"
    # Build the collection from the scraped lyrics file; the returned store
    # object is not kept in a variable.
    create_db(COLLECTION_NAME, "/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics.json")

In [None]:
# Look the collection up by name through a newly created client and print
# how many entries it holds.
# NOTE(review): this presumably relies on chromadb.Client() instances sharing
# the same in-process store — confirm for the installed chromadb version.
# NOTE(review): EMBED_MODEL is a LangChain embeddings object; verify that
# chromadb accepts it as `embedding_function`.
client = chromadb.Client()
collection = client.get_collection(name=COLLECTION_NAME, embedding_function= EMBED_MODEL)
print("There are", collection.count(), "in the collection")

In [None]:
# collection.get(include=['documents'])

In [None]:
# """
# This script will keep only the songs that are in the Spotify "Disney Hits" playlist
# """
# from dotenv import load_dotenv

# load_dotenv()
# import json
# from collections import defaultdict

# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials

# name = "Disney hits"

# spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())
# results = spotify.search(q="playlist:" + name, type="playlist", limit=5)
# items = results["playlists"]["items"]

# uri = "spotify:playlist:37i9dQZF1DX8C9xQcOrE6T"
# playlist = spotify.playlist(uri)

# with open("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics.json", "r") as f:
#     data = json.load(f)

# spotify_tracks = {}

# for item in playlist["tracks"]["items"]:
#     track = item["track"]
#     track_name = track["name"].lower().split("-")[0].strip()
#     print(track_name)
#     spotify_tracks[track_name] = {
#         "id": track["id"],
#         "embed_url": f"https://open.spotify.com/embed/track/{track['id']}?utm_source=generator",
#     }

# # here we add only songs that are in the Disney spotify playlist

# data_filtered = defaultdict(list)
# tot = 0
# for movie, lyrics in data.items():
#     for lyric in lyrics:
#         name = lyric["name"].lower()
#         if name in spotify_tracks:
#             data_filtered[movie].append(
#                 {**lyric, **{"embed_url": spotify_tracks[name]["embed_url"]}}
#             )
#             tot += 1
# print(tot)

# with open("data/lyrics_with_spotify_url.json", "w") as f:
#     json.dump(data_filtered, f)

In [None]:
"""
This script takes all the songs we have and use the lyric to create a list of 8 emotions we then use to replace the lyric itself.
This is needed to properly match user's emotions to the songs.
"""
from langchain_community.chat_models import ChatCohere
import json
from collections import defaultdict
from pathlib import Path

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# Prompt text is read from disk; the template takes a single variable, `song`.
prompt = PromptTemplate(
    input_variables=["song"],
    template=Path("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/summary_with_emotions.prompt").read_text(),
)

# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
# llm= ChatCohere(model="command", temperature=0.7)

# Local GGUF model run through llama.cpp.
# NOTE(review): n_gpu_layers=1 presumably offloads a single layer to the GPU —
# confirm this is intended.
llm = LlamaCpp(
    model_path="/home/jupyter/self_learning/Langchain/code/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    n_gpu_layers=1,
    n_batch=512,
    n_ctx=32768,
    f16_kv=True,
#     callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

# Note: this rebinds the global name `chain`, which the first cell of the
# notebook imported from itertools.
chain = LLMChain(llm=llm, prompt=prompt)

with open("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics.json", "r") as f:
    data = json.load(f)

# movie name -> list of {"name", "text", "lyrics"} entries
new_data = defaultdict(list)

for movie, songs in data.items():
    for song in songs:
        print(f"{song['name']}")
        emotions = chain.run(song=song["text"])
        # "text" now holds the model output; the original lyric is kept
        # under "lyrics".
        new_data[movie].append(
            {"name": song["name"], "text": emotions, "lyrics": song["text"]}
        )
        # The output file is rewritten in full after every song, so it always
        # contains everything processed so far.
        with open("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/emotions_with_spotify_url_mistral_7b.json", "w") as f:
            json.dump(new_data, f)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /home/jupyter/self_learning/Langchain/code/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 1

Cruella De Vil



llama_print_timings:        load time =    9266.68 ms
llama_print_timings:      sample time =      44.22 ms /    76 runs   (    0.58 ms per token,  1718.56 tokens per second)
llama_print_timings: prompt eval time =    9266.24 ms /   199 tokens (   46.56 ms per token,    21.48 tokens per second)
llama_print_timings:        eval time =    6676.94 ms /    75 runs   (   89.03 ms per token,    11.23 tokens per second)
llama_print_timings:       total time =   16213.85 ms /   274 tokens
Llama.generate: prefix-match hit


Dalmatian Plantation



llama_print_timings:        load time =    9266.68 ms
llama_print_timings:      sample time =      22.43 ms /    40 runs   (    0.56 ms per token,  1783.17 tokens per second)
llama_print_timings: prompt eval time =    3560.40 ms /    83 tokens (   42.90 ms per token,    23.31 tokens per second)
llama_print_timings:        eval time =    3576.30 ms /    39 runs   (   91.70 ms per token,    10.91 tokens per second)
llama_print_timings:       total time =    7275.85 ms /   122 tokens
Llama.generate: prefix-match hit


Kanine Krunchies



llama_print_timings:        load time =    9266.68 ms
llama_print_timings:      sample time =      25.81 ms /    44 runs   (    0.59 ms per token,  1704.83 tokens per second)
llama_print_timings: prompt eval time =    6410.05 ms /   149 tokens (   43.02 ms per token,    23.24 tokens per second)
llama_print_timings:        eval time =    3833.21 ms /    43 runs   (   89.14 ms per token,    11.22 tokens per second)
llama_print_timings:       total time =   10396.75 ms /   192 tokens
Llama.generate: prefix-match hit


I See Spots


In [None]:
"""
This script takes all the songs we have and create a summary for each lyric
"""

# This cell imports only pprint; Path, PromptTemplate, ChatOpenAI, LLMChain
# and json must already have been imported by an earlier cell.
from pprint import pprint

# Prompt text is read from disk; the template takes a single variable, `song`.
prompt = PromptTemplate(
    input_variables=["song"],
    template=Path("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/summary.prompt").read_text(),
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.0)

# Rebinds the global names `prompt`, `llm` and `chain` for this cell's use.
chain = LLMChain(llm=llm, prompt=prompt)

with open("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics.json", "r") as f:
    data = json.load(f)

# lower-cased song name -> {"summary", "text"}
lyrics_summaries = {}

for movie, lyrics in data.items():
    for lyric in lyrics:
        print(f"Creating summary for {lyric['name']}")
        summary = chain.run(song=lyric["text"])
        # Keyed by song name only, so songs that share a name across movies
        # overwrite each other.
        lyrics_summaries[lyric["name"].lower()] = {
            "summary": summary,
            "text": lyric["text"]}
        # The output file is rewritten in full after every song.
        with open("/home/jupyter/self_learning/Langchain/code/fairytaleDJ/data/lyrics_with_summary.json","w") as f:
            json.dump(lyrics_summaries, f)

pprint(lyrics_summaries)