In [20]:
import os
from llama_index.core import VectorStoreIndex, Document, ServiceContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.storage import StorageContext
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
import chromadb
import uuid

In [2]:
# Abstract of the DeepSinger paper; this string is the only text wrapped into a
# Document and indexed by this notebook.
# NOTE(review): the text looks like a raw PDF copy-paste -- it still contains
# what appear to be footnote markers ("paper1", "naturalness2") and end-of-line
# hyphenation ("us-\ning", "align-\nment"). These are embedded verbatim, so
# consider cleaning the text before indexing if retrieval quality matters.
tex = """
In this paper1, we develop DeepSinger, a multi-lingual multi-singer
singing voice synthesis (SVS) system, which is built from scratch us-
ing singing training data mined from music websites. The pipeline
of DeepSinger consists of several steps, including data crawling,
singing and accompaniment separation, lyrics-to-singing align-
ment, data filtration, and singing modeling. Specifically, we design
a lyrics-to-singing alignment model to automatically extract the
duration of each phoneme in lyrics starting from coarse-grained
sentence level to fine-grained phoneme level, and further design a
multi-lingual multi-singer singing model based on a feed-forward
Transformer to directly generate linear-spectrograms from lyrics,
and synthesize voices using Griffin-Lim. DeepSinger has several
advantages over previous SVS systems: 1) to the best of our knowl-
edge, it is the first SVS system that directly mines training data
from music websites, 2) the lyrics-to-singing alignment model fur-
ther avoids any human efforts for alignment labeling and greatly
reduces labeling cost, 3) the singing model based on a feed-forward
Transformer is simple and efficient, by removing the complicated
acoustic feature modeling in parametric synthesis and leveraging
a reference encoder to capture the timbre of a singer from noisy
singing data, and 4) it can synthesize singing voices in multiple
languages and multiple singers. We evaluate DeepSinger on our
mined singing dataset that consists of about 92 hours data from 89
singers on three languages (Chinese, Cantonese and English). The
results demonstrate that with the singing data purely mined from
the Web, DeepSinger can synthesize high-quality singing voices in
terms of both pitch accuracy and voice naturalness2.
"""

# Wrap the abstract in a single llama_index Document so it can be indexed.
# The keyword must be spelled `metadata`: the previous `metdata=` is not a
# Document field, so the paper title never ended up in the document's metadata.
docs = [
    Document(
        text=tex,
        metadata={'Title': 'DeepSinger: Singing Voice Synthesis with Data Mined From the Web'},
    )
]

In [5]:
# Embedding model used to vectorise the document and the queries. The model is
# requested through Ollama by name.
EMBED_MODEL_NAME = 'nomic-embed-text'
embeddings = OllamaEmbedding(
    model_name=EMBED_MODEL_NAME,
)

In [7]:
# Open (or create) an on-disk Chroma database at ../POC.
chroma_client = chromadb.PersistentClient(path='../POC')

# ChromaVectorStore needs a Chroma *collection* object. Passing the client
# itself as `chroma_collection` (as before) hands the store an object without
# the collection API, so fetch/create the 'quickstart' collection explicitly.
chroma_collection = chroma_client.get_or_create_collection('quickstart')
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# VectorStoreIndex.from_documents() does not take a `vector_store=` keyword;
# an unknown keyword is silently dropped and the index falls back to the
# default in-memory store. The supported route is a StorageContext that wraps
# the vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# NOTE(review): the Chroma collection is persistent, so re-running this cell
# presumably inserts the same document again -- confirm, and reset the
# collection if duplicates in retrieval results are a problem.
index = VectorStoreIndex.from_documents(
    documents=docs,
    storage_context=storage_context,
    embed_model=embeddings,
)

In [11]:
# Upper bound on how many of the most similar chunks are retrieved per query.
SIMILARITY_TOP_K = 3
retriever = index.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

In [18]:
# Chat model requested through Ollama; it writes the final answer from the
# chunks the retriever returns.
LLM_MODEL_NAME = "mistral:7b"
llm = Ollama(model=LLM_MODEL_NAME)

# Combine the retriever and the LLM into one question-answering engine.
query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    llm=llm,
)

In [19]:
# Ask one question against the indexed abstract and print the question/answer
# pair.
query = "what is deep singer"
response = query_engine.query(query)
print("\nQ: {}\nA: {}".format(query, response))


Q: what is deep singer
A:  DeepSinger refers to a multi-lingual multi-singer singing voice synthesis (SVS) system, which is built from scratch using singing training data mined from music websites. The system has a pipeline that includes several steps such as data crawling, singing and accompaniment separation, lyrics-to-singing alignment, data filtration, and singing modeling. DeepSinger directly generates linear spectrograms from lyrics and synthesizes voices using Griffin-Lim. It is designed to avoid human efforts for alignment labeling and greatly reduce labeling cost, as well as to synthesize singing voices in multiple languages and multiple singers. It has been evaluated on a dataset of approximately 92 hours of data from 89 singers across three languages (Chinese, Cantonese, and English). The results indicate that DeepSinger can produce high-quality singing voices in terms of both pitch accuracy and voice naturalness.
