In [1]:
import os
# Directory handed to the PDF loader: the relative path "../data", built with
# os.path.join so the separator matches the host OS.
pdf_path = os.path.join('..', 'data')

In [2]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader

def load_documets():
    """Load the PDFs found under the module-level ``pdf_path`` directory.

    Returns:
        Whatever ``PyPDFDirectoryLoader(pdf_path).load()`` produces. The
        notebook's recorded output shows entries carrying ``source``, ``page``
        and ``total_pages`` metadata.
    """
    loader = PyPDFDirectoryLoader(pdf_path)
    return loader.load()


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(
    documents: list[Document],
    chunk_size: int = 512,
    chunk_overlap: int = 50,
) -> list[Document]:
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length as measured by ``len`` (i.e. in
            characters). Defaults to 512, the value previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.
            Defaults to 50, the value previously hard-coded.

    Returns:
        The result of ``split_documents`` on the configured splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are treated as literal strings, not regular expressions.
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


In [4]:
# Load the PDFs and show the metadata of the first returned document.
docs = load_documets()
docs[0].metadata

{'producer': 'pdfTeX-1.40.17',
 'creator': 'LaTeX with acmart 2020/04/30 v1.71 Typesetting articles for the Association for Computing Machinery and hyperref 2016/06/24 v6.83q Hypertext links for LaTeX',
 'creationdate': '2020-07-16T00:20:53+00:00',
 'author': 'Yi Ren1*, Xu Tan2*, Tao Qin2, Jian Luan3, Zhou Zhao1, Tie-Yan Liu2',
 'keywords': 'singing voice synthesis, singing data mining, web crawling, lyrics-to-singing alignment',
 'moddate': '2020-07-16T00:20:53+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2',
 'subject': '-  Computing methodologies  ->  Natural language processing.-  Applied computing  ->  Sound and music computing.',
 'title': 'DeepSinger: Singing Voice Synthesis with Data Mined From the Web',
 'trapped': '/False',
 'source': '../data/DeepSinger.pdf',
 'total_pages': 12,
 'page': 0,
 'page_label': '1'}

In [5]:
# Split the loaded documents into chunks and display the first chunk.
chunks = split_documents(docs)
chunks[0]

Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with acmart 2020/04/30 v1.71 Typesetting articles for the Association for Computing Machinery and hyperref 2016/06/24 v6.83q Hypertext links for LaTeX', 'creationdate': '2020-07-16T00:20:53+00:00', 'author': 'Yi Ren1*, Xu Tan2*, Tao Qin2, Jian Luan3, Zhou Zhao1, Tie-Yan Liu2', 'keywords': 'singing voice synthesis, singing data mining, web crawling, lyrics-to-singing alignment', 'moddate': '2020-07-16T00:20:53+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '-  Computing methodologies  ->  Natural language processing.-  Applied computing  ->  Sound and music computing.', 'title': 'DeepSinger: Singing Voice Synthesis with Data Mined From the Web', 'trapped': '/False', 'source': '../data/DeepSinger.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='DeepSinger: Singing Voice Synthesis with Data Mined\nFrom the Web\nYi Ren1∗, Xu

In [6]:
def calculate_chunk_ids(chunks):
    """Assign an ID of the form ``"<source>:<page>:<index>"`` to every chunk.

    Example ID: ``"data/monopoly.pdf:6:2"`` (page source : page number :
    chunk index within that page).

    The index is tracked per ``source:page`` key, so chunks belonging to the
    same page receive distinct, increasing indices even when they are not
    adjacent in ``chunks``. For input where each page's chunks are contiguous
    the IDs are identical to a simple "reset on page change" counter.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.
            ``source`` and ``page`` are read with ``dict.get``, so a missing
            key is rendered as the string ``"None"`` in the ID.

    Returns:
        The same ``chunks`` object; each chunk's ``metadata["id"]`` has been
        set in place.
    """
    # Next unused chunk index for each "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [7]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embeddings_func():
    """Build the embedding object shared by ingestion and querying.

    Returns:
        A new ``OllamaEmbeddings`` instance configured with the
        ``nomic-embed-text`` model.
    """
    return OllamaEmbeddings(model='nomic-embed-text')

In [8]:
from langchain.vectorstores.chroma import Chroma

def add_to_chroma(chunks: list[Document]):
    """Add chunks to the Chroma store in ``"chroma"``, skipping IDs already stored.

    Each chunk is first tagged via ``calculate_chunk_ids`` (which writes
    ``metadata["id"]`` in place); only chunks whose ID is not yet in the store
    are added. Progress is reported with ``print``.

    Args:
        chunks: Chunks to store.

    Returns:
        None.
    """
    store = Chroma(
        persist_directory="chroma",
        embedding_function=get_embeddings_func(),
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] requests no extra fields; IDs are always included by default.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks whose ID is not stored yet.
    pending = [item for item in tagged_chunks if item.metadata["id"] not in known_ids]

    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [item.metadata["id"] for item in pending]
    store.add_documents(pending, ids=pending_ids)
    # NOTE(review): the recorded cell output echoes this call, which looks like
    # a deprecation warning — confirm whether an explicit persist() is still
    # required by the installed Chroma version before removing it.
    store.persist()

In [9]:
# Store the chunks in the Chroma DB; chunks whose ID is already present are skipped.
add_to_chroma(chunks)

  embeddings = OllamaEmbeddings(model='nomic-embed-text')
  db = Chroma(


Number of existing documents in DB: 0
👉 Adding new documents: 203


  db.persist()


In [10]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# Prompt skeleton formatted by query_rag(): {context} receives the retrieved
# chunk text and {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def query_rag(query_text: str, k: int = 5, include_sources: bool = False):
    """Answer a question using chunks retrieved from the ``"chroma"`` store.

    The ``k`` best-matching chunks are joined into a context block, inserted
    into ``PROMPT_TEMPLATE`` together with the question, and sent to the
    ``mistral`` model through Ollama.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve. Defaults to 5, the value previously
            hard-coded.
        include_sources: When True, return the answer together with the
            ``metadata["id"]`` of every retrieved chunk (``None`` for chunks
            without one). Defaults to False, which returns the answer only.

    Returns:
        The model's response text, or — when ``include_sources`` is True — the
        string ``"Response: <text>\\nSources: <list of ids>"``.
    """
    # Prepare the DB.
    db = Chroma(persist_directory='chroma', embedding_function=get_embeddings_func())

    # Search the DB; each result is unpacked below as a (document, score) pair.
    results = db.similarity_search_with_score(query_text, k=k)

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model="mistral")
    response_text = model.invoke(prompt)

    if not include_sources:
        return response_text

    # Previously this was computed and then discarded; it is now opt-in.
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    return f"Response: {response_text}\nSources: {sources}"

In [11]:
# Ask a question against the indexed chunks and print the model's answer.
response = query_rag("What the Deep singer paper about?")
print(response)

  model = Ollama(model="mistral")


 The DeepSinger paper is about developing a system that can synthesize high-quality singing voices from data mined from music websites. It addresses the challenges of creating an SVS (Singling Voice Synthesis) system by designing a pipeline with several data mining and modeling steps, including data crawling, singing and accompaniment separation, lyrics-to-singing alignment, data filtration, and singing modeling. The contributions of this paper include the creation of the first SVS system mined from music websites and the ability to synthesize singing voices in multiple languages and for multiple singers.
