In [1]:
from langchain import PromptTemplate
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate

# Load the PDF

In [2]:
# Location of the PDF to index; point this at a real file before running.
local_path = "/path/to/pdf"

# Fail fast when no path is configured. Merely printing a hint would leave
# `data` undefined (or stale from an earlier run), and the failure would only
# surface later when the documents are split into chunks.
if not local_path:
    raise ValueError("Upload a PDF file")

# Load the local PDF; `data` holds the documents returned by the loader.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

## Load the model using LlamaCpp

In [3]:
# Filesystem path to the GGUF model file that is passed to LlamaCpp below.
model_path = "/path/to/gguf/model.gguf" 

In [4]:
# Double-check that `model_path` points at a real GGUF file on this machine.
# The model file can be downloaded from Hugging Face, or copied out of the
# Ollama models directory and renamed so that it ends in .gguf.
# NOTE(review): max_tokens is not set, so the wrapper's default generation cap
# applies — confirm it is large enough for the answers expected from this model.
llm = LlamaCpp(
    model_path=model_path,
    verbose=False,
    n_ctx=4096,
)

In [5]:
# Break the loaded PDF documents into overlapping chunks for embedding.
# NOTE(review): chunk_size is presumably measured in characters (the splitter's
# default length function) — confirm 4096-character chunks fit both the
# embedding model's input limit and the LLM context once several are retrieved.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=64,
)
chunks = text_splitter.split_documents(data)

# Embedding and Vector DB

In [6]:
%%time
# Load embedding model 
from langchain.embeddings import SentenceTransformerEmbeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = HuggingFaceEmbeddings(model_name=model_name)



CPU times: user 566 ms, sys: 210 ms, total: 776 ms
Wall time: 5.35 s


In [7]:
%%time
vector_db_Chroma = Chroma.from_documents(
    documents=chunks, 
                    embedding=embedding_llm,
    collection_name="ragdb",
)

CPU times: user 6.36 s, sys: 406 ms, total: 6.77 s
Wall time: 3.89 s


# Prompt and Response 

In [8]:
# Prompt for retrieval-augmented answering: the model is told to rely only on
# the supplied context and to reply with a fixed phrase when the answer is
# missing. Placeholders: {context} (retrieved text) and {question}.
template = (
    "Answer the question based ONLY on the following context then summarize "
    "the context to get the answer and If the answer is not contained in the "
    "context, say 'NO ANSWER IS AVAILABLE'\n"
    "The following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

# NOTE(review): a chat-style prompt template is piped into a completion-style
# LLM wrapper here — confirm the rendered prompt text is what the model expects.
prompt_Template = ChatPromptTemplate.from_template(template)

# Runnable pipeline: fill in the prompt, then send it to the local model.
chain = prompt_Template | llm

In [9]:
# Print an LLM response incrementally as the chain produces it.
def stream_output(chain, question):
    """Stream a chain's output to stdout piece by piece.

    Args:
        chain: Any object exposing ``stream(input)`` that yields printable
            pieces of the response.
        question: The input forwarded unchanged to ``chain.stream``. Despite
            the name, the example cells pass a dict holding both the
            ``context`` and ``question`` prompt variables.

    Returns:
        None. Every piece is printed as soon as it arrives, with no separator
        between pieces and no trailing newline.
    """
    for piece in chain.stream(question):
        print(piece, end='')

# Examples

In [10]:
%%time
question = "What is types of Audio  Deepfake Deepfake Attacks"
similar_docs = vector_db_Chroma.similarity_search(question, k=3) # get similer docments form vdb
retriever = [ chunk.page_content for chunk in similar_docs ]
stream_output(chain , {'context': retriever ,"question": question})

Answer: 

The types of audio deepfakes attacks are as follows: 

* **Imitation-based:** This method involves transforming secret audio (the original) to sound like another speech (target audio). It can be done by using human voices with similar tones and inflections or by masking algorithms. For instance, Efficient Wavelet Mask (EWM) is an algorithm that transforms the signal of the original audio to mimic the target audio. 
* **Synthetic-based:** This technique aims to transform text into natural speech in real time through a TTS (Text-to-Speech) system.  The process involves using pre-trained models, such as Tactoran 2, Deep Voice 3, and FastSpeech 2 to generate synthetic audio. These models are trained on large datasets of clean recordings and use their knowledge to produce high-quality results.
* **Replay-based:** This is the type of attack that involves replaying a recording of the target speaker's voice and mimicking it through various techniques like far-field detection or cut-a

In [11]:
%%time
question = "What is types Challenges in this research?"
similar_docs = vector_db_Chroma.similarity_search(question, k=3) # get similer docments form vdb
retriever = [ chunk.page_content for chunk in similar_docs ]
stream_output(chain , {'context': retriever ,"question": question})

Answer: The context provided details several challenges in the field of Audio Deepfakes (AD). Here's a breakdown:

**1. Limited AD Detection Methods for Non-English Languages:** Most existing research focuses on English-speaking voices, neglecting other languages like Arabic. This poses a significant challenge in developing robust AD detection methods that can effectively identify and classify fakeness across various linguistic backgrounds.
**2. Lack of Accent Assessment in Existing AD Detection Methods:**  Current AD detection methods primarily rely on identifying the type of fake itself without considering nuances related to accent, tone, and other factors affecting audio authenticity. This approach limits their overall accuracy and effectiveness in accurately detecting real versus fake audio.

These are just two major challenges highlighted in the text. The document also mentions the need for improved data collection, better model training techniques, and more comprehensive evaluati

In [12]:
%%time
question = "Sort the Datasets for Fake Audio Detection "
similar_docs = vector_db_Chroma.similarity_search(question, k=3) # get similer docments form vdb
retriever = [ chunk.page_content for chunk in similar_docs ]
stream_output(chain , {'context': retriever ,"question": question})

**Answer:**

Based on the provided text, here are the datasets sorted by the types of fake audio they address: 

* **M-AILABS Speech:** A German audio dataset designed for speech recognition and synthetic audio, containing real samples of varying lengths.
* **Baidu Silicon Valley AI Lab cloned audio:**  Generated from a neural voice cloning tool, it contains high-quality multi-speaker audio clips in various formats.
* **Fake or Real (FoR):** A dataset released in 2019 that includes 8 synthetic voices generated by DeepVoice3 and Google-WavNet, providing data for detecting fake audio samples in multiple formats (MP3, WAV).
* **Ar-DAD Arabic Diversified Audio:** A dataset focused on Arabic speakers' voices. It contains both real and imitated voices from the Quran reciters. 
* **H-Voice:** This dataset is based on imitation and synthetic voices speaking in various languages such as Spanish, English, Portuguese, French, and Tagalog. 
* **ASV Spoof 2021 Challenge Dataset:** A publicly availa