In [22]:
import os
import pinecone
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.schema import Document
from typing import List
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
#import fitz  # PyMuPDF

# Configuration Variables
PINECONE_API_KEY = 'eb82ab1d-ccf9-473c-9262-b2418b5b4282'
PINECONE_ENVIRONMENT = 'us-east-1'  # e.g., 'us-west1-gcp'
#PINECONE_INDEX_NAME = 'multilingual-rag'
HF_TOKEN = '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'
EMBEDDING_MODEL = 'paraphrase-xlm-r-multilingual-v1'  # Or any suitable embedding model




In [23]:

index_name="multilingual-rag"
pc = Pinecone(api_key='eb82ab1d-ccf9-473c-9262-b2418b5b4282')

#create the index
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,#dimesion of dense vector
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )

In [4]:
# Initialize OpenAI Embeddings

embeddings=HuggingFaceEmbeddings(model_name="bert-base-multilingual-uncased")
embeddings


No sentence-transformers model found with name bert-base-multilingual-uncased. Creating a new one with mean pooling.


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='bert-base-multilingual-uncased', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [24]:
index=pc.Index(index_name)
index

<pinecone.data.index.Index at 0x17a32cd4f20>

In [25]:
def is_scanned_pdf(pdf_path: str) -> bool:
    """
    Determine if a PDF is scanned (image-based) or contains selectable text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text and text.strip():
                return False
    return True

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF file. Uses OCR if the PDF is scanned.
    """
    if is_scanned_pdf(pdf_path):
        print(f"Performing OCR on scanned PDF: {pdf_path}")
        images = convert_from_path(pdf_path)
        text = ""
        for img in images:
            text += pytesseract.image_to_string(img)
        return text
    else:
        print(f"Extracting text from digital PDF: {pdf_path}")
        text = ""
        # Use pdfplumber to extract text from a digital PDF
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text
def load_pdfs(pdf_dir: str) -> List[str]:
    """
    Load all PDF file paths from a directory.
    """
    pdf_files = []
    for root, dirs, files in os.walk(pdf_dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

def create_documents(pdf_paths: List[str]) -> List[Document]:
    """
    Create a list of LangChain Document objects from PDF paths.
    """
    documents = []
    for pdf_path in pdf_paths:
        text = extract_text_from_pdf(pdf_path)
        documents.append(Document(page_content=text, metadata={"source": pdf_path}))
    return documents

def split_documents(documents: List[Document]) -> List[Document]:
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = []
    for doc in documents:
        split_docs.extend(splitter.split_documents([doc]))
    return split_docs



In [26]:
from transformers import BertTokenizer

# Load the tokenizer for 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [27]:
#Genrating sparse vector

In [28]:

from collections import Counter


def build_dict(input_batch):
    # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # convert the input_ids list to a dictionary of key-to-frequency values
        d = dict(Counter(token_ids))
        
        # filter out special tokens (101: CLS, 102: SEP, 103: MASK, 0: padding)
        filtered_tokens = {key: d[key] for key in d if key not in [101, 102, 103, 0]}
        
        # separate indices and values and cast them to the expected types
        indices = [int(key) for key in filtered_tokens.keys()]  # ensure indices are integers
        values = [float(val) for val in filtered_tokens.values()]  # ensure values are floats
        
        # append the sparse vectors to sparse_emb list in the correct format
        sparse_emb.append({'indices': indices, 'values': values})
    
    # return the sparse embeddings list
    return sparse_emb

def generate_sparse_vectors(context_batch):
    # tokenize the input batch
    inputs = tokenizer(
        context_batch, 
        padding=True, 
        truncation=True, 
        max_length=512
    )['input_ids']
    
    # create sparse dictionaries in the format required by Pinecone
    sparse_embeds = build_dict(inputs)
    
    return sparse_embeds


In [29]:
#Generating sparse + dense vector

In [30]:
from tqdm.auto import tqdm

def upsert_to_pinecone(split_docs):
    batch_size = 32

    for i in tqdm(range(0, len(split_docs), batch_size)):
        # find end of batch
        i_end = min(i+batch_size, len(split_docs))
        # extract batch
        context_batch = split_docs[i:i_end]
        
        # extract the actual text from the Document objects
        context_texts = [doc.page_content for doc in context_batch]
        
        # create unique IDs
        ids = [str(x) for x in range(i, i_end)]
        # add context passages as metadata
        meta = [{'context': context_text} for context_text in context_texts]
        
        # create dense vectors (no need to call .tolist())
        dense_embeds = embeddings.embed_documents(context_texts)
        
        # create sparse vectors
        sparse_embeds = generate_sparse_vectors(context_texts)

        vectors = []
        # loop through the data and create dictionaries for uploading documents to pinecone index
        for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
            vectors.append({
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': metadata
            })

        # upload the documents to the new hybrid index
        index.upsert(vectors=vectors)

    # show index description after uploading the documents
    index.describe_index_stats()


In [31]:
"""Now we can query the index, providing the sparse and 
dense vectors of a question, along with a weight for keyword relevance (“alpha”). Alpha=1 will provide a purely semantic-based search result and 
alpha=0 will provide a purely keyword-based result equivalent to BM25. The default value is 0.5."""

'Now we can query the index, providing the sparse and \ndense vectors of a question, along with a weight for keyword relevance (“alpha”). Alpha=1 will provide a purely semantic-based search result and \nalpha=0 will provide a purely keyword-based result equivalent to BM25. The default value is 0.5.'

In [39]:
from pinecone_text.hybrid import hybrid_convex_scale

def hybrid_query(question, top_k, alpha):
    # convert the question into a sparse vector
    sparse_vec = generate_sparse_vectors([question])
    # convert the question into a dense vector
    dense_vec = embeddings.embed_query([question]).tolist()
    dense_vec, sparse_vector = hybrid_convex_scale(
        dense_vec, sparse_vec, alpha=alpha
    )
    # query pinecone with the query parameters
    result = index.query(
        vector=dense_vec,
        sparse_vector=sparse_vec[0],
        top_k=top_k,
        include_metadata=True,
      )
    # return search results as json
    return result

In [59]:
from pinecone_text.hybrid import hybrid_convex_scale

def hybrid_query(question, top_k, alpha):
    # Convert the question into a sparse vector
    sparse_vec = generate_sparse_vectors([question])
    
    # Convert the question into a dense vector
    dense_vec = embeddings.embed_query(question)  # No need for .tolist()

    # Check the type and structure of dense_vec and sparse_vec
    print(f'dense_vec: {dense_vec}, sparse_vec: {sparse_vec}')  # Debugging line

    # Ensure the sparse vector is in the expected format
    if isinstance(sparse_vec, list) and len(sparse_vec) > 0:
        sparse_vector = sparse_vec[0]  # Adjust as needed based on expected input
    else:
        raise ValueError("sparse_vec must be a non-empty list of dictionaries.")
    
    # Perform hybrid convex scaling
    scaled_dense, scaled_sparse = hybrid_convex_scale(
        dense_vec, sparse_vector, alpha=alpha
    )
    
    # Query Pinecone with the query parameters
    result = index.query(
        vector=scaled_dense,
        sparse_vector=scaled_sparse,
        top_k=top_k,
        include_metadata=True,
    )
    
    # Return search results as json
    return result


In [33]:
pdf_directory = r"E:\LLM projects\Multilingual_RAG\Advanced-RAG-with-multilingual-capabilities\sample_pdfs\en\test_pdf"  # Update this path
pdf_paths = load_pdfs(pdf_directory)
print(f"Found {len(pdf_paths)} PDF files.")    


Found 1 PDF files.


In [34]:

documents = create_documents(pdf_paths)
print(f"Extracted text and keywords from PDFs.")

Extracting text from digital PDF: E:\LLM projects\Multilingual_RAG\Advanced-RAG-with-multilingual-capabilities\sample_pdfs\en\test_pdf\The Alchemist by Paulo Coelho-1.pdf
Extracted text and keywords from PDFs.


In [35]:
# Split documents into chunks
split_docs = split_documents(documents)
print(f"Split documents into {len(split_docs)} chunks.")

Split documents into 261 chunks.


In [36]:
# Upsert documents to Pinecone
upsert_to_pinecone(split_docs)
print(f"Upserted all documents to Pinecone.")

100%|██████████| 9/9 [05:05<00:00, 33.95s/it]


Upserted all documents to Pinecone.


In [53]:
#Take user question
user_question = "What does Santiago notice in the sacristy of the abandoned church that he takes shelter in on the way to the merchant?"

In [51]:
# hybrid retriever
retriever_output = hybrid_query(user_question,3,0.5)
#print("Hybrid retriever initialized.")
print(retriever_output)

dense_vec: [-0.0874413475394249, 0.06685963273048401, -0.02957127057015896, 0.15570110082626343, -0.28925642371177673, 0.1513209342956543, 0.038793519139289856, 0.012834780849516392, -0.22768528759479523, 0.02760498784482479, -0.5841024518013, -0.15216900408267975, -0.1405993551015854, -0.19364595413208008, 0.3311733901500702, -0.007355878595262766, -0.013542771339416504, 0.14654631912708282, -0.24858324229717255, 0.13985984027385712, -0.08254257589578629, 0.16914409399032593, -0.22061055898666382, -0.02215919829905033, 0.5379860997200012, -0.23756997287273407, 0.0012062290916219354, 0.3764054775238037, -0.261172890663147, -0.09395284205675125, -0.08776575326919556, -0.06543240696191788, 0.06669039279222488, -0.09703323990106583, -0.16627539694309235, -0.2683883607387543, 0.028213748708367348, 0.5316920280456543, 0.06391129642724991, 0.1440470963716507, -0.0804431289434433, 0.32098588347435, -0.2043343186378479, 0.33363255858421326, -0.11518967151641846, -0.21422232687473297, 0.0101093

In [None]:
#Query Decomposition

In [13]:
# Install necessary packages if not already installed
# pip install transformers torch langchain sentencepiece langchain-huggingface

from langchain_huggingface import HuggingFacePipeline
from langchain import PromptTemplate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

# Load the mT5-1.2B model and tokenizer
model_name = "google/mt5-large"  # mT5-1.2B is equivalent to mt5-large
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32
)

# Create a text generation pipeline
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,          # Max length for generated text
    temperature=0.7,         # Adjust temperature for variability
    top_p=0.9,               # Nucleus sampling
    num_return_sequences=1,  # We only want one output
    truncation=True,         # Enable truncation
    device=-1                # Set to 0 if using GPU, -1 for CPU
)

# Wrap the pipeline with LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)
def query_decompose(question):
    # Define the refined prompt template for question decomposition
    template = """You are a helpful assistant that generates three distinct sub-questions related to the main question provided. 
    Each sub-question should focus on a specific aspect of the main question, and the sub-questions must be clear, concise, and relevant.

    Main Question: {question}

    Please list the sub-questions as a numbered list:
    1."""

    # Create the prompt template
    prompt_decomposition = PromptTemplate(
        input_variables=["question"],
        template=template
    )

    # Function to parse output into a list of sub-questions
    def parse_output(output):
        return output[0]['generated_text'].split('\n')

    # Create RunnableSequence for generating sub-questions
    generate_queries_decomposition = (
        prompt_decomposition | llm | parse_output
    )

    # Example usage
    if __name__ == "__main__":
        try:
            question = "What are the main components of an LLM-powered autonomous agent system?"
            questions = generate_queries_decomposition.invoke({"question": question})
            print("Generated Sub-Questions:")
            for idx, q in enumerate(questions, 1):
                print(f"{idx}. {q.strip()}")
        except Exception as e:
            print(f"An error occurred: {e}")


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
def main():
    # Path to the directory containing PDF files
    pdf_directory = "path_to_your_pdfs"  # Update this path
    
    # Load PDF file paths
    pdf_paths = load_pdfs(pdf_directory)
    print(f"Found {len(pdf_paths)} PDF files.")
    
    # Create Document objects with keywords
    documents = create_documents(pdf_paths)
    print(f"Extracted text and keywords from PDFs.")
    
    # Split documents into chunks
    split_docs = split_documents(documents)
    print(f"Split documents into {len(split_docs)} chunks.")
    
    # Upsert documents to Pinecone
    upsert_to_pinecone(split_docs)
    print(f"Upserted all documents to Pinecone.")

    #Take user question
    user_question = input("Enter your question: ")

    #Query Decompose
    sub_questions = query_decompose(user_question)
    
    # hybrid retriever
    retriever_output = hybrid_query(sub_questions,2,0.5)
    #print("Hybrid retriever initialized.")
    print(retriever_output)


if __name__ == "__main__":
    main()