<a href="https://colab.research.google.com/github/Greeshmasindhu24/DocumentRetrival-Using-LangChain-/blob/main/7_Document_Query_Retrieval_and_Vector_Database_using_LangChain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import fitz
import pdfplumber
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy scalars and arrays."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        NumPy arrays become (nested) lists, NumPy floating scalars become
        ``float`` and NumPy integer scalars become ``int``. Any other object
        is delegated to ``json.JSONEncoder.default``, which raises
        ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF, trying PyMuPDF first and pdfplumber second.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        With PyMuPDF, the text of every page, each followed by a newline.
        With the pdfplumber fallback, the non-empty page texts joined by
        newlines. An empty string when both extractors fail.

    Extraction is best-effort: errors are printed rather than raised. If
    PyMuPDF fails part-way through the document, the pages read so far are
    discarded so a truncated document is never returned as if it were
    complete.
    """
    try:
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text("text") + "\n" for page in doc)
    except Exception as e:
        # Deliberately broad: any PyMuPDF failure should trigger the fallback.
        print("Error with PyMuPDF, trying pdfplumber:", e)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Run the extraction once per page; the result is used both to
            # filter out empty pages and as the value that is joined.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print("Error extracting text from PDF:", e)
        return ""

def store_in_vector_db(
    text,
    index_path="faiss_index",
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    chunk_size=500,
    chunk_overlap=50,
):
    """Split *text* into chunks, embed them and persist a FAISS index.

    Args:
        text: The document text to index.
        index_path: Directory the FAISS index is saved to.
        model_name: Hugging Face embedding model used to embed the chunks.
            The same model must be used when the index is queried.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Fail fast with a clear message before loading any model, instead of
    # handing the vector store an empty list of chunks.
    if not text or not text.strip():
        raise ValueError("Cannot build a vector index from empty text.")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_text(text)

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_texts(chunks, embedding_model)
    vector_store.save_local(index_path)
    print("Vector database saved!")
    return vector_store

def query_vector_db(
    query,
    k=1,
    index_path="faiss_index",
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
):
    """Search the saved FAISS index and return the best matches as JSON.

    Args:
        query: Natural-language query text.
        k: Number of results to return.
        index_path: Directory the FAISS index is loaded from.
        model_name: Hugging Face embedding model used to embed the query.
            It has to be the model the index was built with (the default
            matches ``store_in_vector_db``); vectors from different models
            are not comparable even when their dimensions agree.

    Returns:
        A JSON string. On success it is a list of objects with ``content``,
        ``metadata`` and ``similarity_score`` keys, ordered from most to
        least similar. On failure it is an object with a single ``error``
        key; the exception is printed, not raised.
    """
    try:
        embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        # NOTE: allow_dangerous_deserialization makes load_local unpickle the
        # stored docstore. Only load indexes this program created itself.
        vector_store = FAISS.load_local(
            index_path,
            embedding_model,
            allow_dangerous_deserialization=True
        )

        try:
            results = vector_store.similarity_search_with_score(query, k=k)
            formatted_results = []
            for doc, score in results:
                # Scores may arrive as NumPy scalars; unwrap to a Python number.
                if hasattr(score, 'item'):
                    score = score.item()
                # Map a distance (smaller is better) to a (0, 1] similarity.
                similarity = 1 / (1 + score)

                formatted_results.append({
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "similarity_score": float(similarity)
                })
        except (AttributeError, TypeError) as e:
            print(f"Using standard similarity search: {e}")
            results = vector_store.similarity_search(query, k=k)
            # NOTE(review): no real scores are available on this path; the
            # value below is a rank-based placeholder (0.1, 0.0, -0.1, ...)
            # that only preserves the order returned by the search.
            formatted_results = [
                {
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "similarity_score": float(0.1 - (i * 0.1))
                }
                for i, doc in enumerate(results)
            ]
        formatted_results.sort(key=lambda x: x["similarity_score"], reverse=True)
        return json.dumps(formatted_results, indent=2, cls=NumpyEncoder)

    except Exception as e:
        print(f"Error querying vector database: {e}")
        return json.dumps({"error": str(e)})

def main(
    pdf_path="/content/drive/MyDrive/Hands on/power-bi-question.pdf",
    query="what is power bi?",
):
    """Run the demo pipeline: extract a PDF, index it, and run one query.

    Args:
        pdf_path: Path of the PDF to index.
        query: Question to run against the freshly built index.

    Progress and the JSON result are printed. Nothing is returned.
    """
    if not os.path.exists(pdf_path):
        print("File not found!")
        return

    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)

    # extract_text_from_pdf appends a newline per page, so a PDF without any
    # extractable text still yields a non-empty (but blank) string. Treat
    # that as a failed extraction instead of trying to index nothing.
    if not text or not text.strip():
        print("Failed to extract text.")
        return

    print("Text extracted successfully!")
    print("Storing in vector database...")
    store_in_vector_db(text)
    print("Querying: ", query)
    results_json = query_vector_db(query)
    print(results_json)

if __name__ == "__main__":
    main()

Extracting text from PDF...
Text extracted successfully!
Storing in vector database...
Vector database saved!
Querying:  what is power bi?
[
  {
    "content": "addressed in Power BI?\nMany-to-many relationships comprise a bridge table showing the combinations of two\ndimensions. These combinations can either be possible or those that have occurred.\n\u25cf\nBi-directional cross-filtering relationships can be used in PBIX.\n\u25cf\nDAX is used per metric to check or modify filter context.\n\u25cf\nCROSSFILTER is used in Power Pivot in Excel.\n46. What is the difference between a Power BI Dataset, a Report,\nand a Dashboard?\nPower BI Dataset\nReport\nDashboard\nThe\nsource\nto",
    "metadata": {},
    "similarity_score": 0.1147263867319606
  }
]
