In [None]:
import json
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings  # Using Hugging Face embeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# ---------------------------
# 1. PDF Processing
# ---------------------------

def process_pdf(pdf_path, chunk_size=1000, chunk_overlap=100):
    """
    Reads a PDF with PyPDFLoader and returns its content as text chunks.

    Args:
        pdf_path: Location of the PDF file to load.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        The documents produced by the splitter's ``split_documents`` call.
    """
    loaded = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded)

# ---------------------------
# 2. JSON Processing
# ---------------------------

def flatten_json(y):
    """
    Flattens a nested JSON-like structure into a single-level dictionary.

    Dict keys and list indices along the path to each leaf are joined with
    underscores, e.g. ``{"a": {"b": [1, 2]}}`` -> ``{"a_b_0": 1, "a_b_1": 2}``.

    Args:
        y: A dict, list, or scalar (typically the result of ``json.load``).

    Returns:
        dict: Mapping from underscore-joined path to leaf value, in traversal
        order. A top-level scalar is stored under the empty key ``""``.
        Empty dicts and lists contribute no entries. If two different paths
        join to the same key (``{"a_b": 1, "a": {"b": 2}}``), the later one
        overwrites the earlier one.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                # JSON keys are always strings, but plain Python dicts may use
                # ints etc.; str() keeps those from raising TypeError here.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, name + str(index) + '_')
        else:
            out[name[:-1]] = x  # drop the trailing underscore added by the parent

    flatten(y)
    return out

def process_json(json_path):
    """
    Loads a JSON file and turns each record into a LangChain ``Document``.

    A top-level list is treated as one record per element; a top-level dict is
    treated as a single record. Each record is flattened with ``flatten_json``
    and its string-valued leaves are joined with single spaces to form the
    document text. Non-string leaves (numbers, booleans, nulls) are left out.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        list: One ``Document`` per record that has at least one non-blank
        string value. Any other top-level JSON type yields an empty list.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Normalise both accepted top-level shapes to "a list of records" so the
    # text-building logic exists only once.
    if isinstance(data, list):
        records = data
    elif isinstance(data, dict):
        records = [data]
    else:
        records = []

    documents = []
    for record in records:
        combined_text = " ".join(
            value for value in flatten_json(record).values() if isinstance(value, str)
        )
        # A record without any text would become an empty Document, which
        # still gets embedded and stored as a content-free vector; skip it.
        if not combined_text.strip():
            continue
        documents.append(Document(page_content=combined_text))
    return documents

# ---------------------------
# 3. Build a Vector Store for Efficient Retrieval Using Hugging Face Embeddings
# ---------------------------

def build_vector_store(documents, model_name="all-MiniLM-L6-v2"):
    """
    Embeds documents with a Hugging Face model and indexes them in FAISS.

    Args:
        documents: Documents to embed and index.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to ``"all-MiniLM-L6-v2"``, the
            model the rest of this notebook uses when loading the saved index;
            use the same name when reloading an index built here.

    Returns:
        The FAISS vector store returned by ``FAISS.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(documents, embeddings)

# ---------------------------
# 4. Example Usage
# ---------------------------

# In a notebook kernel __name__ is '__main__', so this body runs every time the
# cell is executed. The names assigned here become kernel globals;
# `vector_store` in particular is read by the cells that follow.
if __name__ == '__main__':
    # Load the PDF and split it into chunks (process_pdf defaults: 1000 / 100)
    pdf_file_path = '/kaggle/input/iba-dataset-pdf/pa-2024-25.pdf'  # Replace with your PDF file path
    pdf_docs = process_pdf(pdf_file_path)
    
    # Load the JSON file and build one Document per record
    json_file_path = '/kaggle/input/iba-dataset-json/courses_info.json'  # Replace with your JSON file path
    json_docs = process_json(json_file_path)
    
    # Combine documents from both sources into a single list (PDF chunks first)
    all_documents = pdf_docs + json_docs
    
    # Embed everything and build the FAISS vector store used for retrieval
    vector_store = build_vector_store(all_documents)

In [None]:
# Persist the vector store built in the previous cell. The path is relative to
# the kernel's working directory; later cells load from this same directory.
save_path = "faiss_index"
vector_store.save_local(save_path)
print(f"Vector store saved to '{save_path}'.")

In [None]:
# Quick sanity check on the in-memory vector store (requires `vector_store`
# from an earlier cell).
print("\nFAISS Index Details:")
# Print the underlying index object held by the vector store
print(vector_store.index)
# Print the total number of vectors in the index
print("Number of vectors in the index:", vector_store.index.ntotal)

In [None]:
import json
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings  # Using Hugging Face embeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# ---------------------------
# 1. PDF Processing
# ---------------------------
def process_pdf(pdf_path, chunk_size=1000, chunk_overlap=100):
    """
    Reads a PDF with PyPDFLoader and returns its content as text chunks.

    Args:
        pdf_path: Location of the PDF file to load.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        The documents produced by the splitter's ``split_documents`` call.
    """
    # NOTE(review): a function with this name is also defined in the first
    # cell; whichever cell ran last is the definition in effect.
    loaded = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(loaded)

# ---------------------------
# 2. JSON Processing
# ---------------------------
def flatten_json(y):
    """
    Flattens a nested JSON-like structure into a single-level dictionary.

    Dict keys and list indices along the path to each leaf are joined with
    underscores, e.g. ``{"a": {"b": [1, 2]}}`` -> ``{"a_b_0": 1, "a_b_1": 2}``.

    Args:
        y: A dict, list, or scalar (typically the result of ``json.load``).

    Returns:
        dict: Mapping from underscore-joined path to leaf value, in traversal
        order. A top-level scalar is stored under the empty key ``""``.
        Empty dicts and lists contribute no entries. If two different paths
        join to the same key (``{"a_b": 1, "a": {"b": 2}}``), the later one
        overwrites the earlier one.
    """
    # NOTE(review): a function with this name is also defined in the first
    # cell; whichever cell ran last is the definition in effect.
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                # JSON keys are always strings, but plain Python dicts may use
                # ints etc.; str() keeps those from raising TypeError here.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, name + str(index) + '_')
        else:
            out[name[:-1]] = x  # drop the trailing underscore added by the parent

    flatten(y)
    return out

def process_json(json_path):
    """
    Loads a JSON file and turns each record into a LangChain ``Document``.

    A top-level list is treated as one record per element; a top-level dict is
    treated as a single record. Each record is flattened with ``flatten_json``
    and its string-valued leaves are joined with single spaces to form the
    document text. Non-string leaves (numbers, booleans, nulls) are left out.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        list: One ``Document`` per record that has at least one non-blank
        string value. Any other top-level JSON type yields an empty list.
    """
    # NOTE(review): a function with this name is also defined in the first
    # cell; whichever cell ran last is the definition in effect.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Normalise both accepted top-level shapes to "a list of records" so the
    # text-building logic exists only once.
    if isinstance(data, list):
        records = data
    elif isinstance(data, dict):
        records = [data]
    else:
        records = []

    documents = []
    for record in records:
        combined_text = " ".join(
            value for value in flatten_json(record).values() if isinstance(value, str)
        )
        # A record without any text would become an empty Document, which
        # still gets embedded and stored as a content-free vector; skip it.
        if not combined_text.strip():
            continue
        documents.append(Document(page_content=combined_text))
    return documents

# ---------------------------
# 3. Build a Vector Store for Efficient Retrieval Using Hugging Face Embeddings
# ---------------------------
def build_vector_store(documents, model_name="all-MiniLM-L6-v2"):
    """
    Embeds documents with a Hugging Face model and indexes them in FAISS.

    Args:
        documents: Documents to embed and index.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to ``"all-MiniLM-L6-v2"``, the
            model the rest of this notebook uses when loading the saved index;
            use the same name when reloading an index built here.

    Returns:
        The FAISS vector store returned by ``FAISS.from_documents``.
    """
    # NOTE(review): a function with this name is also defined in the first
    # cell; whichever cell ran last is the definition in effect.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(documents, embeddings)

# ---------------------------
# 4. Main Execution: Process, Build, Save, and Print FAISS Index Data
# ---------------------------
# In a notebook kernel __name__ is '__main__', so this body runs every time the
# cell is executed. It repeats the build / save / inspect steps of the earlier
# cells in one place and then dumps every stored document.
if __name__ == '__main__':
    # Load the PDF and split it into chunks (process_pdf defaults: 1000 / 100)
    pdf_file_path = '/kaggle/input/iba-dataset-pdf/pa-2024-25.pdf'  # Replace with your PDF file path
    pdf_docs = process_pdf(pdf_file_path)
    
    # Load the JSON file and build one Document per record
    json_file_path = '/kaggle/input/iba-dataset-json/courses_info.json'  # Replace with your JSON file path
    json_docs = process_json(json_file_path)
    
    # Combine documents from both sources into a single list (PDF chunks first)
    all_documents = pdf_docs + json_docs
    
    # Embed everything and build the FAISS vector store used for retrieval
    vector_store = build_vector_store(all_documents)
    
    # Save the vector store locally; the path is relative to the kernel's
    # working directory and is the directory later cells load from
    save_path = "faiss_index"
    vector_store.save_local(save_path)
    print(f"Vector store saved to '{save_path}'.")
    
    # ---------------------------
    # Print FAISS Index Details
    # ---------------------------
    print("\nFAISS Index Details:")
    # Print the FAISS index object
    print(vector_store.index)
    # Print the total number of vectors in the index
    print("Number of vectors in the index:", vector_store.index.ntotal)
    
    # ---------------------------
    # Print the Data Stored in the FAISS Index
    # ---------------------------
    print("\nDocuments stored in the FAISS index:")
    # NOTE(review): `_dict` is a private attribute of the docstore and may not
    # be stable across LangChain versions. This loop prints the full content
    # of every stored document, so the cell output can be very long.
    for doc_id, doc in vector_store.docstore._dict.items():
        print(f"Document ID: {doc_id}")
        print(f"Content: {doc.page_content}")
        print("=" * 80)


In [None]:
import faiss

# Path to the FAISS index file saved by LangChain. It is relative to the
# kernel's working directory and sits inside the "faiss_index" directory that
# the save step wrote.
index_path = "faiss_index/index.faiss"

# Load the raw FAISS index directly with the faiss library; this reads only
# the index file, not a LangChain vector store object.
faiss_index = faiss.read_index(index_path)
print("Loaded FAISS index using faiss.read_index:")
print(faiss_index)


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the same embeddings model used when saving the index
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load the FAISS vector store from the saved directory. Use the same relative
# path the index was saved to ("faiss_index") instead of the Kaggle-only
# absolute path, so the cell works wherever the save step ran.
# NOTE(review): allow_dangerous_deserialization=True should only be used on
# index directories this notebook wrote itself, never on untrusted files.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
print("Loaded vector store using FAISS.load_local:")
print("FAISS index object:", vector_store.index)
print("Number of vectors in the index:", vector_store.index.ntotal)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the same embeddings model used when saving the index
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load the FAISS vector store from the saved directory (relative path, same as
# the one used when saving).
# NOTE(review): allow_dangerous_deserialization=True should only be used on
# index directories this notebook wrote itself, never on untrusted files.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Print basic details about the FAISS index
print("FAISS index details:")
print(vector_store.index)
print("Number of vectors in the index:", vector_store.index.ntotal)

# Print the stored documents.
# NOTE(review): `_dict` is a private attribute of the docstore and may not be
# stable across LangChain versions. Every document's full content is printed,
# so the output grows with the index size.
print("\nDocuments stored in the FAISS vector store:")
for doc_id, doc in vector_store.docstore._dict.items():
    print(f"\nDocument ID: {doc_id}")
    print(f"Content: {doc.page_content}")
    print("=" * 80)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Define the model name and initialize the embeddings model (must match the
# model used when the index was built)
model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Load the FAISS vector store from the saved directory.
# NOTE(review): allow_dangerous_deserialization=True should only be used on
# index directories this notebook wrote itself, never on untrusted files.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Print context information about the embedding model
print("Embedding Model Context:")
print(f"Model Name: {model_name}")
print("Description: 'all-MiniLM-L6-v2' is a SentenceTransformer model that provides efficient and effective text embeddings optimized for semantic similarity tasks.")
print("=" * 80)

# Define your query.
# NOTE(review): "tought" looks like a typo for "taught"; the misspelling is
# embedded as-is and may affect which documents are retrieved.
query = "List down the courses tought by Dr. Anum Tariq"

# Run a similarity search to retrieve the top 3 matching documents
retrieved_docs = vector_store.similarity_search(query, k=3)

# Print the full content of each retrieved document, numbered from 1
print("\nRetrieved Documents:")
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:\n{doc.page_content}\n{'-' * 80}")


In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import transformers

# ---------------------------
# 1. Load the FAISS Vector Store
# ---------------------------
# The embedding model must match the one used when the index was built.
model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
# "faiss_index" is relative to the kernel's working directory.
# NOTE(review): allow_dangerous_deserialization=True should only be used on
# index directories this notebook wrote itself, never on untrusted files.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
print(f"FAISS index loaded successfully. Number of vectors in the index: {vector_store.index.ntotal}")

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


FAISS index loaded successfully. Number of vectors in the index: 764


In [2]:

# ---------------------------
# 2. Query Handling & Similarity Search
# ---------------------------
# 2.1. Get a single text query from the user (blocks until input is entered,
# so this cell cannot run unattended)
query = input("Enter your query: ")

# 2.2. Embed the query and perform a unified similarity search across all stored documents,
# keeping the 5 best matches
retrieved_docs = vector_store.similarity_search(query, k=5)

# Optional: Print the retrieved document snippets for debugging purposes
# (only the first 300 characters of each document are shown)
print("\nRetrieved Documents:")
for i, doc in enumerate(retrieved_docs):
    print(f"Document {i+1} snippet:\n{doc.page_content[:300]}\n{'-'*80}")

# Combine the full text of the retrieved chunks, one per line, into a single
# context string
context = "\n".join([doc.page_content for doc in retrieved_docs])


Enter your query:  list down the teacher teaching microeconomics



Retrieved Documents:
Document 1 snippet:
MICROECONOMICS SYED AHMED 14:30 TUE - THU 57 97067
--------------------------------------------------------------------------------
Document 2 snippet:
MICROECONOMICS Dr. Khadija Malik Bari 10:00 TUE - THU 40 96761
--------------------------------------------------------------------------------
Document 3 snippet:
MICROECONOMICS Dr. Khadija Malik Bari 11:30 TUE - THU 40 96794
--------------------------------------------------------------------------------
Document 4 snippet:
MICROECONOMICS Dr. Asad Bilal Rizvi 10:00 TUE - THU 56 96772
--------------------------------------------------------------------------------
Document 5 snippet:
PRINCIPLES OF MICROECONOMICS Sahar Arshad Mahmood 10:00 TUE - THU 55 96774
--------------------------------------------------------------------------------


In [3]:
from transformers import AutoModelForQuestionAnswering, AutoModelForCausalLM, AutoTokenizer, pipeline

In [4]:
import torch