In [2]:
# Install required libraries
! pip install streamlit PyMuPDF cohere pinecone-client datasets transformers sentence-transformers pdfplumber gradio PyPDF2






In [3]:
import streamlit as st
import fitz  # PyMuPDF for PDF processing
import cohere
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import pdfplumber
import os
from dotenv import load_dotenv
import gradio as gr
import PyPDF2

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\manas\AppData\Local\sagemaker\sagemaker\config.yaml


In [4]:
# Populate the process environment from a local .env file (if present),
# then read the two API keys this notebook needs. Either value is None
# when the corresponding variable is not set.
load_dotenv()

cohere_api_key = os.environ.get("COHERE_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [5]:
# Initialize Pinecone and Cohere clients.
# The class-based Pinecone client is configured with the API key alone; the
# cloud/region of a serverless index is supplied via ServerlessSpec when the
# index is created, so the legacy `environment` argument is not passed here.
pc = Pinecone(api_key=pinecone_api_key)
co = cohere.Client(cohere_api_key)

In [6]:
# Name of the Pinecone index that stores the document chunk vectors
index_name = "sample-article"

# Provision a serverless cosine-similarity index the first time the notebook
# runs; later runs find it in the listing and skip creation.
# NOTE(review): dimension=384 is presumably the output size of the embedding
# model loaded later in the notebook — confirm if the model changes.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=spec,
    )

# Handle used for all subsequent upserts and queries
index = pc.Index(index_name)

In [7]:
# Pre-trained sentence-embedding model; the same instance encodes both the
# document chunks and the user's query.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)



In [8]:

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_file: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        str: The text of all pages, each followed by a single space. If an
        error occurs part-way through, the text gathered so far is returned
        (an empty string if nothing could be read); the error is printed
        rather than raised.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                # extract_text() may yield None for a page without extractable
                # text (e.g. a scanned image); treat that as an empty page
                # instead of letting `None + " "` abort the whole extraction.
                page_texts.append((page.extract_text() or "") + " ")
        extracted_chars = sum(len(piece) for piece in page_texts)
        print(f"Successfully extracted {extracted_chars} characters from PDF")
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
    return "".join(page_texts)

In [9]:
def _split_into_chunks(text, chunk_size):
    """Slice ``text`` into consecutive pieces of at most ``chunk_size`` characters."""
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


def process_document(pdf_file, chunk_size=500, batch_size=100):
    """Extract a PDF's text, embed it in fixed-size chunks and upload to Pinecone.

    Args:
        pdf_file: Path (or file object) forwarded to ``extract_text_from_pdf``.
        chunk_size: Maximum number of characters per chunk (default 500).
        batch_size: Maximum number of vectors sent per upsert request
            (default 100). Pinecone caps the size of a single upsert request,
            so large documents are uploaded in several smaller requests.

    Returns:
        list[str]: The text chunks, in document order. Empty when no text
        could be extracted, in which case nothing is embedded or uploaded.

    Raises:
        ValueError: If ``chunk_size`` or ``batch_size`` is not positive.

    Upload errors are printed rather than raised; the chunks are still returned.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    text = extract_text_from_pdf(pdf_file)
    chunks = _split_into_chunks(text, chunk_size)
    print(f"Created {len(chunks)} chunks from the document")

    # Nothing to embed: skip the model call and the (invalid) empty upsert.
    if not chunks:
        print("No text was extracted; nothing to embed or upload")
        return chunks

    embeddings = embedder.encode(chunks)
    print(f"Generated embeddings for {len(embeddings)} chunks")

    try:
        # NOTE(review): IDs depend only on the chunk position, so a new
        # document overwrites same-numbered vectors of an earlier one while
        # higher-numbered vectors from a longer earlier document stay in the
        # index — confirm whether the index should be cleared or namespaced.
        vectors_to_upsert = [
            (f"chunk_{i}", emb.tolist(), {"text": chunk})
            for i, (emb, chunk) in enumerate(zip(embeddings, chunks))
        ]
        for start in range(0, len(vectors_to_upsert), batch_size):
            index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
        print(f"Successfully uploaded {len(embeddings)} embeddings to Pinecone")
    except Exception as e:
        print(f"Error uploading embeddings to Pinecone: {str(e)}")

    return chunks

In [10]:
def retrieve_relevant_chunks(query, top_k=3):
    """Return the stored chunk texts most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and sent to the
    Pinecone ``index``. Matches that carry no ``metadata`` or no ``text``
    entry are skipped.

    Args:
        query: The user's question.
        top_k: Number of nearest matches to request (default 3).

    Returns:
        list[str]: Chunk texts in the order Pinecone returned them, or an
        empty list if the query failed (the error is printed, not raised).
    """
    query_embedding = embedder.encode([query])[0]
    try:
        response = index.query(
            vector=query_embedding.tolist(),
            top_k=top_k,
            include_metadata=True,
        )
        hits = response['matches']
        print(f"Retrieved {len(hits)} relevant chunks")

        texts = []
        for hit in hits:
            if 'metadata' not in hit:
                continue
            metadata = hit['metadata']
            if 'text' in metadata:
                texts.append(metadata['text'])
        return texts
    except Exception as err:
        print(f"Error querying Pinecone: {str(err)}")
        return []

In [16]:
def generate_answer(question, context, max_tokens=100, model='command'):
    """Ask the Cohere generate endpoint to answer ``question`` from ``context``.

    Args:
        question: The user's question.
        context: Text placed before the question in the prompt (the
            retrieved document chunks).
        max_tokens: Upper bound on the length of the generated answer.
            Defaults to 100, the value previously hard-coded; raise it if
            answers come back cut off.
        model: Name of the Cohere model to use (default ``'command'``).

    Returns:
        str: The text of the first generation, or a fixed apology string if
        the request failed (the error is printed, not raised).
    """
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    try:
        response = co.generate(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        print("Successfully generated answer using Cohere")
        return response.generations[0].text
    except Exception as e:
        print(f"Error generating answer with Cohere: {str(e)}")
        return "Sorry, I couldn't generate an answer at this time."

In [17]:
def qa_bot(pdf_file, question):
    """Answer ``question`` from the contents of an uploaded PDF.

    The document is chunked, embedded and uploaded on every call; the chunks
    most similar to the question are then used as context for the answer.

    Args:
        pdf_file: The upload handed over by the UI — either a path string or
            an object exposing the path as ``.name``; ``None`` when nothing
            was uploaded.
        question: The user's question.

    Returns:
        str: The generated answer, or a short message explaining what input
        is missing or why the document could not be used.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    if not question or not question.strip():
        # Reject blank questions before paying for embedding and upload.
        return "Please enter a question."

    # Depending on the Gradio version / File `type` setting, the upload may
    # arrive as a plain path string or as a wrapper carrying the path in
    # `.name`; accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    document_chunks = process_document(pdf_path)
    if not document_chunks:
        # Without this guard the query below would be answered from whatever
        # an earlier upload left in the index.
        return "Could not extract any text from the uploaded PDF."

    relevant_chunks = retrieve_relevant_chunks(question)
    context = " ".join(relevant_chunks)
    return generate_answer(question, context)

In [18]:
# Gradio UI: a PDF upload and a question box are passed to qa_bot, and the
# string it returns is shown in a text box.
pdf_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
question_input = gr.Textbox(label="Ask a Question")
answer_output = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=qa_bot,
    inputs=[pdf_input, question_input],
    outputs=answer_output,
    title="Interactive QA Bot",
    description="Upload a PDF document and ask a question based on its contents.",
)

# Start the web server that hosts the interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




Successfully extracted 2341 characters from PDF
Created 5 chunks from the document
Generated embeddings for 5 chunks
Successfully uploaded 5 embeddings to Pinecone
Retrieved 3 relevant chunks
Successfully generated answer using Cohere
Successfully extracted 2341 characters from PDF
Created 5 chunks from the document
Generated embeddings for 5 chunks
Successfully uploaded 5 embeddings to Pinecone
Retrieved 3 relevant chunks
Successfully generated answer using Cohere
Successfully extracted 10733 characters from PDF
Created 22 chunks from the document
Generated embeddings for 22 chunks
Successfully uploaded 22 embeddings to Pinecone
Retrieved 3 relevant chunks
Successfully generated answer using Cohere
Created dataset file at: .gradio\flagged\dataset2.csv
Successfully extracted 10733 characters from PDF
Created 22 chunks from the document
Generated embeddings for 22 chunks
Successfully uploaded 22 embeddings to Pinecone
Retrieved 3 relevant chunks
Successfully generated answer using Coher