In [1]:
pip install pinecone groq sentence-transformers pdfminer.six pdf2image pytesseract langchain pillow

Collecting pinecone
  Downloading pinecone-7.0.2-py3-none-any.whl.metadata (9.5 kB)
Collecting groq
  Downloading groq-0.26.0-py3-none-any.whl.metadata (15 kB)
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl.metadata (27 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformer

In [None]:
import hashlib
import os
from getpass import getpass

import pytesseract
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
from PIL import Image
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# --- Configuration ---------------------------------------------------------
# Pinecone index dedicated to this notebook. Using a fixed name (instead of
# "whatever index happens to exist first") guarantees that the index dimension
# matches the embeddings produced below.
INDEX_NAME = "chatbot"
# Sentence-transformers checkpoint used to embed both document chunks and queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-roberta-large-v1"
# Dimension the index is created with; it must equal the length of the vectors
# returned by the embedding model above.
EMBEDDING_DIMENSION = 1024


def _require_api_key(env_var):
    """Return the API key stored in ``env_var``, prompting for it when unset.

    An already-exported key is never overwritten. When the variable is missing
    or blank the key is requested with ``getpass`` (so it is not echoed into
    the notebook output) and stored in ``os.environ`` for the rest of the
    session.

    Raises:
        RuntimeError: if no key is available after prompting.
    """
    key = os.environ.get(env_var, "").strip()
    if not key:
        key = getpass(f"Enter {env_var}: ").strip()
    if not key:
        raise RuntimeError(f"{env_var} is required but was not provided.")
    os.environ[env_var] = key
    return key


groq_client = Groq(api_key=_require_api_key("GROQ_API_KEY"))
pc = Pinecone(api_key=_require_api_key("PINECONE_API_KEY"))

# Create the dedicated index on first use; reuse it on later runs.
indexes = pc.list_indexes().names()
index_name = INDEX_NAME
if index_name not in indexes:
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(name=index_name)
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# File extensions that are OCR'd directly instead of being parsed as a PDF.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif", ".webp")


def extract_text_from_pdf(file_path):
    """Extract the text of a PDF (or image) file, using OCR when needed.

    Strategy:
      1. A file whose extension is in ``_IMAGE_EXTENSIONS`` is opened with
         PIL and passed straight to Tesseract.
      2. Otherwise the embedded text layer is read with pdfminer.
      3. If pdfminer raises *or* returns only whitespace (which is what a
         scanned PDF without a text layer produces), every page is rendered
         with pdf2image and OCR'd.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with leading/trailing whitespace removed. The
        result may be an empty string when nothing could be recognised.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension in _IMAGE_EXTENSIONS:
        with Image.open(file_path) as picture:
            return pytesseract.image_to_string(picture).strip()

    try:
        text = extract_text(file_path).strip()
    except Exception as exc:
        # Best-effort: any parsing failure falls through to OCR, but the
        # reason is reported instead of being silently discarded.
        print(f"⚠️ Direct text extraction failed ({exc}); falling back to OCR.")
        text = ""
    if text:
        return text

    # No usable text layer: rasterise the pages and run OCR on each one.
    pages = convert_from_path(file_path)
    return "".join(pytesseract.image_to_string(page) for page in pages).strip()

def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks suitable for embedding.

    Args:
        text: The full document text.
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty or
        contains only whitespace.
    """
    if not text or not text.strip():
        return []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

def generate_embedding(text):
    """Embed a single string with the module-level ``model``.

    The text is encoded as a one-element batch; the first (only) row of the
    result is converted with ``tolist()`` and returned.
    """
    encoded_batch = model.encode([text])
    first_row = encoded_batch[0]
    return first_row.tolist()

def ingest_file(file_path, batch_size=100):
    """Extract, chunk, embed and upsert a file into the Pinecone index.

    Each chunk is stored under the SHA-256 hex digest of its text. Unlike the
    built-in ``hash()``, whose value for strings changes between interpreter
    processes, the digest is stable, so ingesting the same content again
    overwrites the existing vectors instead of adding duplicates.

    Args:
        file_path: Path of the file to ingest.
        batch_size: Number of vectors sent per ``index.upsert`` call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("📄 Extracting text from file...")
    text = extract_text_from_pdf(file_path)
    print("✂️ Splitting text into chunks...")
    chunks = chunk_text(text)
    if not chunks:
        # Do not claim success when there was nothing to index.
        print("⚠️ No text could be extracted from the file; nothing was indexed.\n")
        return

    print(f"📦 Indexing {len(chunks)} chunks into Pinecone...")
    for start in range(0, len(chunks), batch_size):
        # Build and send one batch at a time so that only ``batch_size``
        # embeddings are held in memory and one request covers many chunks.
        batch = [
            {
                "id": hashlib.sha256(chunk.encode("utf-8")).hexdigest(),
                "values": generate_embedding(chunk),
                "metadata": {"text": chunk},
            }
            for chunk in chunks[start:start + batch_size]
        ]
        index.upsert(batch)
    print("✅ File indexed successfully!\n")

def search_pinecone(query_vector, top_k=3):
    """Return the stored text of the vectors closest to ``query_vector``.

    Args:
        query_vector: Embedding of the user query.
        top_k: Number of nearest matches requested from the index.

    Returns:
        A list with the ``metadata["text"]`` value of each match, in the
        order returned by the index. Matches that carry no text metadata
        are skipped rather than aborting the search.
    """
    results = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    contexts = []
    for match in results["matches"]:
        try:
            text = match["metadata"]["text"]
        except (KeyError, TypeError, AttributeError):
            # Match has no metadata, or its metadata has no "text" entry.
            continue
        if text:
            contexts.append(text)
    return contexts

def generate_response(prompt, model_name="llama3-8b-8192",
                      system_prompt="You are a helpful assistant."):
    """Send ``prompt`` to the Groq chat-completions API and return the reply.

    Args:
        prompt: Content of the user message.
        model_name: Identifier of the Groq-hosted model to call.
            NOTE(review): hosted model ids get retired over time; confirm the
            default is still served before relying on it.
        system_prompt: Content of the system message that precedes the prompt.

    Returns:
        The ``content`` of the first choice's message.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    chat_completion = groq_client.chat.completions.create(
        messages=conversation,
        model=model_name,
    )
    return chat_completion.choices[0].message.content

def chatbot(use_indexed_context=True):
    """Run the interactive question/answer loop until the user quits.

    Args:
        use_indexed_context: When true, each question is embedded, the
            closest chunks are fetched with ``search_pinecone`` and prepended
            to the prompt. If a search returns nothing, the session switches
            to direct chat (no retrieval) for its remaining questions.

    The loop ends when the user types ``exit`` (any case, surrounding
    whitespace ignored) or when input is closed/interrupted. Blank lines are
    ignored instead of being sent to the model.
    """
    print("\n💬 Chatbot ready! Type 'exit' to quit.\n")
    while True:
        try:
            query = input("🧑 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the chat cleanly.
            print("\n👋 Goodbye!")
            break

        if not query:
            continue
        if query.lower() == "exit":
            print("👋 Goodbye!")
            break

        prompt = query
        if use_indexed_context:
            contexts = search_pinecone(generate_embedding(query))
            if contexts:
                prompt = "\n".join(contexts) + f"\nUser: {query}"
            else:
                print("⚠️ No context found in Pinecone. Switching to direct chat.\n")
                use_indexed_context = False

        response = generate_response(prompt)
        print("🤖 Bot:", response)

def main():
    """Show the start menu and launch the selected chat mode.

    The menu is shown again after an invalid choice or a missing file. This
    is done with a loop rather than by calling ``main()`` recursively, so
    repeated invalid entries do not grow the call stack.

    Options:
        1 - ask for a file path, ingest the file, then chat with retrieval.
        2 - chat directly without retrieval.
        3 - exit.
    """
    print("🤖 Welcome to the Chatbot Assistant!\n")
    while True:
        print("🔘 Select an option:")
        print("1. 📄 Upload file and chat using its content")
        print("2. 💬 Chat directly without file")
        print("3. ❌ Exit")

        choice = input("Enter your choice (1/2/3): ").strip()

        if choice == "1":
            file_path = input("📂 Enter path to PDF or image file: ").strip()
            # isfile (not exists): a directory path cannot be ingested.
            if not os.path.isfile(file_path):
                print("❌ File not found. Returning to menu.\n")
                continue
            ingest_file(file_path)
            chatbot(use_indexed_context=True)
            return
        if choice == "2":
            chatbot(use_indexed_context=False)
            return
        if choice == "3":
            print("👋 Exiting...")
            return

        print("⚠️ Invalid choice. Try again.\n")

# Start the interactive menu only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()