In [1]:
import gradio as gr
import os, re
from collections import Counter
from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import openai
from PyPDF2 import PdfReader
import spacy
import traceback

  import multipart


In [3]:
# Initialize OpenAI API key
def init_openai(api_key):
    """Register ``api_key`` as the module-wide key on the ``openai`` package.

    Args:
        api_key: The OpenAI API key to store on ``openai.api_key``.

    Side effects:
        Overwrites ``openai.api_key`` and prints a confirmation line. The key
        itself is never printed.
    """
    setattr(openai, "api_key", api_key)
    print("OpenAI API key initialized")

# Load and extract text from PDF
def load_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its first argument
            (the Gradio upload passes a file path).

    Returns:
        tuple: ``(text, total_pages)`` where ``text`` is the text of all pages
        joined with a newline and ``total_pages`` is the page count.
    """
    reader = PdfReader(pdf_file)
    pages = reader.pages
    # `or ""` guards against pages for which no text could be extracted, so a
    # scanned/image-only page cannot break the concatenation.
    page_texts = [(page.extract_text() or "") for page in pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next (which would skew word counts and entity spans).
    text = "\n".join(page_texts)
    return text, len(pages)


In [5]:
# Split text into chunks
def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each slice (default 500).

    Returns:
        list[str]: The slices in order; the last one may be shorter. An empty
        ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

# Build QA system using ChromaDB
def build_qa_system(documents):
    """Index ``documents`` into a Chroma-backed ``VectorStoreIndex``.

    Args:
        documents: The documents handed to ``VectorStoreIndex.from_documents``.

    Returns:
        The index built on top of the ``my_chroma_store`` collection stored
        under ``chroma_store/``.
    """
    client = chromadb.PersistentClient(path="chroma_store/")
    # NOTE(review): get_or_create reuses the same on-disk collection on every
    # call, so repeated calls presumably keep adding to it — confirm whether
    # the collection should be reset per upload.
    collection = client.get_or_create_collection("my_chroma_store")
    storage_context = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=collection)
    )
    return VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
    )

# Calculate PDF statistics
def calculate_pdf_statistics(text, total_pages):
    """Summarise the size of an extracted PDF text.

    Args:
        text: The full extracted text.
        total_pages: Number of pages the text came from.

    Returns:
        dict: Page, word and character totals plus per-page averages. Both
        averages are ``0`` when ``total_pages`` is not positive.
    """
    word_count = len(re.findall(r'\w+', text))
    char_count = len(text)

    if total_pages > 0:
        words_per_page = word_count / total_pages
        chars_per_page = char_count / total_pages
    else:
        # Avoid dividing by zero for an empty document.
        words_per_page = 0
        chars_per_page = 0

    return {
        "Total Pages": total_pages,
        "Total Words": word_count,
        "Total Characters": char_count,
        "Average Words per Page": words_per_page,
        "Average Characters per Page": chars_per_page,
    }

# Recognize entities using spaCy
# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per kernel instead of once per request.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def recognize_entities(text):
    """Run named-entity recognition over ``text`` with ``en_core_web_sm``.

    Args:
        text: The text to analyse. ``None`` or an empty string is accepted.

    Returns:
        list[tuple[str, str]]: ``(entity_text, entity_label)`` pairs in
        document order; an empty list when there is no text.
    """
    # Nothing to analyse: skip loading the model entirely.
    if not text:
        return []
    doc = _get_spacy_pipeline()(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Answer query using the QA system
def answer_query(qa_index, query_text):
    """Ask ``query_text`` of ``qa_index`` and return the answer text.

    Args:
        qa_index: An object exposing ``as_query_engine()``.
        query_text: The question to pass to the query engine.

    Returns:
        The ``response`` attribute of the query engine's result.
    """
    result = qa_index.as_query_engine().query(query_text)
    return result.response

# Gradio App Logic
def process_pdf(pdf_file, api_key):
    """Load a PDF, index it for question answering and compute its statistics.

    Args:
        pdf_file: The PDF to read, forwarded to ``load_pdf``.
        api_key: OpenAI API key, forwarded to ``init_openai``.

    Returns:
        tuple: ``(qa_index, statistics, text, None)`` on success, or
        ``(None, None, None, error_message)`` if any step raised. The error
        message contains the exception text followed by the traceback.
    """
    try:
        init_openai(api_key)
        text, total_pages = load_pdf(pdf_file)
        documents = [Document(text=piece) for piece in split_text(text)]
        qa_index = build_qa_system(documents)
        statistics = calculate_pdf_statistics(text, total_pages)
    except Exception as exc:
        trace = traceback.format_exc()
        return None, None, None, f"Error processing PDF: {str(exc)}\n{trace}"
    return qa_index, statistics, text, None

def query_pdf(qa_index, query_text):
    """Answer ``query_text`` against ``qa_index``, reporting errors as text.

    Args:
        qa_index: The index to query; a falsy value means "not built".
        query_text: The question to ask.

    Returns:
        tuple: ``(answer, None)`` on success, or ``(None, error_message)`` when
        the index is missing or answering raised.
    """
    try:
        if qa_index:
            return answer_query(qa_index, query_text), None
        return None, "QA index is not initialized."
    except Exception as exc:
        trace = traceback.format_exc()
        return None, f"Error answering query: {str(exc)}\n{trace}"

def main(pdf_file, api_key, query_text, history, seq_num):
    """Process the uploaded PDF, answer one question and update the history.

    Args:
        pdf_file: The uploaded PDF, forwarded to ``process_pdf``.
        api_key: OpenAI API key, forwarded to ``process_pdf``.
        query_text: The user's question.
        history: List of history lines; appended to in place on success.
        seq_num: Sequence number to label this question with.

    Returns:
        tuple: ``(statistics, answer, entities, error, history_display,
        next_seq_num)``. ``history_display`` is always the history joined with
        newlines. ``next_seq_num`` is ``seq_num + 1`` only when the question
        was answered; otherwise it is ``seq_num`` unchanged so the numbering
        in the history has no gaps.
    """
    try:
        qa_index, statistics, extracted_text, process_error = process_pdf(pdf_file, api_key)
        if process_error:
            # Return the joined string (not the raw list) so this branch has
            # the same shape as every other return path.
            return None, None, None, process_error, "\n".join(history), seq_num

        answer, query_error = query_pdf(qa_index, query_text)
        entities = recognize_entities(extracted_text) if extracted_text else []

        next_seq_num = seq_num
        if not query_error:
            history.append(f"{seq_num}. Q: {query_text}")
            history.append(f"   A: {answer}")
            history.append("")
            # Only consume the sequence number when an entry was recorded.
            next_seq_num = seq_num + 1

        history_display = "\n".join(history)
        return statistics, answer, entities, query_error, history_display, next_seq_num
    except Exception as e:
        error_msg = traceback.format_exc()
        return None, None, None, f"Error in main process: {str(e)}\n{error_msg}", "\n".join(history), seq_num

In [7]:
# Gradio Interface
# Builds the Blocks layout, wires the Submit button to `on_submit`, and launches the app.
# NOTE(review): several label strings below look mis-encoded (emoji / accented
# characters decoded with the wrong charset) — confirm against the original notebook.
with gr.Blocks(css=None) as demo:
    gr.Markdown("# üåê **Nestl√© HR Policy Chatbot**", elem_id="main-heading")

    # Inputs: the PDF upload (passed to the handler as a file path) and the API key.
    with gr.Row():
        with gr.Column(scale=2):
            pdf_file_input = gr.File(label="üìÑ Upload Nestl√© HR Policy PDF", type="filepath")
        with gr.Column(scale=1):
            api_key_input = gr.Textbox(label="üîë Enter your OpenAI API key", type="password", placeholder="Your API Key here")

    query_input = gr.Textbox(label="‚ùì Ask a question about Nestle's HR policy", placeholder="Your question here", lines=3)

    query_button = gr.Button("Submit", elem_id="submit-btn")

    # Outputs: statistics and answer side by side, then entities and history.
    with gr.Row():
        with gr.Column(scale=1):
            pdf_statistics_output = gr.JSON(label="üìä PDF Statistics", elem_id="pdf-statistics")
        with gr.Column(scale=1):
            query_output = gr.Textbox(label="üìã Answer", lines=3, elem_id="answer-box")

    with gr.Row():
        entities_output = gr.JSON(label="üìÑ Recognized Entities", elem_id="entities-box")
        history_output = gr.Textbox(label="üìù Conversation History", lines=10, elem_id="history-box")

    error_output = gr.Textbox(label="‚ö†Ô∏è Error Log", lines=6, elem_id="error-log")

    # Per-session state: the list of history lines and the next question number.
    history_state = gr.State([])
    seq_num_state = gr.State(1)

    def on_submit(pdf_file, api_key, query_text, history, seq_num):
        """Click handler: run `main` and map its result onto the six outputs.

        When `main` reports an error, the statistics, answer and entities
        outputs are blanked and only the error log, history text and sequence
        number are passed through; otherwise the error log is cleared.
        """
        statistics, answer, entities, error_log, history_display, new_seq_num = main(pdf_file, api_key, query_text, history, seq_num)
        if error_log:
            return {}, "", {}, error_log, history_display, new_seq_num
        return statistics, answer, entities, "", history_display, new_seq_num

    # history_state is an input only; it is not listed in `outputs`, so the
    # handler never writes a new history list back to it.
    # NOTE(review): presumably the history persists only through in-place
    # mutation of the list passed in — confirm against Gradio's State semantics.
    query_button.click(on_submit, 
                       inputs=[pdf_file_input, api_key_input, query_input, history_state, seq_num_state], 
                       outputs=[pdf_statistics_output, query_output, entities_output, error_output, history_output, seq_num_state])

demo.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [9]:
# The original cell ran the shell command `pip install xelatex` as Python and
# failed with a SyntaxError. In a notebook, package installs go through the
# `%pip` magic rather than a bare `pip` statement.
# NOTE(review): XeLaTeX is normally provided by a TeX distribution (TeX Live,
# MiKTeX, MacTeX) installed at the OS level, not by pip — confirm and install
# it outside the notebook if PDF export is the goal.

In [11]:
# The original cell (`codeo install xelatex`) was a mistyped shell command run
# as Python and failed with a SyntaxError.
# NOTE(review): presumably `conda install` was intended; XeLaTeX normally comes
# from a TeX distribution installed outside the notebook — confirm before
# re-adding an install step here.