## PDF Summarizer, Key Elements Extractor & Chatbot


In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import fitz  # PyMuPDF
import cohere
import io
import base64
import webbrowser
from threading import Timer

import os

# Initialize the Cohere client. The key is read from the COHERE_API_KEY
# environment variable so that a real secret never has to be committed with
# the source; the placeholder below is only the fallback and must be replaced
# (or the variable set) before any request can succeed.
cohere_api_key = os.environ.get("COHERE_API_KEY", "your_cohere_api_key_here")
co = cohere.Client(cohere_api_key)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an in-memory PDF.

    Args:
        pdf_file: PDF data passed to ``fitz.open`` as its ``stream``
            argument (the callers in this file pass an ``io.BytesIO``).

    Returns:
        The extracted text on success. Otherwise one of these messages:

        * ``"No pages found in the PDF."`` when the document has no pages;
        * ``"PDF does not contain readable text."`` when every page yields
          only whitespace;
        * ``"Error extracting text from PDF: <reason>"`` when opening or
          reading the document raises.

        Failures are reported through the return value; no exception is
        propagated to the caller.
    """
    try:
        # Using the document as a context manager guarantees it is closed,
        # including on the early return and when a page fails to load.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
            if len(pdf_document) == 0:
                return "No pages found in the PDF."
            text = "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

    if not text.strip():
        return "PDF does not contain readable text."
    return text

# Cohere model's maximum token limit (approx. 4,000 tokens)
_MAX_TOKENS = 4000
# Leave some buffer for the prompt. NOTE: the text is sliced by *characters*,
# not tokens, so this is only a stand-in for the token budget.
# NOTE(review): presumably conservative because a token usually spans more
# than one character - confirm against the model's tokenizer.
_CHUNK_SIZE = _MAX_TOKENS - 100


def _generate_for_each_chunk(text, build_prompt):
    """Run one Cohere generation per chunk of *text* and join the results.

    Args:
        text: Source text; it is cut into consecutive slices of at most
            ``_CHUNK_SIZE`` characters (slices may split a word).
        build_prompt: Callable taking one chunk and returning the full
            prompt string to send for it.

    Returns:
        The stripped generations, one per chunk, joined with newlines.
        An empty *text* produces no requests and returns ``""``.

    Raises:
        Whatever the Cohere client (or slicing a non-string) raises; the
        public wrappers below turn that into an error message.
    """
    chunks = [text[i:i + _CHUNK_SIZE] for i in range(0, len(text), _CHUNK_SIZE)]

    outputs = []
    for chunk in chunks:
        response = co.generate(
            model='command-xlarge',
            prompt=build_prompt(chunk),
            max_tokens=300,
            temperature=0.7
        )
        outputs.append(response.generations[0].text.strip())

    return "\n".join(outputs)


# Function to get Cohere summary of topics with text chunking
def get_topic_summary(text):
    """Summarize the main topics of *text*, chunk by chunk.

    Returns:
        The per-chunk summaries joined with newlines, or
        ``"Error generating topic summary: <reason>"`` if anything raises.
    """
    def build_prompt(chunk):
        return (
            "You are a topic summarization bot. Identify and summarize the main topics present in the following text:\n\n"
            f"{chunk}\n\nTopics and Summary:"
        )

    try:
        return _generate_for_each_chunk(text, build_prompt)
    except Exception as e:
        return f"Error generating topic summary: {str(e)}"


# Function to list key elements or topics with text chunking
def list_key_elements(text):
    """List the key elements of *text*, chunk by chunk.

    Returns:
        The per-chunk lists joined with newlines, or
        ``"Error extracting key elements: <reason>"`` if anything raises.
    """
    def build_prompt(chunk):
        return (
            "You are a key elements extraction bot. Identify and list the main elements or key points in the following text:\n\n"
            f"{chunk}\n\nKey Elements:"
        )

    try:
        return _generate_for_each_chunk(text, build_prompt)
    except Exception as e:
        return f"Error extracting key elements: {str(e)}"


# Function to answer questions about the PDF with text chunking
def answer_question(text, question):
    """Answer *question* against *text*, chunk by chunk.

    Every chunk is queried independently with the same question, so the
    result contains one answer per chunk rather than a single merged answer.

    Returns:
        The per-chunk answers joined with newlines, or
        ``"Error answering the question: <reason>"`` if anything raises.
    """
    def build_prompt(chunk):
        return (
            f"You are a knowledgeable assistant. Answer the following question based on the provided text:\n\n"
            f"Text:\n{chunk}\n\n"
            f"Question: {question}\n\nAnswer:"
        )

    try:
        return _generate_for_each_chunk(text, build_prompt)
    except Exception as e:
        return f"Error answering the question: {str(e)}"

# Create the Dash application instance.
app = dash.Dash(__name__)

# Style used by both read-only result panels (topic summary and key
# elements); each panel receives its own copy of this dict.
_RESULT_PANEL_STYLE = {
    'marginTop': '30px',
    'fontSize': '18px',
    'maxWidth': '700px',
    'textAlign': 'left',
    'backgroundColor': '#2a2a2a',
    'padding': '20px',
    'borderRadius': '10px',
    'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.2)',
    'color': '#e0e0e0',
    'whiteSpace': 'pre-wrap'
}

# Page heading.
_page_title = html.H1(
    "PDF Summarizer, Key Elements Extractor & Chatbot",
    style={
        'textAlign': 'center',
        'color': '#ffffff',
        'marginBottom': '20px',
        'fontSize': '42px',
        'fontWeight': '700'
    }
)

# Single-file drop zone; its `contents` property feeds the callbacks.
_upload_button = html.Button('Upload PDF', id='upload-btn', style={
    'padding': '15px 30px',
    'backgroundColor': '#007bff',
    'color': 'white',
    'border': 'none',
    'borderRadius': '8px',
    'cursor': 'pointer',
    'fontSize': '18px',
    'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.3)',
    'transition': 'background-color 0.3s',
    'textTransform': 'uppercase'
})
_upload_hint = html.P("Drag and drop or click to upload a PDF file.", style={
    'marginTop': '10px',
    'fontSize': '16px',
    'color': '#b0b0b0',
    'fontStyle': 'italic'
})
_upload_area = dcc.Upload(
    id='upload-data',
    children=html.Div([_upload_button, _upload_hint]),
    style={
        'width': '100%',
        'maxWidth': '700px',
        'border': '2px dashed #007bff',
        'borderRadius': '10px',
        'padding': '30px',
        'textAlign': 'center',
        'backgroundColor': '#2d2d2d',
        'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.2)'
    },
    multiple=False
)

# Button whose click count is an input of the summary callback.
_extract_button = html.Button('Extract Key Elements', id='extract-btn', style={
    'padding': '15px 30px',
    'backgroundColor': '#28a745',
    'color': 'white',
    'border': 'none',
    'borderRadius': '8px',
    'cursor': 'pointer',
    'fontSize': '18px',
    'marginTop': '20px',
    'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.3)',
    'transition': 'background-color 0.3s',
    'textTransform': 'uppercase'
})

# Output containers filled in by the summary callback.
_summary_panel = html.Div(id='summary-output', style=dict(_RESULT_PANEL_STYLE))
_key_elements_panel = html.Div(id='key-elements-output', style=dict(_RESULT_PANEL_STYLE))

# Question box: heading, text input, submit button and answer area.
_question_heading = html.H3(
    "Ask a Question About the PDF:",
    style={'textAlign': 'center', 'color': '#ffffff'}
)
_question_input = dcc.Input(
    id='question-input',
    type='text',
    placeholder="Type your question here...",
    style={
        'width': '100%',
        'padding': '10px',
        'border': 'none',
        'borderRadius': '5px',
        'marginBottom': '10px',
        'fontSize': '16px'
    }
)
_question_button = html.Button('Submit Question', id='question-btn', style={
    'padding': '10px 20px',
    'backgroundColor': '#007bff',
    'color': 'white',
    'border': 'none',
    'borderRadius': '5px',
    'cursor': 'pointer',
    'fontSize': '16px',
    'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.3)',
    'transition': 'background-color 0.3s',
    'textTransform': 'uppercase'
})
_question_answer = html.Div(id='question-output', style={
    'marginTop': '20px',
    'fontSize': '18px',
    'color': '#e0e0e0',
    'backgroundColor': '#333333',
    'padding': '15px',
    'borderRadius': '5px',
    'whiteSpace': 'pre-wrap'
})
_question_panel = html.Div(
    style={
        'marginTop': '30px',
        'maxWidth': '700px',
        'width': '100%',
        'backgroundColor': '#2a2a2a',
        'padding': '20px',
        'borderRadius': '10px',
        'boxShadow': '0 4px 12px rgba(0, 0, 0, 0.2)',
        'color': '#e0e0e0'
    },
    children=[
        _question_heading,
        _question_input,
        _question_button,
        _question_answer,
    ]
)

# Page layout: a dark, centered single column holding the pieces above.
app.layout = html.Div(
    style={
        'backgroundColor': '#1e1e1e',
        'color': '#e0e0e0',
        'fontFamily': 'Arial, sans-serif',
        'padding': '30px',
        'display': 'flex',
        'flexDirection': 'column',
        'alignItems': 'center',
        'height': '100vh',
        'overflow': 'auto'
    },
    children=[
        _page_title,
        _upload_area,
        _extract_button,
        _summary_panel,
        _key_elements_panel,
        _question_panel,
    ]
)

# Callback to process uploaded PDF and summarize topics and key elements
@app.callback(
    [Output('summary-output', 'children'),
     Output('key-elements-output', 'children')],
    [Input('upload-data', 'contents'),
     Input('extract-btn', 'n_clicks')]
)
def update_output(contents, n_clicks):
    """Render the topic summary and key elements for the uploaded PDF.

    Fires when a file is uploaded and when the extract button is clicked.

    Args:
        contents: Value of the upload component's ``contents`` property; the
            part after the first comma is decoded as base64. Falsy when no
            file has been uploaded.
        n_clicks: Click count of the extract button. Only used as a trigger;
            its value is not read.

    Returns:
        A pair of components for the summary panel and the key-elements
        panel. When there is no upload, extraction fails, or processing
        raises, the first component carries the message and the second is
        an empty ``Div``.
    """
    if not contents:
        return html.Div("Upload a PDF file to get started."), html.Div()

    try:
        # Split on the first comma only: everything before it is the
        # header, everything after it is the base64 payload.
        _content_type, content_string = contents.split(',', 1)
        decoded = base64.b64decode(content_string)

        # Extract text from PDF
        text = extract_text_from_pdf(io.BytesIO(decoded))

        # extract_text_from_pdf reports failures through its return value.
        # Match its exact messages rather than searching for the word
        # "Error", which would misclassify any document that merely
        # mentions errors, and would let the two non-"Error" messages
        # through to the model as if they were document text.
        extraction_failed = (
            text.startswith("Error extracting text from PDF:")
            or text in (
                "No pages found in the PDF.",
                "PDF does not contain readable text.",
            )
        )
        if extraction_failed:
            return html.Div(text), html.Div()

        summary = get_topic_summary(text)
        key_elements = list_key_elements(text)

        summary_view = html.Div([
            html.H3("Topic Summary:", style={'textAlign': 'center', 'color': '#ffffff'}),
            html.Pre(summary, style={'whiteSpace': 'pre-wrap', 'fontSize': '16px'})
        ])
        key_elements_view = html.Div([
            html.H3("Key Elements:", style={'textAlign': 'center', 'color': '#ffffff'}),
            html.Pre(key_elements, style={'whiteSpace': 'pre-wrap', 'fontSize': '16px'})
        ])
        return summary_view, key_elements_view
    except Exception as e:
        # Callback boundary: show the problem in the page instead of
        # letting the callback fail.
        return html.Div(f"Error processing file: {str(e)}"), html.Div()

# Callback to handle questions about the PDF
@app.callback(
    Output('question-output', 'children'),
    [Input('question-btn', 'n_clicks')],
    [dash.dependencies.State('question-input', 'value'),
     dash.dependencies.State('upload-data', 'contents')]
)
def answer_question_callback(n_clicks, question, contents):
    """Answer the typed question using the text of the uploaded PDF.

    Args:
        n_clicks: Click count of the submit button; falsy before the first
            click.
        question: Current value of the question input.
        contents: Value of the upload component's ``contents`` property; the
            part after the first comma is decoded as base64.

    Returns:
        The answer text; ``""`` when the button has not been clicked or the
        question is empty; otherwise a message asking for an upload or
        describing what went wrong.
    """
    if not (n_clicks and question):
        return ""
    if not contents:
        return "Please upload a PDF file first."

    try:
        # Split on the first comma only: everything before it is the
        # header, everything after it is the base64 payload.
        _content_type, content_string = contents.split(',', 1)
        decoded = base64.b64decode(content_string)

        # Extract text from PDF
        text = extract_text_from_pdf(io.BytesIO(decoded))

        # extract_text_from_pdf reports failures through its return value.
        # Match its exact messages rather than searching for the word
        # "Error", which would reject any document that merely mentions
        # errors and would let the two non-"Error" messages be queried as
        # if they were document text.
        extraction_failed = (
            text.startswith("Error extracting text from PDF:")
            or text in (
                "No pages found in the PDF.",
                "PDF does not contain readable text.",
            )
        )
        if extraction_failed:
            return f"Error: {text}"

        return answer_question(text, question)
    except Exception as e:
        # Callback boundary: show the problem in the page instead of
        # letting the callback fail.
        return f"Error processing the question: {str(e)}"

# Open the locally served app in the browser
def open_browser(port):
    """Open ``http://127.0.0.1:<port>/`` via ``webbrowser.open_new``."""
    local_url = f'http://127.0.0.1:{port}/'
    webbrowser.open_new(local_url)

# Run the app
if __name__ == '__main__':
    port = 8057  # Set the port number
    # Open the browser one second after start-up begins; the timer runs on
    # its own thread because the server call below blocks.
    Timer(1, open_browser, [port]).start()
    # `run_server` is the legacy name of `run` and is no longer accepted by
    # current Dash releases; prefer `run` and fall back only on versions
    # that predate it.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=False, use_reloader=False, port=port)


## Requirements

In [3]:
pip install dash PyMuPDF cohere


Note: you may need to restart the kernel to use updated packages.
