              **Data Analyst Agent**
                              by Aditya Raj

In [1]:

# Install required packages
!pip install together python-docx pdfplumber pytesseract pandas matplotlib seaborn openpyxl pillow python-magic gradio -U -q
!sudo apt install tesseract-ocr libmagic-dev -qq

import gradio as gr
import base64
import io
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tempfile
from docx import Document
import pdfplumber
import pytesseract
from PIL import Image
import magic
from together import Together

# ======================
# Enhanced Data Analyst Agent with Universal File Processing
# ======================
class DataAnalystAgent:
    """Conversational data analyst that combines uploaded files with an LLM.

    The agent holds at most one tabular dataset (``df``), an accumulating
    text corpus extracted from documents and images (``text_context``) and
    the chat transcript (``conversation_history``). Each query is sent to
    the configured Together model; Python code found in the reply is
    executed against ``df`` and any resulting matplotlib figure is returned
    as a base64-encoded PNG.
    """

    # Fence opened with a Python language tag (```python, ```python3, ```py).
    _PY_FENCE_RE = re.compile(r'```(?:python3?|py)\b(.*?)```', re.DOTALL | re.IGNORECASE)
    # Any fenced block; the capture still contains a language tag if present.
    _ANY_FENCE_RE = re.compile(r'```(.*?)```', re.DOTALL)
    # A language tag sitting alone on the opening fence line (```json, ```bash, ...).
    _LANG_TAG_RE = re.compile(r'[ \t]*[A-Za-z0-9_+#.-]+[ \t]*\r?\n')

    def __init__(self):
        """Create an empty agent and its Together client."""
        self.df = None
        self.text_context = ""
        self.conversation_history = []
        self.last_visualization = None
        # Prefer a key supplied through the environment so the secret does not
        # have to live in the source; the original placeholder is the fallback.
        self.api_key = os.environ.get("TOGETHER_API_KEY", "Your_API_Key")
        self.model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
        self.client = Together(api_key=self.api_key)
        self.file_types = ['.txt', '.docx', '.pdf', '.csv', '.xlsx', '.xls', '.png', '.jpg', '.jpeg']

    def _call_llama(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Any failure is reported as an ``"Error: ..."`` string instead of being
        raised, so callers always receive text.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2048,
                temperature=0.7,
                stream=False
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"Error: {str(e)}"

    def _extract_code(self, response):
        """Return the Python code contained in fenced blocks of ``response``.

        Blocks tagged as Python (``python``, ``python3``, ``py``, any case) are
        preferred. When there are none, untagged fenced blocks are used; blocks
        tagged with another language are skipped so their tag and content are
        never handed to ``exec``. Returns ``None`` when no usable block exists.
        """
        code_blocks = self._PY_FENCE_RE.findall(response)
        if not code_blocks:
            code_blocks = [
                block for block in self._ANY_FENCE_RE.findall(response)
                if not self._LANG_TAG_RE.match(block)
            ]
        return '\n'.join(code_blocks).strip() if code_blocks else None

    def _execute_code(self, code):
        """Run ``code`` and capture the current matplotlib figure, if any.

        Returns ``(img_base64, error)``: ``img_base64`` is a base64 PNG string
        or ``None`` when nothing was plotted; ``error`` is ``None`` on success
        or a message describing the exception raised by the code.
        """
        img_base64 = None
        try:
            env = {'pd': pd, 'np': np, 'plt': plt, 'sns': sns, 'df': self.df}
            # SECURITY: this executes model-generated code with the full
            # privileges of this process. It is not sandboxed; do not expose
            # the app to untrusted users without isolating this call.
            exec(code, env)
            # Check for existing figures first: plt.gcf() would otherwise
            # create (and leave behind) an empty figure.
            if plt.get_fignums() and plt.gcf().get_axes():
                buf = io.BytesIO()
                plt.savefig(buf, format='png', bbox_inches='tight')
                img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
                self.last_visualization = img_base64
            return img_base64, None
        except Exception as e:
            return None, f"Code execution error: {str(e)}"
        finally:
            # Release every figure, including ones left open by failed or
            # plot-less code, so stale axes never leak into the next request.
            plt.close('all')

    def _extract_text(self, ext, source):
        """Return the text found in ``source`` followed by a blank line.

        ``source`` is either a filesystem path or an ``io.BytesIO`` holding the
        file's bytes; ``ext`` is the lower-cased extension selecting the parser.
        """
        if ext == '.txt':
            if isinstance(source, io.BytesIO):
                return source.getvalue().decode('utf-8', errors='ignore') + "\n\n"
            with open(source, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read() + "\n\n"

        if ext == '.docx':
            doc = Document(source)
            return '\n'.join(p.text for p in doc.paragraphs) + "\n\n"

        if ext == '.pdf':
            with pdfplumber.open(source) as pdf:
                pages = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text contribute nothing.
            return ''.join(text + "\n\n" for text in pages if text)

        # Remaining supported types are images: run OCR and close the handle.
        with Image.open(source) as img:
            return pytesseract.image_to_string(img) + "\n\n"

    def process_file(self, file_input):
        """Universal file processor that handles both paths and file objects.

        ``file_input`` is a path (``str`` or ``os.PathLike``) or a file-like
        object exposing ``name`` and ``read()``. Tabular files (.csv, .xlsx,
        .xls) replace ``self.df``; every other supported type appends its text
        to ``self.text_context``.

        Returns ``(success, message)``; exceptions are converted into a
        ``(False, "Processing error: ...")`` result instead of propagating.
        """
        try:
            if isinstance(file_input, (str, os.PathLike)):
                file_path = os.fspath(file_input)
                filename = os.path.basename(file_path)
                if not os.path.exists(file_path):
                    return False, f"File not found: {file_path}"
            else:
                file_path = None
                filename = file_input.name

            ext = os.path.splitext(filename)[1].lower()
            if ext not in self.file_types:
                return False, "Unsupported file type"

            # Give every parser one kind of source: the path itself, or an
            # in-memory copy of the uploaded bytes.
            if file_path is not None:
                source = file_path
            else:
                payload = file_input.read()
                if isinstance(payload, str):
                    # Text-mode file objects yield str; parsers need bytes.
                    payload = payload.encode('utf-8')
                source = io.BytesIO(payload)

            if ext in ('.csv', '.xlsx', '.xls'):
                reader = pd.read_csv if ext == '.csv' else pd.read_excel
                self.df = reader(source)
            else:
                # Extract fully before appending so a failure midway through a
                # document does not leave a partial copy in the context.
                self.text_context += self._extract_text(ext, source)

            return True, "File processed successfully"
        except Exception as e:
            return False, f"Processing error: {str(e)}"

    def generate_response(self, user_query):
        """Answer ``user_query`` using the loaded data and recent chat turns.

        Builds a prompt from the dataframe summary, the first 5000 characters
        of the text context and the last three exchanges, asks the model, and
        executes any code in the reply. Returns ``(response, img_base64)``
        where ``img_base64`` is ``None`` unless the code produced a figure.
        The exchange is appended to ``conversation_history``.
        """
        context = ""
        if self.df is not None:
            context += f"Tabular data: {self.df.shape[0]} rows, {self.df.shape[1]} columns\n"
            # Column labels are not guaranteed to be strings (e.g. numeric
            # spreadsheet headers), so convert before joining.
            context += f"Columns: {', '.join(map(str, self.df.columns))}\n"
        if self.text_context:
            context += f"Document content:\n{self.text_context[:5000]}{'...' if len(self.text_context) > 5000 else ''}\n\n"

        history = "\n".join([
            f"User: {h['user']}\nAssistant: {h['assistant']}"
            for h in self.conversation_history[-3:]
        ])

        prompt = f"""<s>[INST] <<SYS>>
You are a helpful data analyst assistant. You can:
1. Answer questions based on data and documents.
2. Generate Python code using pandas/matplotlib/seaborn when asked.
   Use 'df' as the dataframe.
Return code using ```python ``` blocks if needed.
<</SYS>>

Available context:
{context}

{history}
User: {user_query}
Assistant: [/INST]"""

        response = self._call_llama(prompt)
        code = self._extract_code(response)
        img_base64 = None
        if code:
            img_base64, error = self._execute_code(code)
            if error:
                # Surface execution failures to the user alongside the reply.
                response += f"\n\n{error}"
        self.conversation_history.append({'user': user_query, 'assistant': response})
        return response, img_base64

    def reset(self):
        """Forget the loaded data, extracted text, transcript and last figure."""
        self.df = None
        self.text_context = ""
        self.conversation_history = []
        self.last_visualization = None

# Single module-level agent instance. The Gradio callbacks defined below
# (process_files, chat_response, reset_agent) all read and mutate this object,
# and the upload widget takes its accepted extensions from agent.file_types.
agent = DataAnalystAgent()

# ======================
# Robust Gradio UI Components
# ======================
def process_files(files):
    """Feed uploaded files to the shared agent and build the UI previews.

    ``files`` is the value of the upload widget: a list of file paths or
    file-like objects. ``None`` (nothing uploaded) and a single bare item are
    accepted as well.

    Returns ``(file_status, df_preview, text_preview)``: a list with one
    status dict per file, an HTML table of the first five dataframe rows (or
    a placeholder paragraph), and the first 2000 characters of the extracted
    text.
    """
    file_status = []
    df_preview = "<p>No tabular data available</p>"
    text_preview = ""

    # Normalise the widget value so the loop below always sees a sequence.
    if files is None:
        files = []
    elif not isinstance(files, (list, tuple)):
        files = [files]

    for file_info in files:
        is_path = isinstance(file_info, str)
        # Resolve a display name up front so the error branch can always
        # report something JSON-serialisable.
        if is_path:
            filename = os.path.basename(file_info)
        else:
            filename = getattr(file_info, 'name', str(file_info))

        try:
            # File-like objects may already have been read; rewind them.
            if not is_path and hasattr(file_info, 'seek'):
                file_info.seek(0)
            success, message = agent.process_file(file_info)

            file_status.append({
                "filename": filename,
                "status": "✅ Success" if success else "❌ Failed",
                "message": message
            })
        except Exception as e:
            file_status.append({
                "filename": filename,
                "status": "❌ Error",
                "message": f"Processing failed: {str(e)}"
            })

    # Prepare previews
    if agent.df is not None:
        df_preview = agent.df.head(5).to_html(index=False)
    if agent.text_context:
        text_preview = agent.text_context[:2000] + ("..." if len(agent.text_context) > 2000 else "")

    return file_status, df_preview, text_preview

def format_chat_message(message, is_user=False):
    """Render one chat message as an HTML bubble.

    ``message`` is treated as plain text and HTML-escaped before being
    embedded, so markup typed by the user or produced by the model is shown
    literally instead of being interpreted by the browser. ``is_user``
    selects the user styling/icon; otherwise the analyst styling is used.

    Returns the HTML fragment as a string.
    """
    # Local import keeps the escaping fix self-contained in this function.
    from html import escape

    icon = "👤" if is_user else "🤖"
    bubble_class = "user-bubble" if is_user else "bot-bubble"
    align_class = "user-message" if is_user else "bot-message"
    safe_message = escape(str(message))

    return f"""
    <div class="message {align_class}">
        <div class="message-bubble {bubble_class}">
            <div style="font-weight: 600; margin-bottom: 8px;">{icon} {'You' if is_user else 'Analyst'}</div>
            <div>{safe_message}</div>
        </div>
    </div>
    """

def chat_response(user_input, chat_history_html):
    """Handle one chat turn for the Gradio UI.

    Takes the textbox content and the current history HTML. Returns a
    4-tuple: the new textbox value (always empty), the updated history HTML,
    the generated figure as a PIL image (or ``None``) and a timing message.
    Whitespace-only input leaves the history unchanged and returns no image
    and an empty timing message.
    """
    # Nothing to send: hand the history back untouched.
    if not user_input.strip():
        return "", chat_history_html, None, ""

    # Time only the agent round-trip.
    started = time.time()
    response, img_base64 = agent.generate_response(user_input)
    elapsed = time.time() - started

    # Render both sides of the exchange and append them to the transcript.
    user_html = format_chat_message(user_input, is_user=True)
    bot_html = format_chat_message(response, is_user=False)
    updated_history = "".join([chat_history_html, user_html, bot_html])

    # Decode the base64 PNG into a PIL image when the agent produced a figure.
    figure = Image.open(io.BytesIO(base64.b64decode(img_base64))) if img_base64 else None

    return "", updated_history, figure, f"Generated in {elapsed:.2f}s"
def reset_agent():
    """Clear the shared agent and return fresh values for the UI.

    The returned 5-tuple is, in order: the chat history HTML (a single
    greeting bubble), the visualization (``None``), the dataframe preview
    placeholder, the text preview (empty) and a status message.
    """
    agent.reset()

    # Greeting bubble that replaces the whole conversation history.
    greeting_html = """
    <div class="message bot-message">
        <div class="message-bubble bot-bubble">
            <div style="font-weight: 600; margin-bottom: 8px;">🤖 Analyst</div>
            <div>Agent has been reset. Hello! I'm your Data Analyst Assistant. Upload files and ask me anything about your data.</div>
        </div>
    </div>
    """
    empty_table_html = "<p>No tabular data available</p>"
    status_text = "Agent has been reset"

    return greeting_html, None, empty_table_html, "", status_text

def download_visualization(img):
    """Write the current visualization to a PNG file and return its path.

    ``img`` is the value of the visualization component: ``None`` when there
    is nothing to download, an object with a PIL-style ``save(fp, format=...)``
    method, or array-like pixel data (converted with ``Image.fromarray``).

    Returns the path of a temporary ``.png`` file, or ``None`` when ``img`` is
    ``None``. A path is returned rather than raw bytes because the result is
    wired to a file-download component.
    """
    if img is None:
        return None

    # The image component may hand over pixel data instead of a PIL image.
    if not hasattr(img, "save"):
        img = Image.fromarray(np.asarray(img))

    # delete=False: the file must outlive this function so it can be served.
    with tempfile.NamedTemporaryFile(
        prefix="visualization_", suffix=".png", delete=False
    ) as tmp:
        img.save(tmp, format="PNG")
    return tmp.name

# Stylesheet handed to gr.Blocks(css=...) below. The class selectors are
# attached to components through elem_classes / elem_id in the layout, and the
# .message / .message-bubble / .user-* / .bot-* rules style the chat HTML
# emitted by format_chat_message and reset_agent.
# NOTE(review): .tabs, .tab-buttons, #upload-area, .upload-icon, .file-preview,
# .status-*, and .pulse are not referenced by the layout visible in this file —
# confirm whether they are still needed.
custom_css = """
:root {
    --primary: #2563eb;
    --secondary: #7dd3fc;
    --accent: #fbbf24;
    --dark: #1e293b;
    --light: #f1f5f9;
}

body {
    background: linear-gradient(135deg, var(--light) 0%, #e0f2fe 100%);
    font-family: 'Segoe UI', system-ui, sans-serif;
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.header {
    background: linear-gradient(90deg, var(--primary) 0%, #3b82f6 100%);
    color: white;
    padding: 25px 30px;
    border-radius: 15px 15px 0 0;
    box-shadow: 0 4px 20px rgba(0,0,0,0.1);
    margin-bottom: 20px;
}

.header h1 {
    margin: 0;
    font-size: 2.2rem;
    display: flex;
    align-items: center;
    gap: 12px;
}

.header p {
    margin: 10px 0 0;
    opacity: 0.9;
    font-size: 1.1rem;
}

.tabs {
    background: white;
    border-radius: 12px;
    box-shadow: 0 6px 25px rgba(0,0,0,0.08);
    overflow: hidden;
    margin-bottom: 25px;
}

.tab-buttons {
    display: flex;
    background: var(--light);
    border-bottom: 1px solid #e2e8f0;
}

.tab-buttons button {
    flex: 1;
    padding: 18px;
    border: none;
    background: none;
    font-size: 1.1rem;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
    color: var(--dark);
}

.tab-buttons button:hover {
    background: rgba(37, 99, 235, 0.1);
}

.tab-buttons button.selected {
    background: white;
    color: var(--primary);
    box-shadow: inset 0 -3px 0 var(--primary);
}

.tab-content {
    padding: 25px;
}

.preview-section {
    display: flex;
    gap: 25px;
    margin-bottom: 25px;
}

.preview-card {
    flex: 1;
    background: white;
    border-radius: 12px;
    padding: 20px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.05);
}

.preview-card h3 {
    margin-top: 0;
    color: var(--primary);
    display: flex;
    align-items: center;
    gap: 8px;
}

.chat-container {
    background: white;
    border-radius: 12px;
    box-shadow: 0 6px 25px rgba(0,0,0,0.08);
    overflow: hidden;
    margin-bottom: 25px;
}

.chat-header {
    background: linear-gradient(90deg, var(--primary) 0%, #3b82f6 100%);
    color: white;
    padding: 18px 25px;
    font-size: 1.2rem;
    font-weight: 600;
}

.chat-history {
    height: 400px;
    padding: 20px;
    overflow-y: auto;
    background: #f8fafc;
    border-bottom: 1px solid #e2e8f0;
}

.message {
    margin-bottom: 20px;
    display: flex;
}

.user-message {
    justify-content: flex-end;
}

.bot-message {
    justify-content: flex-start;
}

.message-bubble {
    max-width: 80%;
    padding: 15px 20px;
    border-radius: 20px;
    line-height: 1.5;
}

.user-bubble {
    background: var(--primary);
    color: white !important;  /* Force white text */
    border-bottom-right-radius: 5px;
}

.bot-bubble {
    background: #e0f2fe;
    color: #1e293b !important;  /* Force dark text */
    border-bottom-left-radius: 5px;
}

/* Ensure text is visible in all elements */
.message-bubble div {
    color: inherit !important;
}

/* Fix for response time text */
#response-time {
    color: var(--dark) !important;
    background: #f8fafc;
    padding: 8px 12px;
    border-radius: 8px;
    border: 1px solid #cbd5e1;
}
.chat-input {
    padding: 20px;
    background: white;
    display: flex;
    gap: 12px;
}

.visualization-section {
    background: white;
    border-radius: 12px;
    padding: 25px;
    box-shadow: 0 6px 25px rgba(0,0,0,0.08);
}

.visualization-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 20px;
}

.visualization-header h3 {
    margin: 0;
    color: var(--primary);
    display: flex;
    align-items: center;
    gap: 8px;
}

.visualization-content {
    text-align: center;
    min-height: 300px;
    display: flex;
    justify-content: center;
    align-items: center;
    background: #f8fafc;
    border-radius: 10px;
    padding: 20px;
}

.controls {
    display: flex;
    justify-content: space-between;
    margin-top: 25px;
    padding-top: 20px;
    border-top: 1px solid #e2e8f0;
}

.controls button {
    padding: 12px 25px;
    border: none;
    border-radius: 8px;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
    display: flex;
    align-items: center;
    gap: 8px;
}

.primary-btn {
    background: var(--primary);
    color: white;
}

.primary-btn:hover {
    background: #1d4ed8;
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
}

.secondary-btn {
    background: var(--light);
    color: var(--dark);
}

.secondary-btn:hover {
    background: #e2e8f0;
}

#chat-input {
    flex: 1;
    padding: 15px 20px;
    border-radius: 12px;
    border: 1px solid #cbd5e1;
    font-size: 1rem;
    transition: border-color 0.3s;
}

#chat-input:focus {
    border-color: var(--primary);
    outline: none;
    box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.2);
}

#submit-btn {
    background: var(--primary);
    color: white;
    border: none;
    border-radius: 12px;
    padding: 0 25px;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s;
}

#submit-btn:hover {
    background: #1d4ed8;
    transform: translateY(-2px);
}

#upload-area {
    border: 2px dashed #cbd5e1;
    border-radius: 12px;
    padding: 40px 20px;
    text-align: center;
    cursor: pointer;
    transition: all 0.3s;
    background: #f8fafc;
    margin-bottom: 20px;
}

#upload-area:hover {
    border-color: var(--primary);
    background: #f0f9ff;
}

.upload-icon {
    font-size: 48px;
    color: var(--primary);
    margin-bottom: 15px;
}

.file-preview {
    max-height: 200px;
    overflow-y: auto;
    margin-top: 15px;
    padding: 15px;
    background: #f1f5f9;
    border-radius: 8px;
    text-align: left;
}

.status-indicator {
    display: inline-block;
    width: 12px;
    height: 12px;
    border-radius: 50%;
    margin-right: 8px;
}

.status-active {
    background-color: #10b981;
}

.status-inactive {
    background-color: #ef4444;
}

@keyframes pulse {
    0% { transform: scale(1); }
    50% { transform: scale(1.05); }
    100% { transform: scale(1); }
}

.pulse {
    animation: pulse 2s infinite;
}
"""

# Create Gradio interface
# Layout, top to bottom: header, file upload with previews and status,
# chat panel, visualization panel with reset/download controls. Event
# handlers are wired after the layout, still inside the Blocks context.
with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
    with gr.Column(elem_classes=["container"]):
        # Header section
        with gr.Column(elem_classes=["header"]):
            gr.Markdown("""
            <div style="display: flex; align-items: center; gap: 15px;">
                <h1><span>📊 Data Analyst Agent</span></h1>
            </div>
            <p>Upload documents, analyze data, and get insights through natural conversation</p>
            """)

        # File upload section
        with gr.Column(elem_classes=["tab-content"]):
            gr.Markdown("### 📁 Upload Files")
            # Accepted extensions come from the agent so the widget and
            # DataAnalystAgent.process_file agree on what is supported.
            file_upload = gr.File(
                label="Upload your documents",
                file_count="multiple",
                file_types=agent.file_types,
                type="filepath"  # Explicitly request file paths
            )

            with gr.Row(elem_classes=["preview-section"]):
                with gr.Column(elem_classes=["preview-card"]):
                    gr.Markdown("### 🔢 Tabular Data Preview")
                    df_preview = gr.HTML(label="Data Preview", value="<p>Upload files to see preview</p>")

                with gr.Column(elem_classes=["preview-card"]):
                    gr.Markdown("### 📝 Text Content Preview")
                    text_preview = gr.Textbox(label="Text Preview", lines=8, interactive=False, value="")

            gr.Markdown("### 📋 File Processing Status")
            file_status = gr.JSON(label="File Status", value=[])

        # Chat interface
        with gr.Column(elem_classes=["chat-container"]):
            gr.Markdown("### 💬 Chat with Data Analyst", elem_classes=["chat-header"])

            # The transcript lives in this HTML component: chat_response
            # receives its current value and returns it with the new
            # exchange appended. It starts with a single greeting bubble.
            chat_history = gr.HTML(
    label="Conversation History",
    value="""
    <div class="message bot-message">
        <div class="message-bubble bot-bubble">
            <div style="font-weight: 600; margin-bottom: 8px;">🤖 Analyst</div>
            <div>Hello! I'm your Data Analyst Assistant. Upload files and ask me anything about your data.</div>
        </div>
    </div>
    """,
    elem_classes=["chat-history"]
)

            with gr.Row(elem_classes=["chat-input"]):
                user_input = gr.Textbox(
                    placeholder="Ask a question about your data...",
                    show_label=False,
                    elem_id="chat-input"
                )
                submit_btn = gr.Button("Send", elem_id="submit-btn")

            response_time = gr.Textbox(label="Response Time", interactive=False, value="", elem_id="response-time" )

        # Visualization section
        with gr.Column(elem_classes=["visualization-section"]):
            with gr.Column(elem_classes=["visualization-header"]):
                gr.Markdown("### 📊 Generated Visualizations")

            # NOTE(review): no `type=` is passed here, so the value handed to
            # download_visualization depends on Gradio's default for gr.Image —
            # confirm it matches what that handler expects.
            visualization = gr.Image(
                label="Latest Visualization",
                interactive=False,
                elem_classes=["visualization-content"],
                value=None
            )

            with gr.Row(elem_classes=["controls"]):
                reset_btn = gr.Button("Reset Agent", elem_classes=["secondary-btn"])
                download_btn = gr.Button("Download Visualization", elem_classes=["primary-btn"])

    # Event handlers
    # Each `outputs` list is positional: it must match the order of the tuple
    # returned by the corresponding callback.
    file_upload.upload(
        process_files,
        inputs=[file_upload],
        outputs=[file_status, df_preview, text_preview]
    )

    # The Send button and submitting the textbox trigger the same handler
    # with identical inputs and outputs.
    submit_btn.click(
        chat_response,
        inputs=[user_input, chat_history],
        outputs=[user_input, chat_history, visualization, response_time]
    )

    user_input.submit(
        chat_response,
        inputs=[user_input, chat_history],
        outputs=[user_input, chat_history, visualization, response_time]
    )

    # reset_agent returns five values; file_status is not among the outputs,
    # so the processing status list is left as it was.
    reset_btn.click(
        reset_agent,
        inputs=[],
        outputs=[chat_history, visualization, df_preview, text_preview, response_time]
    )

    # The gr.File output component is constructed inline here rather than in
    # the layout above.
    # NOTE(review): presumably it is rendered at this point of the Blocks
    # context, i.e. after the main container — confirm placement in the UI.
    download_btn.click(
        download_visualization,
        inputs=[visualization],
        outputs=gr.File(label="Download Visualization")
    )

# Launch the app only when this file is run as a script (not on import).
# NOTE(review): share=True presumably asks Gradio to publish a public share
# link; since DataAnalystAgent._execute_code runs model-generated code via
# exec(), confirm that exposing the app publicly is intended.
if __name__ == "__main__":
    demo.launch(share=True)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━