<a href="https://colab.research.google.com/github/Jayzilva/API-asp.net-core-controllers/blob/main/multi_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
! pip install groq
! pip install python-dotenv
! pip install python-pptx
! pip install pydantic
! pip install pymupdf
!pip install gradio

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Col

In [5]:
import os
import json
import re
from dotenv import load_dotenv
from groq import Groq
from pptx import Presentation
from pydantic import BaseModel, Field, validator
from typing import Optional, Union, Dict, Any
import gradio as gr
import fitz

# Load environment variables from .env file so that later code can read
# configuration and secrets through os.environ instead of hardcoding them
load_dotenv()

# Define the BusinessTermSheet model using Pydantic
class BusinessTermSheet(BaseModel):
    """Structured view of the term-sheet fields extracted from a presentation.

    Every field is optional (defaults to ``None``) and carries an alias equal
    to the human-readable label requested from the language model, e.g.
    ``"Asset Class"``.
    """

    # Simple single-value fields
    asset_class: Optional[str] = Field(None, alias="Asset Class")
    website: Optional[str] = Field(None, alias="Website")
    primary_impact: Optional[str] = Field(None, alias="Primary Impact")
    un_sustainable_development_goal: Optional[str] = Field(None, alias="UN Sustainable Development Goal")
    current_fund: Optional[str] = Field(None, alias="Current Fund")
    # May arrive as a list; flattened to a string by ensure_string below
    important_dates: Optional[Union[str, list]] = Field(None, alias="Important Dates")
    target_irr: Optional[str] = Field(None, alias="Target IRR")
    term: Optional[str] = Field(None, alias="Term")
    fund_domicile: Optional[str] = Field(None, alias="Fund Domicile")
    target_fund_size: Optional[str] = Field(None, alias="Target Fund Size")
    firm_strategy_overview: Optional[str] = Field(None, alias="Firm & Strategy Overview")
    total_firm_aum: Optional[str] = Field(None, alias="Total Firm AUM")
    # Fields that may be either free text or a nested object
    strategy_differentiator: Optional[Union[str, Dict[str, Any]]] = Field(None, alias="Strategy Differentiator")
    key_financial_data: Optional[Union[str, Dict[str, Any]]] = Field(None, alias="Key Financial Data")
    # May arrive as a dict; flattened to a string by ensure_string below
    investment_team_info: Optional[Union[str, Dict[str, Any]]] = Field(None, alias="Investment Team Size & Information")
    contact_details: Optional[Union[str, Dict[str, Any]]] = Field(None, alias="Contact Details")

    # Validator to ensure certain fields are always returned as strings
    @validator("important_dates", "investment_team_info", pre=True, always=True)
    def ensure_string(cls, v):
        """Flatten a list or dict value into one comma-separated string.

        Lists become ``"a, b, c"``; dicts become ``"key: value, key: value"``.
        Any other value (including ``None``) is returned unchanged.
        """
        if isinstance(v, list):
            # Items are not guaranteed to be strings (numbers, nested
            # objects, ...); str.join raises TypeError on anything else,
            # so convert each item explicitly.
            return ", ".join(str(item) for item in v)
        elif isinstance(v, dict):
            return ", ".join([f"{key}: {value}" for key, value in v.items()])
        return v

# Extract text from PowerPoint (.pptx) files
def extract_text_from_pptx(pptx_path):
    """Collect the text of every shape exposing a ``text`` attribute.

    Slides and shapes are visited in document order; each shape's text is
    followed by a newline. Returns the concatenated string.
    """
    deck = Presentation(pptx_path)
    pieces = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(pieces)

# Extract text from PDF files
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF, each page followed by a newline."""
    # The context manager closes the document once every page has been read.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() + "\n" for page in document]
    return "".join(page_texts)

# Parse the raw text response into a dictionary
def parse_response_to_dict(response_text):
    """Parse ``Key: value`` lines from a raw model response into a dict.

    Only lines containing ``": "`` are considered; the split happens at the
    first occurrence so values may themselves contain colons (e.g. URLs).

    Keys are cleaned of decoration that models commonly add -- list bullets
    (``-``, ``*``, ``•``), markdown emphasis and surrounding quotes -- so that
    ``- Asset Class``, ``**Asset Class**`` and ``"Asset Class"`` all map to
    ``Asset Class``. Values lose a trailing comma and one pair of matching
    surrounding quotes (the shape of a line lifted from malformed JSON).

    Args:
        response_text: Raw text returned by the model.

    Returns:
        Dict mapping cleaned keys to cleaned string values. Empty values and
        the JSON literal ``null`` are stored as ``None``.
    """
    data = {}
    for line in response_text.splitlines():
        if ": " not in line:
            continue
        key, value = line.split(": ", 1)

        # Drop bullets, markdown emphasis and quotes wrapped around the key.
        key = key.strip(" \t-*•\"'`")
        if not key:
            continue

        # Drop a JSON-style trailing comma, then one pair of matching quotes.
        value = value.strip().rstrip(",").strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()

        data[key] = None if value in ("", "null") else value
    return data

# Analyze the presentation file, extract details, and return results in JSON format
def _build_extraction_messages(text_content):
    """Build the chat messages asking the model to extract term-sheet fields."""
    return [
        {
            "role": "system",
            "content": "You are the reader of the presentation and analyzer."
        },
        {
            "role": "user",
            "content": f"""Extract only the following details from the presentation content related to a business term sheet:
                - Asset Class
                - Website
                - Primary Impact
                - UN Sustainable Development Goal
                - Current Fund
                - Important Dates
                - Target IRR
                - Term
                - Fund Domicile
                - Target Fund Size
                - Firm & Strategy Overview
                - Total Firm AUM
                - Strategy Differentiator (Provide information on: Team, deal sourcing, due diligence, underwriting experience, execution, servicing)
                - Key Financial Data (prior Track Record from other funds or prior fund)
                - Investment Team Size & Information
                - Contact Details

                Here is the presentation content. Return the result only in JSON format: {text_content}"""
        }
    ]


def _validate_term_sheet(raw_fields):
    """Validate raw key/value pairs with BusinessTermSheet and drop None fields."""
    structured_data = BusinessTermSheet(**raw_fields)
    # Pydantic V2 renamed .dict() to .model_dump(); use whichever is available
    # so the deprecated method is only touched on Pydantic V1.
    dump = getattr(structured_data, "model_dump", None) or structured_data.dict
    return {k: v for k, v in dump().items() if v is not None}


def analyze_presentation(file_path, file_type, model_name):
    """Extract business term-sheet details from a presentation via a Groq model.

    Args:
        file_path: Path of the file to analyze.
        file_type: ``"pptx"`` or ``"pdf"``; anything else is rejected.
        model_name: Name of the Groq chat model to query.

    Returns:
        On success, a dict of the non-None BusinessTermSheet fields.
        Otherwise a dict with an ``"Error"`` key:
          * unsupported file type or missing ``GROQ_API_KEY``;
          * ``{"Error", "Data"}`` when the response looked like JSON but could
            not be decoded and plain-text parsing was used instead;
          * ``{"Error", "Full Response"}`` for any other failure (file reading,
            API request, validation).
    """
    if file_type not in ("pptx", "pdf"):
        return {"Error": "Unsupported file type"}

    # The API key is a secret: read it from the environment (populated from
    # .env by load_dotenv) rather than embedding it in the notebook source.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return {"Error": "GROQ_API_KEY is not set; add it to the environment or a .env file"}

    full_response = ""
    try:
        # Extract text based on file type
        if file_type == "pptx":
            text_content = extract_text_from_pptx(file_path)
        else:
            text_content = extract_text_from_pdf(file_path)

        # Initialize Groq client and make API call. This happens inside the
        # try block so read, network and authentication failures are reported
        # through the same error dict as failures while consuming the stream.
        client = Groq(api_key=api_key)
        stream = client.chat.completions.create(
            messages=_build_extraction_messages(text_content),
            model=model_name,
            temperature=0.2,
            max_tokens=1000,
            top_p=0.5,
            frequency_penalty=0.1,
            presence_penalty=0.1,
            stop=None,
            stream=True,
        )

        # Accumulate the streamed completion into a single string.
        for chunk in stream:
            if chunk.choices and chunk.choices[0].delta:
                content = chunk.choices[0].delta.content
                if content is not None:
                    full_response += content

        # Take the outermost {...} span so surrounding prose or code fences
        # around the JSON object are ignored.
        match = re.search(r"\{.*\}", full_response.strip(), re.DOTALL)
        if match:
            raw_fields = json.loads(match.group(0))
        else:
            # Fallback to plain text parsing if JSON not found
            raw_fields = parse_response_to_dict(full_response)
        return _validate_term_sheet(raw_fields)
    except json.JSONDecodeError:
        # Handle JSON decode error with fallback parsing
        fallback_data = parse_response_to_dict(full_response)
        filtered_fallback_data = {k: v for k, v in fallback_data.items() if v is not None}
        return {"Error": "JSON data not found in response; used plain text parsing", "Data": filtered_fallback_data}
    except Exception as e:
        print(f"Error occurred while analyzing presentation: {e}")
        return {"Error": str(e), "Full Response": full_response}

# Compare outputs from two models based on similarity
def compare_outputs(output1, output2):
    """Return the percentage of fields on which two outputs agree.

    A field counts as matching when it is present in both outputs with equal
    values. The denominator is the size of the larger output; when both are
    empty the score is 0.
    """
    larger_size = max(len(output1), len(output2))
    if not larger_size > 0:
        return 0

    agreeing = 0
    for field in output1:
        if field in output2 and output1[field] == output2[field]:
            agreeing += 1
    return (agreeing / larger_size) * 100

# Process multiple presentations and compare outputs from two models
def process_presentations(files, model_name1, model_name2):
    """Analyze each uploaded file with two models and compare their outputs.

    Args:
        files: Iterable of uploaded files, or None/empty when nothing was
            uploaded. Each item is either an object exposing the file path
            as ``.name`` or a plain path string.
        model_name1: Name of the first model.
        model_name2: Name of the second model.

    Returns:
        Dict keyed by file path. Supported files map to the two model outputs
        plus their similarity score; unsupported files map to an error entry.
        An empty dict is returned when there are no files.
    """
    combined_results = {}
    # `files` can be None (e.g. the form was submitted without an upload);
    # treat that the same as an empty list instead of raising TypeError.
    for file in files or []:
        # Accept both file-like objects with .name and plain path strings.
        file_name = getattr(file, "name", file)
        file_type = file_name.split('.')[-1].lower()
        if file_type in ["pptx", "pdf"]:
            # Analyze the presentation using two different models
            file_results1 = analyze_presentation(file_name, file_type, model_name1)
            file_results2 = analyze_presentation(file_name, file_type, model_name2)

            # Compare the outputs of the two models
            similarity_score = compare_outputs(file_results1, file_results2)

            combined_results[file_name] = {
                "Model 1 Output": file_results1,
                "Model 2 Output": file_results2,
                "Similarity Score (%)": similarity_score
            }
        else:
            combined_results[file_name] = {"Error": "Unsupported file type. Please upload a .pptx or .pdf file"}

    return combined_results

# Create Gradio interface for uploading files and selecting models
# Models offered in both the primary and the secondary dropdown
_MODEL_CHOICES = (
    "llama3-groq-8b-8192-tool-use-preview",
    "llama-3.2-3b-preview",
    "llama-3.2-90b-vision-preview",
)

# Input components, in the order process_presentations expects its arguments
_files_input = gr.Files(label="Upload your .pptx or .pdf files")
_primary_model_input = gr.Dropdown(
    label="Select Primary Model",
    choices=list(_MODEL_CHOICES),
    value="llama-3.2-90b-vision-preview",
)
_secondary_model_input = gr.Dropdown(
    label="Select Secondary Model",
    choices=list(_MODEL_CHOICES),
    value="llama3-groq-8b-8192-tool-use-preview",
)

iface = gr.Interface(
    fn=process_presentations,
    inputs=[_files_input, _primary_model_input, _secondary_model_input],
    outputs="json",
    title="Business Term Sheet Analyzer with Model Comparison",
    description="Upload PowerPoint (.pptx) or PDF presentation files to extract business term sheet details and compare outputs between two models.",
)

# Launch Gradio interface
iface.launch()


<ipython-input-5-87382674b466>:35: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  @validator("important_dates", "investment_team_info", pre=True, always=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dea99ee215da3e865a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


