# Car report analysis with GPT4-Vision & Azure AI enhancements

GPT-4 Turbo with Vision provides **exclusive access to Azure AI Services tailored enhancements**. When combined with Azure AI Vision, it enhances your chat experience by providing the chat model with more detailed information about visible text in the image and the locations of objects.

- The **Optical Character Recognition (OCR) integration** allows the model to produce higher quality responses for dense text, transformed images, and number-heavy financial documents. It also covers a wider range of languages.

- The **object grounding integration** brings a new layer to data analysis and user interaction, as the feature can visually distinguish and highlight important elements in the images it processes.

https://learn.microsoft.com/en-us/azure/ai-services/openai/gpt-v-quickstart?tabs=enhanced&pivots=rest-api

<img src="screenshot.png">

In [2]:
import base64
import datetime
import glob
import gradio as gr
import json
import openai
import os
import requests
import sys

from io import BytesIO
from PIL import Image

In [None]:
def check_openai_version(installed_version=None):
    """
    Check that the OpenAI Python package is version 1.0.0 or later.

    Prints the version, followed by either an upgrade warning or a
    confirmation message. Nothing is returned.

    Args:
        installed_version: Version string to check. When omitted, the
            version of the imported ``openai`` package is used.
    """
    if installed_version is None:
        installed_version = openai.__version__

    # Only the major component decides whether the version is >= 1.0.0.
    # Parsing it as an integer does not depend on how many characters the
    # major and minor components happen to occupy in the string.
    try:
        major_version = int(str(installed_version).split(".")[0])
    except ValueError:
        print("Invalid OpenAI version format")
        return

    print(f"Installed OpenAI version: {installed_version}")

    if major_version < 1:
        print("[Warning] You should upgrade OpenAI to have version >= 1.0.0")
        print("To upgrade, run: %pip install openai --upgrade")
    else:
        print(f"[OK] OpenAI version {installed_version} is >= 1.0.0")

In [None]:
# Print the installed OpenAI package version and whether it should be upgraded
check_openai_version()

In [None]:
# Interpreter version string; as the only expression of the cell it is the cell output
sys.version

In [None]:
# Print the current local date and time
print(f"Today is {datetime.datetime.today().strftime('%d-%b-%Y %H:%M:%S')}")

In [None]:
# Same interpreter version string as above, this time printed with a label
print(f"Python version: {sys.version}")

## Azure AI services

In [None]:
# Print the version of the imported openai package
print(f"OpenAI version: {openai.__version__}")

In [None]:
# Azure Open AI
# The API keys are read from the environment so that no secret is stored in
# the notebook. A missing key raises KeyError here, rather than surfacing later
# as an authentication error from the service.
# NOTE: keys that were previously committed in this notebook should be
# considered leaked and rotated.
openai.api_type = "azure"
openai.api_key = os.environ["AZURE_OPENAI_API_KEY"]
openai.api_base = os.getenv(
    "AZURE_OPENAI_ENDPOINT", "https://aoimfuccilosw.openai.azure.com"
)

# Azure AI Vision (aka Azure Computer Vision)
azure_aivision_endpoint = os.getenv(
    "AZURE_AIVISION_ENDPOINT", "https://aoivisionmfuccilo.cognitiveservices.azure.com/"
)
azure_aivision_key = os.environ["AZURE_AIVISION_KEY"]

In [None]:
# Sent as "indexName" in the AzureComputerVision data source of each request
indexname = "car-reports-tests"

In [None]:
# Inserted into the deployment URL of the chat completions endpoint
model = "GPT4V"  # This is the deployed name of your GPT4 Vision model from the Azure Open AI studio

## Document

In [1]:
# Path of the car report image analysed in the rest of the notebook
image_file = "car_report.jpg"

# IPython shell escape: list the file with a human-readable size
!ls $image_file -lh

-rw-r--r-- 1 vscode vscode 2.3M Dec 20 13:18 car_report.jpg


In [None]:
img = Image.open(image_file)
# resize() returns a new image and leaves img unchanged; as the last
# expression of the cell, the 640x640 copy is what the notebook displays
img.resize((640, 640))

## Function

In [None]:
def GPT4V_with_AzureAIVision(image_file, prompt):
    """
    Analyse an image with GPT-4 Turbo with Vision and the Azure AI enhancements.

    The image is base64-encoded and posted, together with the prompt, to the
    chat completions "extensions" endpoint of the deployment named by the
    module-level ``model``. The OCR and grounding enhancements are enabled and
    the Azure AI Vision resource is passed as data source. On success the
    answer and the token usage are printed.

    Args:
        image_file: Path of the image file to analyse.
        prompt: Question or instruction sent along with the image.

    Returns:
        The decoded JSON response (a dict) when the service answers with HTTP
        status 200. ``None`` when the image file does not exist or when the
        service answers with any other status code; the error is printed.
    """
    # Stop here: without the file there is nothing to encode or send
    if not os.path.exists(image_file):
        print(f"[Error] Image file {image_file} does not exist.")
        return None

    # Endpoint
    base_url = f"{openai.api_base}/openai/deployments/{model}"
    gpt4vision_endpoint = (
        f"{base_url}/extensions/chat/completions?api-version=2023-12-01-preview"
    )

    # Header
    headers = {"Content-Type": "application/json", "api-key": openai.api_key}

    # Encoded image (the context manager closes the file once it has been read)
    with open(image_file, "rb") as image_handle:
        base_64_encoded_image = base64.b64encode(image_handle.read()).decode("ascii")

    # Context
    context = (
        "\nYou are an insurance AI expert. You will analyse a car report document. \n"
        "Always reply in English.\n"
    )

    # Payload
    json_data = {
        "model": "gpt-4-vision-preview",
        "enhancements": {"ocr": {"enabled": True}, "grounding": {"enabled": True}},
        "dataSources": [
            {
                "type": "AzureComputerVision",
                "endpoint": azure_aivision_endpoint,
                "key": azure_aivision_key,
                "indexName": indexname,
            }
        ],
        "messages": [
            {"role": "system", "content": context},
            {"role": "user", "content": [prompt, {"image": base_64_encoded_image}]},
        ],
        "max_tokens": 4000,
        "temperature": 0.7,
        "top_p": 1,
    }

    # Response
    response = requests.post(
        gpt4vision_endpoint, headers=headers, data=json.dumps(json_data)
    )

    # Testing the status code from the model response
    if response.status_code == 200:
        now = str(datetime.datetime.today().strftime("%d-%b-%Y %H:%M:%S"))
        print(f"Analysis of image: {image_file}")
        results = json.loads(response.text)
        print("\033[1;31;34m")
        print(results["choices"][0]["message"]["content"])

        usage = results["usage"]
        prompt_tokens = usage["prompt_tokens"]
        completion_tokens = usage["completion_tokens"]
        total_tokens = usage["total_tokens"]

        print("\n\033[1;31;32mDone:", now)
        print(
            f"Prompt tokens = {prompt_tokens} | Completion tokens = {completion_tokens} "
            f"| Total tokens = {total_tokens}"
        )
        print("\n[Note] These results are generated by an AI")
        print("\033[0m")

        return results

    # An error body is not guaranteed to be JSON; fall back to the raw text so
    # that reporting the error cannot itself raise
    try:
        error_body = json.loads(response.text)
    except ValueError:
        error_body = response.text

    if response.status_code == 429:
        print(
            "[429 Error] Too many requests. Please wait a couple of seconds and try again.\n"
        )
    else:
        print(f"[Error] Error code: {response.status_code}\n")
    print(error_body)

    return None


## Analysis

In [None]:
# Each cell in this section asks one question about image_file. The call is the
# last expression of its cell, so its return value is also the cell output.
prompt = "Classify this document into 'Driver licence', 'Passport', 'European Accident form', 'Others'"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "What is the language used in this document?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Generate a summary"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "What are the names, cars models of vehicles A and B?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Display some informations about the brand and model of the vehicle A"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "What are the damages for vehicles A and B?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Do we have injured people?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Do we have some witness?"

GPT4V_with_AzureAIVision(image_file, prompt)

### Let's analyse the drawings from the report

In [None]:
# Questions about the visual content of the same image_file: colors,
# handwriting, the numbered drawing sections and the signatures.
prompt = "What are the main colors of this document?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Do we have some handwritten text?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Explain the drawings from section number 10 for vehicles A and B"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "Explain the drawing from section number 13"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "What are the comments in section 14 for vehicles A and B?"

GPT4V_with_AzureAIVision(image_file, prompt)

In [None]:
prompt = "How many signatures do we have at the end of the document?"

GPT4V_with_AzureAIVision(image_file, prompt)

## Gradio webapp

In [None]:
def _split_report_sections(report_text):
    """Split the model answer into (summary, insights, drawings) strings."""
    names_marker = "2. Names:"
    drawings_marker = "10. Drawings #10:"

    # partition() never raises: when a marker is absent the matching section
    # is simply an empty string and the text stays in the preceding section
    before_drawings, found_drawings, after_drawings = report_text.partition(
        drawings_marker
    )
    summary, found_names, after_names = before_drawings.partition(names_marker)

    return (
        summary.replace("\n", ""),
        found_names + after_names,
        found_drawings + after_drawings,
    )


def car_report_webapp_fn(pil_image):
    """
    Function for the Gradio webapp

    Sends the image with a fixed twelve-field analysis prompt to the GPT-4
    Turbo with Vision deployment (OCR and grounding enhancements enabled) and
    splits the answer into three parts.

    Input: pil image (pil format), or None when no image was provided
    Output: tuple of three strings (summary, insights, drawings). The summary
    is the text before "2. Names:", the insights run from "2. Names:" up to
    "10. Drawings #10:", and the drawings start at "10. Drawings #10:". When no
    image is given or the service does not answer with status 200, the first
    string holds the error message and the other two are empty, so the three
    declared outputs always receive a value.
    """
    # Checked first so that no request is built when there is nothing to send
    if pil_image is None:
        return "[Error] Please provide a car report image.", "", ""

    # Endpoint
    base_url = f"{openai.api_base}/openai/deployments/{model}"
    gpt4vision_endpoint = (
        f"{base_url}/extensions/chat/completions?api-version=2023-12-01-preview"
    )

    # Header
    headers = {"Content-Type": "application/json", "api-key": openai.api_key}

    # Encoded image
    # JPEG cannot store an alpha channel or a palette, so such images are
    # converted before saving
    if pil_image.mode not in ("RGB", "L"):
        pil_image = pil_image.convert("RGB")
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    base_64_encoded_image = base64.b64encode(buffered.getvalue()).decode("ascii")

    # Context
    # NOTE(review): answers are requested in Italian while the section markers
    # searched in the answer are the English headings of the prompt below. If
    # the model translates the headings, the markers are not found and the
    # whole answer is returned as the summary - confirm this is intended.
    context = (
        "You are an insurance AI expert. You will analyse a car report document. "
        "Always reply in Italian.\n"
    )
    # Full prompt
    prompt = """
You respond with your analysis of the following fields:

1. Summary: Create a summary of this car report.
2. Names: What are the names of owners of vehicle A and B? \
Just answer like vehicle A = 'SMITH', Vehicle B = 'JOHNSON'
3. Vehicles: What is the brand and model of vehicle A and B? \
Just answer like vehicle A = 'AUDI', Vehicle B = 'MERCEDES'
4. Date and time: What is the date and time of the accident? \
Just answer like '01-jan-2023 22:00'
5. Address: What is the address of the accident? \
Just answer like '78 Avenue de Paris 75012 Paris'
6. Damage: Share some information about the damage.
Others damage: Display some information about material damage other than to vehicles A and B.
7. Injured people: Do we have injured people?
8. Section 14 comments: What are the comments in section 14?
9. Damage classification: Classify this damage as LIGHT DAMAGE, MEDIUM DAMAGE, SEVERE DAMAGE.
10. Drawings #10: Explain the drawings from section number 10 for vehicles A and B?
11. Drawing #13: Explain the drawing from section number 13?
12. Signatures: Do we have two signatures at the end of this document? \
Just answer like "Two signatures detected", "One signature detected", "No signature detected"
"""
    # Payload
    json_data = {
        "model": "gpt-4-vision-preview",
        "enhancements": {"ocr": {"enabled": True}, "grounding": {"enabled": True}},
        "dataSources": [
            {
                "type": "AzureComputerVision",
                "endpoint": azure_aivision_endpoint,
                "key": azure_aivision_key,
                "indexName": indexname,
            }
        ],
        "messages": [
            {"role": "system", "content": context},
            {"role": "user", "content": [prompt, {"image": base_64_encoded_image}]},
        ],
        "max_tokens": 4000,
        "temperature": 0.7,
        "top_p": 1,
    }

    # Results
    response = requests.post(
        gpt4vision_endpoint, headers=headers, data=json.dumps(json_data)
    )

    # Testing status code
    if response.status_code == 200:
        results = json.loads(response.text)
        print(results)

        return _split_report_sections(results["choices"][0]["message"]["content"])

    # An error body is not guaranteed to be JSON; fall back to the raw text
    try:
        error_body = json.loads(response.text)
    except ValueError:
        error_body = response.text

    if response.status_code == 429:
        error_message = (
            "[429 Error] Too many requests. Please wait a couple of seconds and try again."
        )
    else:
        error_message = f"[Error] Error code: {response.status_code}"

    print(error_message)
    print(error_body)

    # The error is also returned so that it shows up in the web page
    return error_message, "", ""

In [None]:
# Page header: a centered icon used as description, plus the title
image_url = "https://cdn4.iconfinder.com/data/icons/lined-car-accident/48/a-03-1024.png"
logo = f"<center> <img src= {image_url} width=70px></center>"
title = "Your car report Copilot - Azure Open AI GPT4 Turbo Vision with Azure AI enhancements"

# The uploaded document reaches the callback as a PIL image
inputs = gr.Image(label="Your car report document", type="pil")

# One text box per string returned by car_report_webapp_fn
outputs = [
    gr.Text(label=box_label)
    for box_label in (
        "Car report summary",
        "Car report insights",
        "Car report drawings analysis",
    )
]

# glob returns an empty list when the sample file is not present
example = glob.glob("car_report.jpg")
theme = "gradio/soft"  # https://huggingface.co/spaces/gradio/theme-gallery

car_report_webapp = gr.Interface(
    title=title,
    description=logo,
    fn=car_report_webapp_fn,
    inputs=inputs,
    outputs=outputs,
    examples=example,
    theme=theme,
)

# share=True asks Gradio for a public link in addition to the local one
car_report_webapp.launch(share=True)