# 展示Gemini在视觉方面的能力

In [2]:
import env
import utils
# Generation settings: output-token cap, sampling temperature, and top-p / top-k limits.
config = dict(
    max_output_tokens=2048,
    temperature=0.4,
    top_p=1,
    top_k=32,
)

In [3]:
import google.generativeai as genai

# 'gemini-pro-vision' was used here previously but has been deprecated;
# 'gemini-1.5-flash' is the model the image cells below talk to.
# The generation settings in `config` (setup cell) are passed in so they
# actually take effect instead of sitting unused.
llm = genai.GenerativeModel('gemini-1.5-flash', generation_config=config)

In [3]:
# Case 1: describe an image.
import PIL.Image

image_path = './images/damao.jpg'
img = PIL.Image.open(image_path)

# The image is the whole prompt; no text accompanies it.
response = llm.generate_content(img)

# Last expression of the cell: the reply text passed through utils.to_markdown.
utils.to_markdown(response.text)

> This is a picture of a cat wearing a colorful toy collar. The cat is a tabby with white stripes and is sitting next to a human hand. The background is a pink wall. The cat is looking away from the camera. The image appears to be taken indoors.

In [4]:
# Case 1 (another sample): describe an image of a bridge.
import PIL.Image

image_path = './images/bridge.jpeg'
img = PIL.Image.open(image_path)

# Image-only request, no text prompt.
response = llm.generate_content(img)

utils.to_markdown(response.text)

> This is the  **Pingba Bridge** over the **Wujiang River** in **Guizhou**, **China**.

In [9]:
# Same image-only request for a third local picture.
# Relies on PIL.Image having been imported by an earlier cell.
image_path = './images/jls.JPG'
img = PIL.Image.open(image_path)

response = llm.generate_content(img)

utils.to_markdown(response.text)

> This is a picture of a bridge with a large archway. The bridge is made of concrete and has a metal railing on either side. The archway is very tall and has a large, open space in the middle. The bridge is surrounded by trees and hills. The sky is clear and blue. The image is taken from the side of the bridge and looks down towards the archway. The picture is in focus and well-lit.

In [4]:
# Case 1 (batch): describe several local images, one request per image.
import PIL.Image

images = ["./images/xinhai.JPG", "./images/home.JPG", "./images/home2.JPG", "./images/jls.JPG"]

# One utils.to_markdown(...) result per entry of `images`, in the same order.
outputs = []
for image_path in images:
    # These are local file paths, not URLs. Opening inside a `with` block
    # releases each file handle as soon as its request has returned,
    # rather than leaving it to garbage collection.
    with PIL.Image.open(image_path) as img:
        response = llm.generate_content(img)
    output = utils.to_markdown(response.text)
    outputs.append(output)

# Displays the list itself; index into it (e.g. outputs[0]) to render one reply.
outputs

[<IPython.core.display.Markdown object>,
 <IPython.core.display.Markdown object>,
 <IPython.core.display.Markdown object>,
 <IPython.core.display.Markdown object>]

In [5]:
# Show the first stored result on its own: as the cell's last expression,
# the object is rendered by the notebook rather than shown as a list-item repr.
outputs[0]

> This is a photo of a group of people walking by a bridge. They are all wearing winter clothes and some have their faces covered. The bridge is grey and has a lot of cables. In the background, the sky is blue and there are several seagulls flying. The photo is taken from a low angle. The shadows of the people and the bridge are long and thin. The photo has a sense of movement and energy.

In [12]:
# Use the LangChain wrapper around Gemini.
from langchain_google_genai import ChatGoogleGenerativeAI

# Previously tried (now commented out): model="gemini-pro-vision".
langchain_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [7]:
# LangChain chat model.
# NOTE(review): this cell repeats the import and constructor call of the
# preceding cell; one of the two looks redundant.
from langchain_google_genai import ChatGoogleGenerativeAI

# Previously tried (now commented out): llm = ChatGoogleGenerativeAI(model="gemini-pro").
langchain_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [8]:
# Case 2: image recognition, single example.
from langchain_core.messages import HumanMessage

# The message content is a list of parts: one text question and one image reference.
question_part = {
    "type": "text",
    "text": "What's in this image & where is this image?",
}
picture_part = {"type": "image_url", "image_url": "https://picsum.photos/seed/picsum/200/300"}

message = HumanMessage(content=[question_part, picture_part])

# Last expression of the cell: the model's reply object.
langchain_model.invoke([message])

AIMessage(content='The image shows a snow-capped mountain peak with a pink and purple sunset sky. It is likely taken in the Alps, as the mountain shape is reminiscent of the Matterhorn.')

In [9]:
# Case 2: image recognition over several remote images.
from langchain_core.messages import HumanMessage


def _where_is_this(image_url):
    """Build a HumanMessage asking where the picture at `image_url` is."""
    return HumanMessage(
        content=[
            {"type": "text", "text": "Where is this image?"},
            {"type": "image_url", "image_url": image_url},
        ]
    )


urls = [
    "https://live.staticflickr.com/4127/5089855498_8aafa4a66e.jpg",  # Olympic sports center
    "https://live.staticflickr.com/4111/5089274647_465b572424.jpg",  # Zhongshan Square, Dalian
    "https://live.staticflickr.com/4124/5088895803_c58698f557_w.jpg",  # Summer Palace
    "https://live.staticflickr.com/4037/5088900151_97d6c76e96_w.jpg",  # Lijiashan Village, Shanxi
    "http://www.people.com.cn/NMediaFile/2023/1217/MAIN1702781405879FFZQBX0NHJ.jpg",  # Beihai Park
]

# One reply per URL, in the same order as `urls`.
outputs = []
for url in urls:
    message = _where_is_this(url)
    output = langchain_model.invoke([message])
    outputs.append(output)

outputs

[AIMessage(content="This image appears to be taken inside the Bird's Nest stadium in Beijing, China. This is based on the shape of the stadium, the red seats, and the overall architecture. \n"),
 AIMessage(content='This image appears to be taken in a park or public square. There are many pigeons in the image, and it looks like there may be a public building in the background. However, there are no identifying features visible in this image that would allow me to determine the exact location.'),
 AIMessage(content='I do not see any image in your message. Please provide an image. \n'),
 AIMessage(content='This image appears to be from the Shanxi province of China. It depicts an ancient walled city, known as a "yao" in Chinese. These structures are often built on hillsides and have served as defensive strongholds throughout Chinese history. \n\nHowever, to pinpoint the exact location, I\'d need more information about the specific architecture or historical context of the image.  \n'),
 AI