In [None]:
#验证SDK token
from modelscope.hub.api import HubApi
api = HubApi()
api.login('f774872e-c878-4653-842e-d9d7e3982e47')

#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('OpenGVLab/InternVL2-Llama3-76B', cache_dir='/hy-tmp')

In [None]:
# 多卡时：
# pip install transformers==4.37.2

In [1]:
import numpy as np
import os
import torch
import torchvision.transforms as T
# from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import math
import json

weird_str=["```json\n", "\n```", '\n      ', '\n    ', '\n  ', '\n']
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
        'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.7))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.3)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

In [2]:
class chat_internvl(torch.nn.Module):
    def __init__(self, model_path, img_prefix, semantic_prefix, json_prefix, output_json_prefix):
        super().__init__()
        self.img_prefix = img_prefix
        self.semantic_prefix = semantic_prefix
        self.json_prefix = json_prefix
        self.output_json_prefix = output_json_prefix
        split_model_path=model_path.split('/')[-1]
        device_map = split_model(split_model_path)
        self.model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True,
        device_map=device_map).eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

    def forward(self, img):
        img_path = self.img_prefix+str(img)
        json_path = self.json_prefix+img.split('.')[0]+'_info.json'
        with open(json_path) as f:
            s=json.load(f)
        
        pixel_values = load_image(img_path, max_num=12).to(torch.bfloat16).cuda()
        generation_config = dict(max_new_tokens=1024, do_sample=False)

        '''
        对输入的bbox处理能力弱，
        对semantic label错误的情况无法改正（很多seg结果的本来也很奇怪），
        distortion description无法incorporate具体的semantic
        '''

        prompt_gen = []
        prompt_gen.append('There are {} main visual elements in this image'.format(len(s['annotations'])))
        for idx, i in enumerate(s['annotations']):
            dis_list = []
            for x, y in zip(i['distortion_type'], i['distortion_level']):
                sstr = x+'(level '+ str(y+1)+')'
                dis_list.append(sstr)
            output_dis = (', ').join(dis_list)
            sstr1 = 'element {}: bounding box: [{},{}],[{},{}]; semantic label reference: {}; distortion type and level: {}' \
        .format(str(idx+1), i['bbox'][0],i['bbox'][1],i['bbox'][0]+i['bbox'][2],i['bbox'][1]+i['bbox'][3],i['class_name'],output_dis)
        #     sstr1 = 'element {}: bounding box: [{},{}],[{},{}]; distortion type and level: {}' \
        # .format(str(idx+1), i['bbox'][0],i['bbox'][1],i['bbox'][2],i['bbox'][3],output_dis)
            prompt_gen.append(sstr1)
        question_im = ('. ').join(prompt_gen)
        
#         basic_prompt = 'provide the most accurate semantic label for each visual element in each bounding box. The semantic label should be specific, for example, instead of output the label as "pen", you might specify it to "crayon". \
# Make sure the bounding box corresponds to the right visual element, especially the bounding boxes have big overlaps. \
# These are the rules you have to follow: \
# 1. The coordinate origin (0,0) of bounding box is at the top-left corner of the image. The given coordinates is the top-left and bottom-right corners, respectively. \
# 2. Each bounding box should only contain one intact visual elements, it can either be a foreground object(e.g., cat, bird) or just the background(e.g., sky, floor). \
# 3. Either an object or the background is entirely within the boundaries of the bounding box, and avoid recognizing targets that just partially within the boundaries in the scenes with dense objects. \
# 4. the provided semantic label reference are probably wrong. You have to output accurate semantic labels by your own. \
# 5. If the semantic labels in your output are identical, please add distinguishing details to differentiate them. For example, instead of labeling both as "bird," you might specify "blue bird" for one of them. \
# 6. The distortion levels are categorized into three tiers: Level 1 is the lowest, while Level 3 is the highest. '

# 3. The target visual element is entirely within the boundaries of the bounding box, and avoid recognizing wrong targets that just partially within the boundaries in the scenes with dense objects. \
         # If the bounding box contains multiple visual elements, identify the taget visual element whose entire area is contained within the box. 
# avoid identifying wrong targets whose edges are beyond the boundaries especially when the bounding box contains multiple visual elements. \

        basic_prompt = 'provide the most accurate semantic label for the target visual element in each bounding box. The semantic label should be specific, for example, instead of output the label as "pen", you might specify it to "crayon". \
These are the rules you have to follow: \
1. The coordinate origin (0,0) of bounding box is at the top-left corner of the image. The given coordinates of the bounding box is the top-left and bottom-right corners, respectively. \
2. There is only one target visual elements in each bounding box, it can either be a foreground object(e.g., cat, bird) or just the background(e.g., sky, floor). \
3. If the bounding box contains multiple objects, identify the taget visual element whose entire area is contained within the box.  \
4. The provided semantic label references are probably wrong. You have to output accurate semantic labels by your own. \
5. If the semantic labels in your output are identical, please add distinguishing details to differentiate them. For example, instead of labeling both as "bird," you might specify "blue bird" for one of them. \
6. The distortion levels are categorized into three tiers: Level 1 is the lowest, while Level 3 is the highest. '

        
        # 此次不需要收集answer，不给format
        # question = basic_prompt + question_im
        question = "<image>\n" + question_im + basic_prompt
        
        response, history = self.model.chat(self.tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
        print(f'User: {question}\nAssistant: {response}')

        # 只留风景照，植物的也不要，其他的更不要了

        question_base = 'You have generate descriptions for the image. Take use of the accurate semantic lable you have generated in last question. \
Firstly, give a mixed description of 3 acpects for each visual element in each bounding box. \
The mixed description is compriseed of 3 acpects: 1.Basic Information: the type, color, and any notable features of the target; 2.Position and Orientation of the target; \
3.the visual effects of each distortion, if there are more than one distortions, state the the visual effects of each distortion one by one. \
The description should not contain the number of level, bounding box, use vivid words to replace them. '

        question_format = 'The output must be a raw json format, I will give you an example and do not imitate the sentence structure in the example, make it diverse. \
If the Basic Information is: "There is a building with a white exterior. It has a window and a door, both of which are made of glass and wood, respectively.", \
and Position and Orientation information is: "The building occupies the left side of the image, with the window and door positioned centrally.", \
and the Distortion Effects: 1.Gaussian Blur(Level 1): "The edges of the building appear slightly blurred, reducing the sharpness of the structure.", 2.Mean Shift(Level 2): "The colors of the building are slightly shifted, especially the roof, making the white exterior appear less bright and more muted."}, \
Then, the output should be like: \
{"element description": {"element 1": {"semantic label": "building", "bounding box": [[12, 34], [56, 78]], \
"mixed description": "A building with a white exterior is positioned on the left side, featuring a glass window and a wooden door centrally placed. The edges are slightly blurred due to a Gaussian effect, giving a soft, hazy look. Additionally, a mean shift effect has muted the colors, \
making the white exterior and roof appear less bright and more subdued."}}}'
        
        question = question_base + question_format
        response, history = self.model.chat(self.tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
        print(f'User: {question}\nAssistant: {response}')
        caption1_json_path = os.path.join(self.output_json_prefix, 'caption1', img.split('.')[0]+'.json')
        for i in range(len(weird_str)):
            response=response.replace(weird_str[i],'')
        with open(caption1_json_path, 'w') as f:
            json.dump(json.loads(response), f, indent=4)
        # with open(caption1_json_path, 'w') as f:
        #     json.dump(response,f)

        question_base = 'refer to the mixed description for each visual element the and give a global description about this whole image, you should mention every element and the whole image structure, \
especially for the impact of the distortions and quality evaluation. \
The description should not contain the number of level, bounding box, use vivid words to replace them. '

        question_format = 'The output must be a raw json format, this is a format example and do not imitate the sentence structure in the example, make it diverse. \
{"global description": "The image showcases a scene with a building dominating the background, with three distinct elements in the foreground: a fire hydrant, a wooden fence, and a plant. The building, occupying most of the image, is rendered with a soft blur and shifted colors, \
giving it a slightly hazy and surreal appearance. This creates a backdrop that feels out of focus and less defined. In the foreground, the fire hydrant stands out with its colors subtly diffused, making it less vibrant and somewhat muted. Nearby, the wooden fence appears smeared due to motion blur, \
with its colors slightly intensified and softened by additional blurring. This combination results in a fence that lacks clear definition and sharpness. Finally, the plant is depicted under dim lighting, making it appear darker and less prominent. Its colors are intensely vivid, \
but the details are compromised due to a resizing effect that has softened its edges and textures."}'
        
        question = question_base + question_format
        response, history = self.model.chat(self.tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
        print(f'User: {question}\nAssistant: {response}')
        caption2_json_path = os.path.join(self.output_json_prefix, 'caption2', img.split('.')[0]+'.json')
        for i in range(len(weird_str)):
            response=response.replace(weird_str[i],'')
        with open(caption2_json_path, 'w') as f:
            json.dump(json.loads(response), f, indent=4)
        # with open(caption2_json_path, 'w') as f:
        #     json.dump(response,f)

        question = 'Give a description about the spatial relations of each visual element of each bounding box in this image.'.format(str(len(s['annotations'])))
        question_format = 'The answer must be a json format. This is an example: {"spatial relations of all elements": {"element 1": {"bounding box": [[12, 34], [56, 78]], "spatial relations": "The cabinet is positioned in the background, behind the keyboard and synthesizer. It is placed against the wall and is partially visible due to the angle of the image."}'
        response, history = self.model.chat(self.tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
        print(f'User: {question}\nAssistant: {response}')
        spatial_json_path = os.path.join(self.output_json_prefix, 'spatial', img.split('.')[0]+'.json')
        for i in range(len(weird_str)):
            response=response.replace(weird_str[i],'')
        with open(spatial_json_path, 'w') as f:
            json.dump(json.loads(response), f, indent=4)
        # with open(spatial_json_path, 'w') as f:
        #     json.dump(response,f)
        
        question_base1 = 'use the spatial relations of {} main visual element in the previous question to generate {} referring questions for {} objects. \
The question must include spatial relations of the visual elements. \
The answer must be the distortion of the visual elements. Modify the level to diverse adjectives. For example, modify "jpeg compression(level 1)" to "moderate jpeg compression".'\
        .format(str(len(s['annotations'])),str(len(s['annotations'])),str(len(s['annotations'])))
        question_format1 = 'The output must be a json format, follow this example: {"referring": {"element 1": {"question": "What is the distortion of the book in the lower-right corner?", "answer": "Minor jpeg compression, severe motion blur."}}}'
        question = question_base1 + question_format1
        response, history = self.model.chat(self.tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
        print(f'User: {question}\nAssistant: {response}')
        referring_json_path = os.path.join(self.output_json_prefix, 'referring', img.split('.')[0]+'.json')
        for i in range(len(weird_str)):
            response=response.replace(weird_str[i],'')
        with open(referring_json_path, 'w') as f:
            json.dump(json.loads(response), f, indent=4)
        # with open(referring_json_path, 'w') as f:
        #     json.dump(response,f)

In [3]:
# ~/.cache/modelscope/hub
chat=chat_internvl(model_path='/root/autodl-tmp/pretrained/OpenGVLab/InternVL2-Llama3-76B', 
                   img_prefix='/root/autodl-tmp/example/kadis_output/', 
                   semantic_prefix='/root/autodl-tmp/example/semantic/', 
                   json_prefix='/root/autodl-tmp/example/json/', 
                   output_json_prefix='/root/autodl-tmp/example/chat/'
                   )

FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/32 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
chat('6-1238605.png')

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: <image>
There are 7 main visual elements in this image. element 1: bounding box: [0,0],[511,383]; semantic label reference: a white surface; distortion type and level: JPEG_compression(level 2), lens blur(level 3), motion blur(level 2). element 2: bounding box: [272,111],[469,312]; semantic label reference: brightly colored pens; distortion type and level: pixelate(level 1), color shift(level 2), contrast change(level 2). element 3: bounding box: [220,106],[423,298]; semantic label reference: brightly colored pens; distortion type and level: color saturate(level 2). element 4: bounding box: [168,100],[379,283]; semantic label reference: brightly colored pens; distortion type and level: color diffuse(level 2). element 5: bounding box: [120,93],[337,268]; semantic label reference: different colors; distortion type and level: jitter(level 2), white noise(level 1). element 6: bounding box: [73,87],[298,253]; semantic label reference: different colors; distortion type and level: lens 

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: You have generate descriptions for the image. Take use of the accurate semantic lable you have generated in last question. Firstly, give a mixed description of 3 acpects for each visual element in each bounding box. The mixed description is compriseed of 3 acpects: 1.Basic Information: the type, color, and any notable features of the target; 2.Position and Orientation of the target; 3.the visual effects of each distortion, if there are more than one distortions, state the the visual effects of each distortion one by one. The description should not contain the number of level, bounding box, use vivid words to replace them. The output must be a raw json format, I will give you an example and do not imitate the sentence structure in the example, make it diverse. If the Basic Information is: "There is a building with a white exterior. It has a window and a door, both of which are made of glass and wood, respectively.", and Position and Orientation information is: "The building occupi

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.46 GiB. GPU 0 has a total capacty of 47.50 GiB of which 3.26 GiB is free. Process 453262 has 44.24 GiB memory in use. Of the allocated memory 34.43 GiB is allocated by PyTorch, and 9.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [6]:
import torch
torch.cuda.empty_cache()  # 释放显存

In [5]:
chat('121-cc-774921.png')

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: <image>
There are 6 main visual elements in this image. element 1: bounding box: [0,37],[433,380]; semantic label reference: a keyboard; distortion type and level: gaussian blur(level 1), over exposure(level 1). element 2: bounding box: [210,154],[507,382]; semantic label reference: a synthesizer; distortion type and level: sharpen(level 2). element 3: bounding box: [82,232],[511,382]; semantic label reference: table; distortion type and level: bicubic interpolation resize(level 1), color block(level 2), bilinear interpolation resize(level 2). element 4: bounding box: [417,17],[511,221]; semantic label reference: cabinet; distortion type and level: JPEG_compression(level 2). element 5: bounding box: [0,35],[252,149]; semantic label reference: table; distortion type and level: gaussian blur(level 3). element 6: bounding box: [275,0],[511,76]; semantic label reference: table; distortion type and level: over exposure(level 2), bicubic interpolation resize(level 1), lanczos interpola

In [4]:
chat('abandoned-1596853.png')

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: <image>
Provide a semantic label for each visual element in each bounding box in this image. These are some rules you have to follow: 1. Each bounding box only contains one visual element, it can either be a foreground object(e.g., cat, bird) or just the background(e.g., sky, floor). 2. Either an object or the background is entirely within the boundaries of the bounding box, and avoid recognizing wrong targets that just partially extend beyond these boundaries. 3. Use the provided semantic label hints for reference, but note that they may not always be accurate. 4. If the semantic labels in your output are identical, please add distinguishing details to differentiate them. For example, instead of labeling both as "bird," you might specify "blue bird" for one of them. 5. The distortion levels are categorized into three tiers: Level 1 is the lowest, while Level 3 is the highest. There are some infomation about the input image. There are 4 main visual elements in this image. element

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: You have generate descriptions for the image. Take use of the semantic lable you have generated in last question. Firstly, give a mixed description of 3 acpects for each visual element in each bounding box. The mixed description is compriseed of 3 acpects: 1.Basic Information: the type, color, and any notable features of the target; 2.Position and Orientation of the target; 3.the visual effects of each distortion, if there are more than one distortions, state the the visual effects of each distortion one by one. The description should not contain the number of level, bounding box, use vivid words to replace them. The output must be a raw json format, I will give you an example and do not imitate the sentence structure in the example, make it diverse. If the Basic Information is: "There is a building with a white exterior. It has a window and a door, both of which are made of glass and wood, respectively.", and Position and Orientation information is: "The building occupies the le

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: refer to the mixed description for each visual element the and give a global description about this whole image, you should mention every element and the whole image structure, especially for the impact of the distortions and quality evaluation. The description should not contain the number of level, bounding box, use vivid words to replace them. The output must be a raw json format, this is a format example and do not imitate the sentence structure in the example, make it diverse. {"global description": "The image showcases a scene with a building dominating the background, with three distinct elements in the foreground: a fire hydrant, a wooden fence, and a plant. The building, occupying most of the image, is rendered with a soft blur and shifted colors, giving it a slightly hazy and surreal appearance. This creates a backdrop that feels out of focus and less defined. In the foreground, the fire hydrant stands out with its colors subtly diffused, making it less vibrant and some

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: Give a description about the spatial relations of each visual element of each bounding box in this image.
Assistant: ```json
{
  "spatial relations": {
    "element 1": {
      "semantic label": "building",
      "spatial description": "The building occupies the left side of the image, with its window and door positioned centrally. It serves as the background element, providing a backdrop for the other visual elements."
    },
    "element 2": {
      "semantic label": "fire hydrant",
      "spatial description": "The fire hydrant is located near the center of the image, slightly to the left. It is positioned in front of the building and to the left of the wooden fence."
    },
    "element 3": {
      "semantic label": "wooden fence",
      "spatial description": "The wooden fence is situated towards the right side of the image. It is positioned in front of the building and to the right of the fire hydrant."
    },
    "element 4": {
      "semantic label": "plant",
      "spati

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: use the spatial relations of 4 main visual element in the previous question to generate 4 referring questions for 4 objects. The question must include spatial relations of the visual elements. The answer must be the distortion of the visual elements. Modify the level to diverse adjectives. For example, modify "jpeg compression(level 1)" to "moderate jpeg compression".The output must be a json format, follow this example: {"referring": {"element 1": {"question": "What is the distortion of the book in the lower-right corner?", "answer": "Minor jpeg compression."}}}
Assistant: ```json
{
  "referring": {
    "element 1": {
      "question": "What is the distortion of the building on the left side of the image?",
      "answer": "Moderate Gaussian blur and slight mean shift."
    },
    "element 2": {
      "question": "What is the distortion of the fire hydrant near the center of the image?",
      "answer": "Mild color diffuse effect."
    },
    "element 3": {
      "question": "Wh

In [4]:
chat('753-66364.png')

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: <image>
Provide a semantic label for each visual element in each bounding box in this image. These are some rules you have to follow: 1. Each bounding box only contains one visual element, it can either be a foreground object(e.g., cat, bird) or just the background(e.g., sky, floor). 2. Either an object or the background is entirely within the boundaries of the bounding box, and avoid recognizing wrong targets that just partially extend beyond these boundaries. 3. Use the provided semantic label hints for reference, but note that they may not always be accurate. 4. If the semantic labels in your output are identical, please add distinguishing details to differentiate them. For example, instead of labeling both as "bird," you might specify "blue bird" for one of them. 5. The distortion levels are categorized into three tiers: Level 1 is the lowest, while Level 3 is the highest. There are some infomation about the input image. There are 5 main visual elements in this image. element

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: You have generate descriptions for the image. Take use of the semantic lable you have generated in last question. Firstly, give a mixed description of 3 acpects for each visual element in each bounding box. The mixed description is compriseed of 3 acpects: 1.Basic Information: the type, color, and any notable features of the target; 2.Position and Orientation of the target; 3.the visual effects of each distortion, if there are more than one distortions, state the the visual effects of each distortion one by one. The description should not contain the number of level, bounding box, use vivid words to replace them. The output must be a raw json format, I will give you an example and do not imitate the sentence structure in the example, make it diverse. If the Basic Information is: "There is a building with a white exterior. It has a window and a door, both of which are made of glass and wood, respectively.", and Position and Orientation information is: "The building occupies the le

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: refer to the mixed description for each visual element the and give a global description about this whole image, you should mention every element and the whole image structure, especially for the impact of the distortions and quality evaluation. The description should not contain the number of level, bounding box, use vivid words to replace them. The output must be a raw json format, this is a format example and do not imitate the sentence structure in the example, make it diverse. {"global description": "The image showcases a scene with a building dominating the background, with three distinct elements in the foreground: a fire hydrant, a wooden fence, and a plant. The building, occupying most of the image, is rendered with a soft blur and shifted colors, giving it a slightly hazy and surreal appearance. This creates a backdrop that feels out of focus and less defined. In the foreground, the fire hydrant stands out with its colors subtly diffused, making it less vibrant and some

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: Give a description about the spatial relations of each visual element of each bounding box in this image.
Assistant: ```json
{
  "spatial relations": {
    "element 1": {
      "semantic label": "kimono",
      "spatial relation": "The person wearing the kimono is positioned centrally in the image, slightly towards the left side. This individual is standing in front of the red torii gate, with the gate's structure visible in the background."
    },
    "element 2": {
      "semantic label": "kimono",
      "spatial relation": "The other person wearing the kimono is also positioned centrally, slightly towards the right side. This individual is standing in front of the red torii gate, with the gate's structure visible in the background, similar to the first person."
    },
    "element 3": {
      "semantic label": "front view of a person",
      "spatial relation": "The front view of the person wearing the kimono is prominently displayed in the center of the image. This person is 

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


User: use the spatial relations of 5 main visual element in the previous question to generate 5 referring questions for 5 objects. The question must include spatial relations of the visual elements. The answer must be the distortion of the visual elements. Modify the level to diverse adjectives. For example, modify "jpeg compression(level 1)" to "moderate jpeg compression".The output must be a json format, follow this example: {"referring": {"element 1": {"question": "What is the distortion of the book in the lower-right corner?", "answer": "Minor jpeg compression, severe motion blur."}}}
Assistant: ```json
{
  "referring": {
    "element 1": {
      "question": "What is the distortion of the kimono worn by the person positioned slightly to the left in the image?",
      "answer": "Moderate color saturation."
    },
    "element 2": {
      "question": "What is the distortion of the kimono worn by the person positioned slightly to the right in the image?",
      "answer": "Mild pixelat

In [None]:
chat('3d-anaglyph-738967.png')

Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.


In [15]:
major, minor = torch.cuda.get_device_capability(0)
# Check if the GPU architecture is Ampere (SM 8.x) or newer (SM 9.0)
is_sm8x = major == 8 and minor >= 0
is_sm90 = major == 9 and minor == 0
print('Use flash attn:', is_sm90 or is_sm90)

Use flash attn: False


In [None]:
all=[]
for i in os.listdir('/root/autodl-tmp/example/try/'):
    if i.endswith('png'):
        a = chat(i)
    all.append(a)
    
with open('/root/autodl-tmp/example/chat/chat.json','w') as f:
    json.dump(all,f)

In [None]:
# 第一种：自建数据集，纯mllm标注
'I want you to act as an image data annotator especially for quality assessment. You will be responsible for understanding the content and distortions of the image referring to the region divisions of the image. Ignore the region that does not contain a identifier.
Input image has {} segment region divisions, each outlined with white boundaries and labeled with a unique rectangular identifier which has a
number for the region. The number starts from 1, and is carefully marked within the corresponding region, be careful a bout this. 

region {} is about {}, it has a distortion of {};

Describe each region with its corresponding number.

In [None]:
# 第二种：自然失真数据集，纯mllm标注


In [None]:
# 第三种：自然失真数据集，human + mllm标注
