# 1.ViT图像语言推理

In [1]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
from pathlib import Path
import os
from glob import glob
from tqdm import tqdm
import time
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [58]:
# Define path to input directory containing images
input_dir = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie"

all_data = []

# Move model to CUDA device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict(input_dir):
  # images = []
  for image_path in tqdm(Path(input_dir).glob("*.png")):

    image = Image.open(image_path)
    # Pre-process the image for the DETR model
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)


    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

  return preds
  # all_data.append(preds)

In [59]:
import pandas as pd

# Define path to output CSV file
output_file = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/predictions.csv"

# Save data to CSV file
df = pd.DataFrame.from_records(predict(input_dir), columns=["filename"])
df.set_index("filename", inplace=True)
df.to_csv(output_file)

15775it [1:17:10,  3.41it/s]


ValueError: 1 columns passed, passed data had 41 columns

In [50]:
import json
# 将结果写入CSV文件
# 将预测结果写入CSV文件
with open('E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/predictions.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # 写入表头
    writer.writerow(['Predictions'])
    
    preds = predict(input_dir)
    # 逐行写入数据
    grouped_preds = [preds[i:i+4] for i in range(0, len(preds), 4)]
    for group_preds in grouped_preds:
        predictions_str = ', '.join([json.dumps(pred) for pred in group_preds])
        writer.writerow([predictions_str])

4it [00:01,  3.15it/s]


In [9]:
def predict_step(image_paths):
  # images = []
  for image_path in image_paths:

    image = Image.open(image_path)
    # Pre-process the image for the DETR model
    inputs = feature_extractor(images=image, return_tensors="pt")
    inputs.to(device)
    # Use the DETR model to predict object bounding boxes and labels
    # outputs = model(**inputs)

    # i_image = Image.open(image_path)
    # if i_image.mode != "RGB":
    #   i_image = i_image.convert(mode="RGB")

    # images.append(i_image)

    # 更新读取图片进度
    total_bar.update(1);
    #total_bar.write(f"{total_bar.n}/{total_bar.total} images processed") 
    
  # pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  # pixel_values = pixel_values.to(device)

  output_ids = model.generate(inputs, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]

  return preds

In [4]:
# 执行预测
image_list = [str(p) for p in glob("E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie/*.png")]
# 设置总体进度条
total_bar = tqdm(total=len(image_list), desc='Predicting images', position=0)
preds = predict_step(image_list)
print(preds)

Predicting images: 100%|█████████▉| 15772/15775 [11:29<00:00, 25.82it/s]

AttributeError: 

Predicting images: 100%|██████████| 15775/15775 [11:40<00:00, 25.82it/s]

# 2.语义分割

In [None]:
%%time
import torch
import os
from pathlib import Path
import pandas as pd
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from PIL import Image
from tqdm import tqdm

# Define the feature extractor and segmentation model
feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").cuda()

# Define path to input directory containing images
input_dir = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie"

# Define path to output CSV file
output_file = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/seg.csv"

# Get list of class names as column names
class_names = list(model.config.id2label.values())[1:]

# Loop over all image files in the input directory
all_data = []
for image_path in tqdm(Path(input_dir).glob("*.png")):
    # Load image using PIL
    image = Image.open(image_path)

    # Pre-process the image for the Segformer model
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Use the Segformer model to predict pixel-wise class labels
    inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
    predictions = torch.argmax(logits, dim=1)  # shape (batch_size, height/4, width/4)

    # Compute object proportions
    class_proportions = {cls: 0.0 for cls in class_names}
    total_pixels = predictions.shape[1] * predictions.shape[2]
    for i, cls_name in enumerate(class_names, start=1):
        mask = (predictions[0] == i).to(torch.float)
        count = mask.sum().item()
        proportion = round(count / total_pixels * 100, 2)
        class_proportions[cls_name] = proportion
        #print(f"Class {cls_name} has {class_proportions[cls_name]} of image pixels")

    # Append data to list of all data
    data = {"filename": image_path.name}
    data.update(class_proportions)
    data["total_pixels"] = total_pixels
    all_data.append(data)

# Save data to CSV file
df = pd.DataFrame.from_records(all_data, columns=["filename"] + class_names + ["total_pixels"])
df.set_index("filename", inplace=True)
df.to_csv(output_file)

  from .autonotebook import tqdm as notebook_tqdm
15775it [27:47,  9.46it/s]


CPU times: total: 23min 10s
Wall time: 28min 14s


# 3.物体检测

In [4]:
%%time
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
import os
from pathlib import Path
import pandas as pd
from PIL import Image
from tqdm import tqdm

# Define path to input directory containing images
input_dir = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie"

# Define path to output CSV file
output_file = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/obj.csv"

# Load model
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Move model to CUDA device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loop over all image files in the input directory
all_data = []
for image_path in tqdm(Path(input_dir).glob("*.png")):
    # Load image using PIL
    image = Image.open(image_path)
    
    # Pre-process the image for the DETR model
    inputs = processor(images=image, return_tensors="pt")
    inputs.to(device)

    # Use the DETR model to predict object bounding boxes and labels
    outputs = model(**inputs)

    # Convert outputs (bounding boxes and class logits) to COCO API format
    # Let's only keep detections with score > 0.9
    target_sizes = torch.tensor([image.size[::-1]]).to(device)
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

    # Compute object counts
    counts = {cls: 0 for cls in model.config.id2label.values()}
    for label in results["labels"]:
        cls_name = model.config.id2label[label.item()]
        counts[cls_name] += 1

    # Append data to list of all data
    data = {"filename": image_path.name}
    data.update(counts)
    all_data.append(data)

    # Print object predictions
    #print(f"\nObject Predictions for {image_path.name}:")
    #for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        #cls_name = model.config.id2label[label.item()]
        #if score.cpu().item() > 0.9:
            #box = [round(i, 2) for i in box.tolist()]
            #print(f"Detected {cls_name} with confidence {round(score.item(), 3)} at location {box}")

# Save data to CSV file
df = pd.DataFrame.from_records(all_data, columns=["filename"] + list(model.config.id2label.values()))
df.set_index("filename", inplace=True)
df.to_csv(output_file)

15775it [28:51,  9.11it/s]


CPU times: total: 26min 11s
Wall time: 28min 54s


In [5]:
print(scores.shape)

torch.Size([1, 100])


# 4.深度估计

In [5]:
#不太适用
from transformers import DPTImageProcessor, DPTForDepthEstimation
import torch
import numpy as np
import os
from PIL import Image
import csv

processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

# directory containing the images
image_dir = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie/"

# create a new CSV file to store depth distances
with open("E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/depth_distances.csv", mode="w", newline='') as file:
    writer = csv.writer(file)

    # list all image files in the directory
    image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]

    # loop through the images and calculate depth distances
    for i, path1 in enumerate(image_paths):
        # open the first image
        image1 = Image.open(path1)

        # prepare image for the model
        inputs1 = processor(images=image1, return_tensors="pt")

        with torch.no_grad():
            outputs1 = model(**inputs1)
            predicted_depth1 = outputs1.predicted_depth

        # interpolate to original size
        prediction1 = torch.nn.functional.interpolate(
            predicted_depth1.unsqueeze(1),
            size=image1.size[::-1],
            mode="bicubic",
            align_corners=False,
        )

        # get depth values as a numpy array
        depths1 = prediction1.squeeze().cpu().numpy()

        # loop through the rest of the images and calculate distance to each one
        for j, path2 in enumerate(image_paths[i+1:], i+1):
            # open the second image
            image2 = Image.open(path2)

            # prepare image for the model
            inputs2 = processor(images=image2, return_tensors="pt")

            with torch.no_grad():
                outputs2 = model(**inputs2)
                predicted_depth2 = outputs2.predicted_depth

            # interpolate to original size
            prediction2 = torch.nn.functional.interpolate(
                predicted_depth2.unsqueeze(1),
                size=image2.size[::-1],
                mode="bicubic",
                align_corners=False,
            )

            # get depth values as a numpy array
            depths2 = prediction2.squeeze().cpu().numpy()

            # calculate depth distance in meters
            distance = np.abs(depths1 - depths2).mean() / 1000.0
                        
            # write depth distance to CSV file
            row = [os.path.basename(path1), os.path.basename(path2), distance]
            writer.writerow(row)

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution2.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: operands could not be broadcast together with shapes (512,2048) (512,1536) 

# 5.Opencv提取特征

In [1]:
import cv2
import os
import csv
from tqdm import tqdm

# 定义要处理的文件夹路径
folder_path = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/baidu2023_pinjie"

# 定义输出CSV文件路径
output_file_path = "E:/Dataset/GNN_Perception/wuhan_badu_SVI/2023/pixels.csv"

# 遍历文件夹中的所有文件
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # 读取图像
        img = cv2.imread(os.path.join(folder_path, filename))
        
        # 转换为HSV颜色空间
        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        
        # 提取色相、饱和度、亮度
        h, s, v = cv2.split(hsv_img)

        # 提取边缘
        edges = cv2.Canny(img, 100, 200)
        
        # 获取阈值
        ret, thresh = cv2.threshold(s, 50, 255, cv2.THRESH_BINARY)
        
        # 将处理后的数据存储到CSV文件中
        with open(output_file_path, mode='a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            row = [filename, h.mean(), s.mean(), v.mean(), edges.mean(), thresh.mean()]
            writer.writerow(row)

100%|██████████| 15775/15775 [15:33<00:00, 16.89it/s]


In [3]:
import cv2
import os
import csv

# 定义要处理的文件夹路径
folder_path = "E:/Dataset/GNN_Perception/SVI/remove"

# 定义输出CSV文件路径
output_file_path = "E:/Dataset/GNN_Perception/SVI/wuhan_pixels.csv"

# 遍历文件夹中的所有文件
for filename in os.listdir(folder_path):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # 读取图像
        img = cv2.imread(os.path.join(folder_path, filename))
        
        # 检查输入图像是否为空
        if img is None:
            print(f"Failed to read image {filename}")
            continue
        
        # 转换为HSV颜色空间
        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        
        # 提取色相、饱和度、亮度
        h, s, v = cv2.split(hsv_img)

        # 提取边缘
        edges = cv2.Canny(img, 100, 200)
        
        # 获取阈值
        ret, thresh = cv2.threshold(s, 50, 255, cv2.THRESH_BINARY)
        
        # 将处理后的数据存储到CSV文件中
        with open(output_file_path, mode='a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            row = [filename, h.mean(), s.mean(), v.mean(), edges.mean(), thresh.mean()]
            writer.writerow(row)