<a href="https://colab.research.google.com/github/MoqiSheng/MoqiSheng.github.io/blob/main/baseline_clip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-s3apyi_p
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-s3apyi_p
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [3]:
import pandas as pd
import torch
import clip
from sklearn.preprocessing import normalize
import os

# The working directory is left unchanged: './anchor.csv' and the output file resolve
# against the kernel's current working directory. (A `__file__`-based chdir is not an
# option here because notebooks do not define `__file__`.)

# Load the CLIP ViT-L/14@336px model, on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device)

# Read the anchor table; every row contributes one text from 'semantic_description'.
df = pd.read_csv('./anchor.csv')
descriptions = df['semantic_description']

# Row i of the saved tensor must correspond to row i of the CSV, so missing texts are
# reported instead of being dropped (dropping would shift every later embedding).
missing_rows = descriptions.index[descriptions.isna()].tolist()
if missing_rows:
    raise ValueError(f"'semantic_description' is missing for rows: {missing_rows}")
if descriptions.empty:
    raise ValueError("anchor.csv contains no rows; nothing to encode")

TEXT_BATCH_SIZE = 64  # number of descriptions encoded per forward pass

texts = descriptions.astype(str).tolist()
embedding_batches = []
for start in range(0, len(texts), TEXT_BATCH_SIZE):
    # Tokenise and encode a whole batch at once instead of one row per forward pass.
    text_input = clip.tokenize(texts[start:start + TEXT_BATCH_SIZE]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_input)

    # Cast to float32 before L2-normalising: on CUDA, CLIP typically emits half-precision
    # features, and computing the norm in float16 loses accuracy.
    embedding_batches.append(
        torch.nn.functional.normalize(text_features.float().cpu(), dim=-1)
    )

# Concatenate the batches into one float32 matrix.
embedding_tensor = torch.cat(embedding_batches, dim=0)  # Shape: [num_samples, embedding_dim]

# Persist the embeddings to a .pt file.
torch.save(embedding_tensor, 'anchor_text_clip.pt')


In [4]:
import pandas as pd
import torch
import clip
from sklearn.preprocessing import normalize
import numpy as np
import json
import os

# The working directory is left unchanged: './urban_taxonomy.json' and the output file
# resolve against the kernel's current working directory. (A `__file__`-based chdir is
# not an option here because notebooks do not define `__file__`.)

# Load the CLIP ViT-L/14@336px model, on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device)

# Prompt templates: the first slot receives the category, the second the object type.
urbanclip_templates = [
    "{} area featuring {}.",
    "{} area featuring {} with cars.",
    "{} area featuring {} with parking lot.",
    "{} area featuring {} on the road.",
    "{} area featuring {} with many trees.",
    "{} area featuring {} in city."
]

# Load the taxonomy: a dict mapping each functional category name to the list of
# object types that belong to it. JSON is read explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open('./urban_taxonomy.json', 'r', encoding='utf-8') as f:
    urban_taxonomy = json.load(f)

# One ensembled embedding per (category, object type) pair, in taxonomy order.
embeddings = []

for category, uots in urban_taxonomy.items():
    for uot in uots:
        # Fill every template with this category / object type.
        sentences = [template.format(category, uot) for template in urbanclip_templates]

        # Encode all prompts of this object type in a single forward pass.
        text_input = clip.tokenize(sentences).to(device)
        with torch.no_grad():
            text_features = model.encode_text(text_input)

        # Prompt ensembling (following the approach of UrbanCLIP's zeroshot.py):
        # normalise each prompt embedding, average them, then normalise the mean.
        # Everything is done in float32, because on CUDA CLIP typically emits
        # half-precision features and float16 norms/means lose accuracy.
        per_prompt = torch.nn.functional.normalize(text_features.float().cpu(), dim=-1)
        final_embedding_tensor = torch.nn.functional.normalize(per_prompt.mean(dim=0), dim=0)

        embeddings.append(final_embedding_tensor)

if not embeddings:
    # torch.stack would fail with an opaque error on an empty list.
    raise ValueError("urban_taxonomy.json defines no object types; nothing to encode")

# Stack the per-type vectors into one matrix.
embedding_tensor = torch.stack(embeddings)  # Shape: [num_samples, embedding_dim]

# Persist the embeddings to a .pt file.
torch.save(embedding_tensor, 'predict_text_clip.pt')


In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the image archives unpacked by later cells
# are read from /content/drive/MyDrive/clip/.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!unzip /content/drive/MyDrive/clip/Anchor.zip -d /content/anchor

Archive:  /content/drive/MyDrive/clip/Anchor.zip
  inflating: /content/anchor/1014.png  
  inflating: /content/anchor/1027.png  
  inflating: /content/anchor/1049.png  
  inflating: /content/anchor/1052.png  
  inflating: /content/anchor/1075.png  
  inflating: /content/anchor/1080.png  
  inflating: /content/anchor/1081.png  
  inflating: /content/anchor/1094.png  
  inflating: /content/anchor/1102.png  
  inflating: /content/anchor/1108.png  
  inflating: /content/anchor/111.png  
  inflating: /content/anchor/1111.png  
  inflating: /content/anchor/1116.png  
  inflating: /content/anchor/1117.png  
  inflating: /content/anchor/1136.png  
  inflating: /content/anchor/1140.png  
  inflating: /content/anchor/1143.png  
  inflating: /content/anchor/1162.png  
  inflating: /content/anchor/1198.png  
  inflating: /content/anchor/1209.png  
  inflating: /content/anchor/1216.png  
  inflating: /content/anchor/1220.png  
  inflating: /content/anchor/1246.png  
  inflating: /content/anchor/125

In [9]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm

# No chdir is performed: the relative paths used further down resolve against the
# kernel's current working directory.

# Choose the compute device, then load the CLIP ViT-L/14@336px model together with
# its matching image preprocessing callable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device)

# Feature-extraction helper: encodes a folder of images with CLIP and saves the result.
def extract_features(image_folder, output_file):
    """Encode every .jpg/.png image in ``image_folder`` with CLIP and save the features.

    Files are visited in ascending order of their integer file stem (``2.png`` before
    ``10.png``). Files whose stem is not an integer are placed after all numeric ones,
    ordered by file name, instead of aborting the whole run.

    Each image goes through the module-level ``preprocess`` and ``model.encode_image``.
    The feature is cast to float32 and L2-normalised, so the saved dtype is the same on
    CPU and GPU. All rows are concatenated and written to ``output_file`` with
    ``torch.save``.

    An image that raises during loading or encoding is reported and skipped. Because
    only the feature matrix is saved, a skipped image shifts the row positions of all
    later images; a warning is printed when that happens.

    Args:
        image_folder: Directory that is scanned (non-recursively) for images.
        output_file: Destination path for the saved feature tensor.

    Returns:
        None. Nothing is written when no image could be encoded.
    """
    def _natural_key(path):
        # Numeric stems sort by value; anything int() rejects sorts last. The file
        # name is the final tie-breaker so the order does not depend on os.listdir.
        name = os.path.basename(path)
        stem = os.path.splitext(name)[0]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    image_paths = sorted(
        (
            os.path.join(image_folder, name)
            for name in os.listdir(image_folder)
            if name.lower().endswith(('.jpg', '.png'))
        ),
        key=_natural_key,
    )

    image_features_list = []
    skipped_paths = []

    for image_path in tqdm(image_paths, desc=f"Processing {image_folder}"):
        try:
            # The context manager closes the underlying file handle as soon as the
            # image has been converted into a tensor.
            with Image.open(image_path) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)

            with torch.no_grad():  # inference only, no gradients needed
                # float32 keeps the norm accurate and matches the text embeddings' dtype.
                image_features = model.encode_image(image_input).float()
                image_features /= image_features.norm(dim=-1, keepdim=True)
            image_features_list.append(image_features.cpu())
        except Exception as e:
            # Best effort: report the failure and continue with the remaining images.
            skipped_paths.append(image_path)
            print(f"Error processing {image_path}: {e}")

    if skipped_paths:
        print(
            f"Warning: skipped {len(skipped_paths)} of {len(image_paths)} images; "
            f"rows of the saved tensor no longer line up one-to-one with the sorted file list"
        )

    # Save all collected features as a single [num_images, embedding_dim] matrix.
    if image_features_list:
        image_features = torch.cat(image_features_list, dim=0)
        torch.save(image_features, output_file)
        print(f"Features saved to {output_file}")
    else:
        print(f"No valid images found in {image_folder}")

# Encode the images under ./anchor and store their features in anchor_image_clip.pt.
extract_features(image_folder='./anchor', output_file='anchor_image_clip.pt')

Processing ./anchor: 100%|██████████| 152/152 [00:04<00:00, 34.88it/s]

Features saved to anchor_image_clip.pt





In [10]:
!unzip /content/drive/MyDrive/clip/Predict.zip -d /content/predict


Archive:  /content/drive/MyDrive/clip/Predict.zip
  inflating: /content/predict/0.png  
  inflating: /content/predict/1.png  
  inflating: /content/predict/10.png  
  inflating: /content/predict/100.png  
  inflating: /content/predict/1000.png  
  inflating: /content/predict/1001.png  
  inflating: /content/predict/1002.png  
  inflating: /content/predict/1003.png  
  inflating: /content/predict/1004.png  
  inflating: /content/predict/1005.png  
  inflating: /content/predict/1006.png  
  inflating: /content/predict/1007.png  
  inflating: /content/predict/1008.png  
  inflating: /content/predict/1009.png  
  inflating: /content/predict/101.png  
  inflating: /content/predict/1010.png  
  inflating: /content/predict/1011.png  
  inflating: /content/predict/1012.png  
  inflating: /content/predict/1013.png  
  inflating: /content/predict/1015.png  
  inflating: /content/predict/1016.png  
  inflating: /content/predict/1017.png  
  inflating: /content/predict/1018.png  
  inflating: /con

In [11]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm

# No chdir is performed: the relative paths used further down resolve against the
# kernel's current working directory.

# Choose the compute device, then load the CLIP ViT-L/14@336px model together with
# its matching image preprocessing callable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device)

# Feature-extraction helper: encodes a folder of images with CLIP and saves the result.
def extract_features(image_folder, output_file):
    """Encode every .jpg/.png image in ``image_folder`` with CLIP and save the features.

    Files are visited in ascending order of their integer file stem (``2.png`` before
    ``10.png``). Files whose stem is not an integer are placed after all numeric ones,
    ordered by file name, instead of aborting the whole run.

    Each image goes through the module-level ``preprocess`` and ``model.encode_image``.
    The feature is cast to float32 and L2-normalised, so the saved dtype is the same on
    CPU and GPU. All rows are concatenated and written to ``output_file`` with
    ``torch.save``.

    An image that raises during loading or encoding is reported and skipped. Because
    only the feature matrix is saved, a skipped image shifts the row positions of all
    later images; a warning is printed when that happens.

    Args:
        image_folder: Directory that is scanned (non-recursively) for images.
        output_file: Destination path for the saved feature tensor.

    Returns:
        None. Nothing is written when no image could be encoded.
    """
    def _natural_key(path):
        # Numeric stems sort by value; anything int() rejects sorts last. The file
        # name is the final tie-breaker so the order does not depend on os.listdir.
        name = os.path.basename(path)
        stem = os.path.splitext(name)[0]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    image_paths = sorted(
        (
            os.path.join(image_folder, name)
            for name in os.listdir(image_folder)
            if name.lower().endswith(('.jpg', '.png'))
        ),
        key=_natural_key,
    )

    image_features_list = []
    skipped_paths = []

    for image_path in tqdm(image_paths, desc=f"Processing {image_folder}"):
        try:
            # The context manager closes the underlying file handle as soon as the
            # image has been converted into a tensor.
            with Image.open(image_path) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)

            with torch.no_grad():  # inference only, no gradients needed
                # float32 keeps the norm accurate and matches the text embeddings' dtype.
                image_features = model.encode_image(image_input).float()
                image_features /= image_features.norm(dim=-1, keepdim=True)
            image_features_list.append(image_features.cpu())
        except Exception as e:
            # Best effort: report the failure and continue with the remaining images.
            skipped_paths.append(image_path)
            print(f"Error processing {image_path}: {e}")

    if skipped_paths:
        print(
            f"Warning: skipped {len(skipped_paths)} of {len(image_paths)} images; "
            f"rows of the saved tensor no longer line up one-to-one with the sorted file list"
        )

    # Save all collected features as a single [num_images, embedding_dim] matrix.
    if image_features_list:
        image_features = torch.cat(image_features_list, dim=0)
        torch.save(image_features, output_file)
        print(f"Features saved to {output_file}")
    else:
        print(f"No valid images found in {image_folder}")

# Encode the images under ./predict and store their features in predict_image_clip.pt.
extract_features(image_folder='./predict', output_file='predict_image_clip.pt')

Processing ./predict: 100%|██████████| 1366/1366 [00:37<00:00, 36.58it/s]

Features saved to predict_image_clip.pt



