<a href="https://colab.research.google.com/github/MoqiSheng/MoqiSheng.github.io/blob/main/2_TextEncoder0220.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.preprocessing import normalize
import json
import os

# Load the pretrained Sentence Transformer model used for every encoding below.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Prompt templates (updated urbanclip set). Each template takes two positional
# fields: the category name first, then the UOT (functional object type) name.
urbanclip_templates = [
    "{} area featuring {}.",
    "{} area featuring {} with cars.",
    "{} area featuring {} with parking lot.",
    "{} area featuring {} on the road.",
    "{} area featuring {} with many trees.",
    "{} area featuring {} in city."
]

# Load urban_taxonomy.json. It is looked up by category name further down.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform locale, which can fail or mis-decode non-ASCII
# content on machines whose default encoding is not UTF-8.
with open('./urban_taxonomy.json', 'r', encoding='utf-8') as f:
    urban_taxonomy = json.load(f)

# Load anchor_description.csv. The columns read later are `primary_function`
# and `semantic_description`.
anchor_df = pd.read_csv('./anchor_description.csv')

# Category list. Order matters: the 1-based position of each name is matched
# against the `primary_function` column of anchor_description.csv.
categories = [
    "residential", "commercial", "hotel",
    "industrial", "education", "health care", "civic, governmental and cultural",
    "sports and recreation", "outdoors and natural", "transportation"
]

# Create the folder that receives the saved embeddings (no-op if it exists).
output_folder = './sentencetransformer'
os.makedirs(output_folder, exist_ok=True)

# Build and save one embedding matrix per category.
# `idx` is 1-based so it can be compared with the `primary_function` column of
# anchor_description.csv.
for idx, category_name in enumerate(categories, start=1):
    embeddings = []

    # Fetch the UOT (functional object type) entries listed for this category.
    if category_name in urban_taxonomy:
        uots = urban_taxonomy[category_name]
    else:
        # A missing taxonomy key used to be skipped without any trace; report it
        # so a typo in `categories` or the JSON file is visible in the output.
        print(f"Warning: '{category_name}' not found in urban_taxonomy; "
              f"only anchor descriptions will be used.")
        uots = []

    # Part 1: one embedding per UOT, averaged over all prompt templates.
    for uot in uots:
        # One sentence per template, each carrying the category and the UOT.
        sentences = [template.format(category_name, uot) for template in urbanclip_templates]

        # Encode the templated sentences in a single call.
        sentence_embeddings = model.encode(sentences)

        # L2-normalize each sentence embedding (the original note says this
        # follows the approach of UrbanCLIP's zeroshot.py).
        normalized_embeddings = normalize(sentence_embeddings, axis=1)

        # Average the normalized template embeddings into one vector.
        avg_embedding = np.mean(normalized_embeddings, axis=0)

        # The mean of unit vectors is generally not unit length, so normalize again.
        final_embedding = normalize([avg_embedding])[0]

        embeddings.append(torch.tensor(final_embedding, dtype=torch.float32))

    # Part 2: one embedding per anchor description whose `primary_function`
    # equals this category's 1-based index.
    anchor_rows = anchor_df[anchor_df['primary_function'] == idx]
    # Only one column is needed, so iterate it directly instead of iterrows().
    for semantic_description in anchor_rows['semantic_description']:
        # Encode the description sentence and L2-normalize it.
        description_embedding = model.encode([semantic_description])
        normalized_description_embedding = normalize(description_embedding, axis=1)

        embeddings.append(
            torch.tensor(normalized_description_embedding[0], dtype=torch.float32)
        )

    # torch.stack raises on an empty list, which would abort every remaining
    # category. Skip this one with a visible message instead.
    if not embeddings:
        print(f"Warning: no embeddings produced for '{category_name}'; nothing saved.")
        continue

    # Stack into a single tensor. Shape: [num_samples, embedding_dim]
    embedding_tensor = torch.stack(embeddings)

    # Report the shape of what is about to be written.
    print(f"Shape of the embedding tensor for {category_name}: {embedding_tensor.shape}")

    # Save as <output_folder>/<category_name>.pt
    torch.save(embedding_tensor, os.path.join(output_folder, f'{category_name}.pt'))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Shape of the embedding tensor for residential: torch.Size([88, 768])
Shape of the embedding tensor for commercial: torch.Size([108, 768])
Shape of the embedding tensor for hotel: torch.Size([23, 768])
Shape of the embedding tensor for industrial: torch.Size([49, 768])
Shape of the embedding tensor for education: torch.Size([49, 768])
Shape of the embedding tensor for health care: torch.Size([29, 768])
Shape of the embedding tensor for civic, governmental and cultural: torch.Size([56, 768])
Shape of the embedding tensor for sports and recreation: torch.Size([29, 768])
Shape of the embedding tensor for outdoors and natural: torch.Size([43, 768])
Shape of the embedding tensor for transportation: torch.Size([32, 768])


In [2]:
# Mount Google Drive into the Colab filesystem so later cells can write to it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
import os
import shutil

# Folder to compress and the destination ZIP file path.
folder_path = '/content/sentencetransformer'
zip_path = '/content/drive/MyDrive/2_TextEncoder0220/sentencetransformer.zip'

# shutil.make_archive appends '.zip' itself, so it needs the path without the
# extension. os.path.splitext removes only the trailing extension, whereas
# str.replace('.zip', '') would also strip '.zip' from any directory name in
# the path.
archive_base = os.path.splitext(zip_path)[0]

# Compress the folder into the ZIP file. The call is the last expression so the
# resulting archive path is displayed as the cell output.
shutil.make_archive(archive_base, 'zip', folder_path)

'/content/drive/MyDrive/2_TextEncoder0220/sentencetransformer.zip'