In [2]:
import torch, torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torchvision.models as tvm
import timm
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

device = "cuda" if torch.cuda.is_available() else "cpu"

### check the picture size if at least 100*100 pixels

In [3]:
from PIL import Image
# open the file
img = Image.open("Animals_with_Attributes2/JPEGImages/antelope/antelope_10002.jpg")  
width, height = img.size
print(f"picture width: {width}, and height: {height}")


picture width: 1024, and height: 768


# Part1 loading the dataset

In [4]:
#loading the ori. dataset(graph)

In [5]:
IMG = 224  #standard size
tfm = transforms.Compose([
    transforms.Resize((IMG, IMG)),
    transforms.ToTensor(),
    #for better using resnet as preprocessing part
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),   
])

In [6]:
rm -rf Animals_with_Attributes2/JPEGImages/.ipynb_checkpoints

In [7]:

#loading the whole dataset
import os
whole_ds = datasets.ImageFolder("Animals_with_Attributes2/JPEGImages", transform=tfm)
whole_dl = DataLoader(whole_ds, batch_size=64, shuffle=False, num_workers=4)

In [8]:
#check
print(len(whole_ds))       # numbers of pictures
print(len(whole_ds.classes)) 

37315
50


In [13]:
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2).to(device).eval()
feature_extractor = nn.Sequential(*list(resnet.children())[:-1]).to(device)

In [14]:
feats, labels = [], []
with torch.no_grad():
    for imgs, lbls in whole_dl:
        imgs = imgs.to(device, non_blocking=True)
        f = feature_extractor(imgs)              # [B, 2048, 1, 1]
        f = f.squeeze(-1).squeeze(-1).cpu()      # [B, 2048]
        feats.append(f)
        labels.append(lbls)

X_test = torch.cat(feats, 0)        # [N, 2048]  —— ResNet 特征
y_test = torch.cat(labels, 0)       # [N]        —— 标签(0~49)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

X_test: torch.Size([37305, 2048]) y_test: torch.Size([37305])


In [28]:
test_whole = torch.cat([X_test, y_test.unsqueeze(1)], dim=1)
print("test_whole.shape:", test_whole.shape)

test_whole.shape: torch.Size([37305, 2049])


In [40]:
test_whole

tensor([[0.0000e+00, 3.8338e-02, 0.0000e+00,  ..., 0.0000e+00, 1.6849e-03,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.2317e-01,
         4.9000e+01],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0546e-02, 0.0000e+00,
         4.9000e+01],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         4.9000e+01]])

In [37]:
import re

def read_title_and_desc_clean(txt_path):
    """读取 AwA2 licenses txt 文件中的 TITLE 和 DESCRIPTION 字段（去掉符号框线）"""
    title, desc = "", ""
    with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = [line.strip() for line in f if line.strip()]

    clean = lambda s: re.sub(r'[\+\-\|\_]+', '', s).strip()  # 删除 + - | _
    
    for i, line in enumerate(lines):
        if "TITLE" in line.upper() and i + 1 < len(lines):
            title = clean(lines[i + 1])
        if "DESCRIPTION" in line.upper():
            desc_lines = []
            for j in range(i + 1, len(lines)):
                if any(k in lines[j].upper() for k in ["TITLE", "INFO", "TAGS", "PHOTOGRAPHER", "LICENSE"]):
                    break
                desc_lines.append(clean(lines[j]))
            desc = " ".join(desc_lines).strip()
            break
    return title, desc


In [39]:
#check
txt_path = "Animals_with_Attributes2/licenses/antelope/antelope_10021.txt"
title, desc = read_title_and_desc_clean(txt_path)
print("TITLE:", title)
print("DESCRIPTION:", desc[:200], "...")

TITLE: 
DESCRIPTION: You are free to use this photo  (including commercial use) under attribution to the author. If being used online please add a link to <a href="http://ujora.de" rel="nofollow">ujora.de</a> Dieses Foto  ...


In [50]:
import pandas as pd
IMG_ROOT = "Animals_with_Attributes2/JPEGImages"
TXT_ROOT = "Animals_with_Attributes2/licenses"

# 用于与标签对齐（确保顺序和 y_test 的 0..49 一致）
class_to_idx = datasets.ImageFolder(IMG_ROOT).class_to_idx  # {'antelope':0, ...}
idx_to_class = {v: k for k, v in class_to_idx.items()}

# 读取并清洗单个 txt 的 TITLE 和 DESCRIPTION（去掉框线/下划线/HTML标签）
def read_title_and_desc_clean(p):
    title, desc = "", ""
    clean = lambda s: re.sub(r'[\+\-\|\_]+', '', s).strip()
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        lines = [line.strip() for line in f if line.strip()]

    for i, line in enumerate(lines):
        U = line.upper()
        if "TITLE" in U and i + 1 < len(lines):
            title = clean(lines[i + 1])
        if "DESCRIPTION" in U:
            buf = []
            for j in range(i + 1, len(lines)):
                if any(k in lines[j].upper() for k in ["TITLE","INFO","TAGS","PHOTOGRAPHER","LICENSE"]):
                    break
                buf.append(clean(lines[j]))
            desc = " ".join(buf).strip()
            break
    # 去掉 HTML 标签
    title = re.sub(r"<.*?>", "", title)
    desc  = re.sub(r"<.*?>", "", desc)
    return title, desc

# 按类别汇总：每个子文件夹 -> 拼接所有 txt 的 title+desc
rows = []
for cls in sorted(os.listdir(TXT_ROOT)):
    cls_dir = os.path.join(TXT_ROOT, cls)
    if not os.path.isdir(cls_dir):
        continue
    pieces = []
    for fname in sorted(os.listdir(cls_dir)):
        if fname.endswith(".txt"):
            t, d = read_title_and_desc_clean(os.path.join(cls_dir, fname))
            text = " ".join([t, d]).strip()
            if text:
                pieces.append(text)
    merged = " ".join(pieces)              # 该类别的整合文本
    rows.append({"class": cls, "text": merged, "n_txt": len(pieces)})

# 变成 DataFrame，并按 label 顺序(0..49) 排好，方便与 y_test 对齐
df = pd.DataFrame(rows)
df["label"] = df["class"].map(class_to_idx)
df = df.sort_values("label").reset_index(drop=True)
df = df.iloc[:-1].reset_index(drop=True)
print("行数(应为50):", len(df))
print(df[["label","class","n_txt"]].head())


行数(应为50): 50
   label       class  n_txt
0    0.0    antelope   1046
1    1.0         bat    178
2    2.0      beaver    147
3    3.0  blue+whale    174
4    4.0      bobcat    627


In [51]:
df.head()

Unnamed: 0,class,text,n_txt,label
0,antelope,"\And God said, Let the earth bring forth the l...",1046,0.0
1,bat,Found below the power lines at Hamilton Beach....,178,1.0
2,beaver,the local beavers on Christmas day 2007 (no de...,147,2.0
3,blue+whale,(no description) Free Fall breaching. (no desc...,174,3.0
4,bobcat,"One of the cubs walking in the enclosure, unde...",627,4.0


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 你已经有 df（50行），每行是一个类别的 text
texts = df["text"].fillna("").tolist()

# 1) 定义 TF-IDF 模型
tfidf = TfidfVectorizer(
    max_features=1000,       # 取前1000个高频特征，可调
    stop_words="english",    # 去除英文停用词
    lowercase=True           # 全部转小写
)

# 2) 拟合并变换
X_tfidf = tfidf.fit_transform(texts)      # shape (50, vocab_size)
print("TF-IDF shape:", X_tfidf.shape)

# 3) 转成 torch.Tensor，方便和你的 test_whole 拼接
X_tfidf_tensor = torch.tensor(X_tfidf.toarray(), dtype=torch.float32)



TF-IDF shape: (50, 1000)


In [53]:
X_tfidf_tensor



tensor([[0.0111, 0.0000, 0.0000,  ..., 0.0000, 0.0027, 0.0033],
        [0.0000, 0.0106, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0274, 0.0000],
        [0.0031, 0.0000, 0.0000,  ..., 0.0097, 0.0038, 0.0323],
        [0.0032, 0.0000, 0.0000,  ..., 0.0068, 0.0000, 0.0024]])

In [54]:
import torch

# 1) 拆出图片特征与标签
X_img = test_whole[:, :-1]       # [37305, 2048]
y_img = test_whole[:, -1].long() # [37305] 取整型标签

# 2) TF-IDF 向量 (50, 1000)
X_tfidf_tensor = X_tfidf_tensor  # 你上面生成的

# 3) 为每张图片找到对应类别的 TF-IDF
#    用标签直接索引即可（PyTorch 的广播机制会自动复制）
X_text = X_tfidf_tensor[y_img]   # [37305, 1000]

# 4) 拼接图像特征 + 文本特征
X_combined = torch.cat([X_img, X_text], dim=1)  # [37305, 2048+1000=3048]
print("融合后形状:", X_combined.shape)


融合后形状: torch.Size([37305, 3048])


In [55]:
X_combined

tensor([[0.0000, 0.0383, 0.0000,  ..., 0.0000, 0.0027, 0.0033],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0027, 0.0033],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0027, 0.0033],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0068, 0.0000, 0.0024],
        [0.0000, 0.0000, 0.0000,  ..., 0.0068, 0.0000, 0.0024],
        [0.0000, 0.0000, 0.0000,  ..., 0.0068, 0.0000, 0.0024]])

In [56]:
y_img

tensor([ 0,  0,  0,  ..., 49, 49, 49])

In [57]:
#k=10 10折交叉验证
import torch

# 你已有：
# X_combined: [N, D]  例如 3048 维（2048图像 + 1000文本）
# y_img:      [N]     标签 0..49
X = X_combined
y = y_img.long()

assert y.min().item() == 0 and y.max().item() == 49, "y 应为 0..49 的类别索引"

folds = {}
classes_per_fold = 5
num_classes = 50
num_folds = num_classes // classes_per_fold  # 10

for i in range(1, num_folds + 1):
    # 本折测试的类别（按标签顺序：0..4, 5..9, ...）
    test_classes = list(range((i-1)*classes_per_fold, i*classes_per_fold))
    test_mask  = torch.isin(y, torch.tensor(test_classes, dtype=torch.long))
    train_mask = ~test_mask

    X_test  = X[test_mask]
    y_test  = y[test_mask]
    X_trval = X[train_mask]
    y_trval = y[train_mask]

    folds[f"X_test_{i}"]         = X_test
    folds[f"y_test_{i}"]         = y_test
    folds[f"X_train&val_{i}"]    = X_trval
    folds[f"y_train&val_{i}"]    = y_trval
    folds[f"test_classes_{i}"]   = torch.tensor(test_classes)  # 记录本折的类

# 看一眼第1折
print("fold1:", folds["X_test_1"].shape, folds["y_test_1"].unique().tolist(),
      "| train&val:", folds["X_train&val_1"].shape)

# （可选）保存所有折
torch.save(folds, "awa2_classwise_10folds.pt")
print("✅ 已生成并保存 10 折（按类分组）")


fold1: torch.Size([2427, 3048]) [0, 1, 2, 3, 4] | train&val: torch.Size([34878, 3048])
✅ 已生成并保存 10 折（按类分组）


In [58]:
print("fold2:", folds["X_test_2"].shape, folds["y_test_2"].unique().tolist(),
      "| train&val:", folds["X_train&val_2"].shape, folds["y_train&val_2"].shape)

fold2: torch.Size([4556, 3048]) [5, 6, 7, 8, 9] | train&val: torch.Size([32749, 3048]) torch.Size([32749])


In [59]:
#update train&val set(about preprocessing that rotation, 高斯noise，....） for better performance 
#.......

In [62]:
#mlp（多层感知机）
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 假设你已有：
# folds: {"X_train&val_1":..., "y_train&val_1":..., "X_test_1":..., "y_test_1":..., "test_classes_1": tensor([0,1,2,3,4]), ...}
# X_tfidf_tensor: [50, V]  每个类别的 TF-IDF 语义向量（例如 V=1000）

S = X_tfidf_tensor.float()               # 语义矩阵 [50, V]
V = S.shape[1]                           # 语义维度（TF-IDF 特征数）
num_folds = 10
accs = []

for i in range(1, num_folds + 1):
    # 1) 取出本折的训练/测试数据
    Xtr_all = folds[f"X_train&val_{i}"]  # [Ntr, 3048] 你之前的“图像+文本”拼接
    ytr     = folds[f"y_train&val_{i}"].long()
    Xte_all = folds[f"X_test_{i}"]       # [Nte, 3048]
    yte     = folds[f"y_test_{i}"].long()
    unseen  = folds[f"test_classes_{i}"].long()  # 本折的 unseen 类标签（长度=5）

    # ✅ 关键：MLP 的输入只用图像特征（前 2048 维）
    Xtr = Xtr_all[:, :2048].float()
    Xte = Xte_all[:, :2048].float()

    # 训练目标：每个训练样本对应的类别语义向量（用 y 索引 S）
    Ytr = S[ytr]  # [Ntr, V]

    # 2) 定义一个很小的 MLP 做映射：2048 → V（如 1000）
    model = nn.Sequential(
        nn.Linear(2048, 1024),
        nn.ReLU(),
        nn.Linear(1024, V)
    )
    opt = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # 3) 训练若干 epoch（可调）
    model.train()
    for _ in range(10):
        opt.zero_grad()
        pred = model(Xtr)       # [Ntr, V]
        loss = loss_fn(pred, Ytr)
        loss.backward()
        opt.step()

    # 4) Zero-shot 测试：与 unseen 类语义做相似度匹配
    model.eval()
    with torch.no_grad():
        pred_sem = model(Xte)                   # [Nte, V]
        # 用余弦相似度更稳：先 L2 归一化
        pred_sem  = F.normalize(pred_sem, dim=1)
        unseen_S  = F.normalize(S[unseen], dim=1)   # [5, V]
        sims      = pred_sem @ unseen_S.T          # [Nte, 5]
        pred_lbl  = unseen[sims.argmax(dim=1)]     # 取相似度最大的 unseen 类
        acc       = (pred_lbl == yte).float().mean().item()
        accs.append(acc)
        print(f"Fold {i}: acc={acc:.4f}")

print("Mean acc:", sum(accs)/len(accs))


Fold 1: acc=0.4314
Fold 2: acc=0.2935
Fold 3: acc=0.2286
Fold 4: acc=0.0856
Fold 5: acc=0.4046
Fold 6: acc=0.3735
Fold 7: acc=0.1940
Fold 8: acc=0.2245
Fold 9: acc=0.1259
Fold 10: acc=0.4886
Mean acc: 0.28501834720373154


In [63]:
#调超参数
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import itertools, random

# ===== 前提：你已经准备好 =====
# folds: dict，包含每折的数据：
#   "X_train&val_i", "y_train&val_i", "X_test_i", "y_test_i", "test_classes_i"
# S: [50, V]  每个类别的语义向量（TF-IDF），且已为 float32
S = X_tfidf_tensor.float() if 'X_tfidf_tensor' in globals() else S
V = S.shape[1]
num_outer = 10
k_inner = 3

# ===== 超参数网格（每个3个取值，符合要求）=====
param_grid = {
    "hidden_dim": [512, 1024, 2048],     # 低/中/高
    "lr":         [1e-3, 5e-4, 1e-4],    # 低/中/高
}
param_list = list(itertools.product(*param_grid.values()))
print(f"Param combos = {len(param_list)}  -> {param_list}")

Param combos = 9  -> [(512, 0.001), (512, 0.0005), (512, 0.0001), (1024, 0.001), (1024, 0.0005), (1024, 0.0001), (2048, 0.001), (2048, 0.0005), (2048, 0.0001)]


In [64]:
def build_model(hidden_dim):
    """MLP: 2048 -> hidden_dim -> V"""
    return nn.Sequential(
        nn.Linear(2048, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, V)
    )

def train_epoch(model, opt, Xtr, Ytr, loss_fn):
    opt.zero_grad()
    pred = model(Xtr)
    loss = loss_fn(pred, Ytr)
    loss.backward()
    opt.step()
    return loss.item()

def zsl_accuracy(model, X, y_true, candidate_classes, S):
    """
    Zero-shot 评估：把 X 映射到语义空间，与 candidate_classes 的类原型做余弦匹配。
    candidate_classes: 1D tensor（内层为 val 类；外层为 unseen 类）
    """
    with torch.no_grad():
        pred_sem = F.normalize(model(X), dim=1)               # [N, V]
        proto    = F.normalize(S[candidate_classes], dim=1)   # [C, V]
        sims     = pred_sem @ proto.T                         # [N, C]
        pred_lbl = candidate_classes[sims.argmax(dim=1)]
        acc      = (pred_lbl == y_true).float().mean().item()
    return acc

In [67]:
#调参数

In [None]:
#rnn

In [None]:
# f-CLSWGAN