In [24]:
import h5py
import pickle
import numpy as np
import os
from datetime import datetime


In [26]:
def get_img_order(entity2id_file, imageindex_file, img_order_file):
    img_order = []
    
    with open(entity2id_file, 'r', encoding='utf-8') as f:
        for line in f:
            entity_id = line.strip().split()[0]
            
            with open(imageindex_file, 'r', encoding='utf-8') as img_file:
                found = False
                for img_line in img_file:
                    img_id, img_url = img_line.strip().split('\t')
                    if entity_id == img_id:
                        img_order.append(img_url)
                        found = True
                        break
                
                if not found:
                    img_order.append('ERROR: ' + entity_id + ' not found in ' + imageindex_file)
    
    with open(img_order_file, 'w', encoding='utf-8') as f:
        for img_url in img_order:
            f.write(img_url + '\n')
        print('图像顺序已保存到' + img_order_file)

In [17]:
get_img_order('../IMF-Pytorch/datasets/DB15K/entity2id.txt', '../mmkb-0.2/DB15K/DB15K_ImageIndex.txt', 'DB15K/img_order.txt')


In [3]:
def list_h5_datasets(h5_file):
    with h5py.File(h5_file, 'r') as file:
        datasets = list(file.keys())
    return datasets


h5_file = '../mmkb-0.2/DB15K_ImageData.h5'
datasets = list_h5_datasets(h5_file)
print(datasets)

['DBIMG00001', 'DBIMG00002', 'DBIMG00003', 'DBIMG00004', 'DBIMG00005', 'DBIMG00006', 'DBIMG00007', 'DBIMG00008', 'DBIMG00009', 'DBIMG00010', 'DBIMG00011', 'DBIMG00012', 'DBIMG00013', 'DBIMG00014', 'DBIMG00015', 'DBIMG00016', 'DBIMG00017', 'DBIMG00018', 'DBIMG00019', 'DBIMG00020', 'DBIMG00021', 'DBIMG00022', 'DBIMG00023', 'DBIMG00024', 'DBIMG00025', 'DBIMG00026', 'DBIMG00027', 'DBIMG00028', 'DBIMG00029', 'DBIMG00030', 'DBIMG00031', 'DBIMG00032', 'DBIMG00033', 'DBIMG00034', 'DBIMG00035', 'DBIMG00036', 'DBIMG00037', 'DBIMG00038', 'DBIMG00039', 'DBIMG00040', 'DBIMG00041', 'DBIMG00042', 'DBIMG00043', 'DBIMG00044', 'DBIMG00045', 'DBIMG00046', 'DBIMG00047', 'DBIMG00048', 'DBIMG00049', 'DBIMG00050', 'DBIMG00051', 'DBIMG00052', 'DBIMG00053', 'DBIMG00054', 'DBIMG00055', 'DBIMG00056', 'DBIMG00057', 'DBIMG00058', 'DBIMG00059', 'DBIMG00060', 'DBIMG00061', 'DBIMG00062', 'DBIMG00063', 'DBIMG00064', 'DBIMG00065', 'DBIMG00066', 'DBIMG00067', 'DBIMG00068', 'DBIMG00069', 'DBIMG00070', 'DBIMG00071', 'DBIM

In [4]:
def convert_h5_to_pkl(h5_file, pkl_file):
    with h5py.File(h5_file, 'r') as file:
        data = file['FBIMG00001'][:]
    with open(pkl_file, 'wb') as file:
        pickle.dump(data, file)


In [21]:
def merge_h5_to_pkl(h5_file, pkl_file, img_order_file):
    with open(img_order_file, 'r', encoding='utf-8') as f:
        img_order = [line.strip() for line in f]
    
    with h5py.File(h5_file, 'r') as file:
        datasets = list(file.keys())
        data = []
        total_datasets = len(img_order)
        
        for i, dataset in enumerate(img_order):
            if dataset.startswith('ERROR:') or dataset not in datasets:
                data.append(np.zeros(4096).tolist())
                continue
            
            data.append(np.squeeze(file[dataset][:]).tolist())
            
            if (i + 1) % 100 == 0:
                timestamp = datetime.now().strftime("[%H:%M:%S]")
                print(f"{timestamp} 已转换{i + 1}/{total_datasets}个数据集，当前数据集：{dataset}")
    
    with open(pkl_file, 'wb') as file:
        timestamp = datetime.now().strftime("[%H:%M:%S]")
        pickle.dump(data, file)
        print(f"{timestamp} {pkl_file} 保存成功")

In [28]:
dataset_name = 'YAGO15K'
h5_file = f'../mmkb-0.2/{dataset_name}_ImageData.h5'
pkl_file = f'{dataset_name}/img_features.pkl'
entity2id_file = f'../IMF-Pytorch/datasets/{dataset_name}/entity2id.txt'
imageindex_file = f'../mmkb-0.2/{dataset_name}/{dataset_name}_ImageIndex.txt'
img_order_file = f'{dataset_name}/img_order.txt'

if not os.path.exists(img_order_file):
    get_img_order(entity2id_file, imageindex_file, img_order_file)

merge_h5_to_pkl(h5_file, pkl_file, img_order_file)


图像顺序已保存到YAGO15K/img_order.txt
[21:37:19] 已转换100/15404个数据集，当前数据集：YAGOIMG04277
[21:37:19] 已转换200/15404个数据集，当前数据集：YAGOIMG01075
[21:37:19] 已转换300/15404个数据集，当前数据集：YAGOIMG08930
[21:37:19] 已转换500/15404个数据集，当前数据集：YAGOIMG02337
[21:37:19] 已转换700/15404个数据集，当前数据集：YAGOIMG06384
[21:37:19] 已转换900/15404个数据集，当前数据集：YAGOIMG09342
[21:37:20] 已转换1000/15404个数据集，当前数据集：YAGOIMG04101
[21:37:20] 已转换1300/15404个数据集，当前数据集：YAGOIMG01475
[21:37:20] 已转换1400/15404个数据集，当前数据集：YAGOIMG01894
[21:37:20] 已转换1500/15404个数据集，当前数据集：YAGOIMG09169
[21:37:20] 已转换1600/15404个数据集，当前数据集：YAGOIMG05597
[21:37:20] 已转换1700/15404个数据集，当前数据集：YAGOIMG00825
[21:37:20] 已转换1800/15404个数据集，当前数据集：YAGOIMG10351
[21:37:20] 已转换1900/15404个数据集，当前数据集：YAGOIMG07595
[21:37:20] 已转换2000/15404个数据集，当前数据集：YAGOIMG06802
[21:37:20] 已转换2100/15404个数据集，当前数据集：YAGOIMG03271
[21:37:21] 已转换2300/15404个数据集，当前数据集：YAGOIMG02460
[21:37:21] 已转换2400/15404个数据集，当前数据集：YAGOIMG05440
[21:37:21] 已转换2500/15404个数据集，当前数据集：YAGOIMG03983
[21:37:21] 已转换2600/15404个数据集，当前数据集：YAGOIMG10757
[21:37:21] 已转换27