In [12]:
import ast
import os
import re
import shutil
from datetime import datetime
from datetime import timezone

import numpy as np
import pandas as pd
import pymongo
import torch
from pymongo.collection import Collection
from torch import tensor

import utils

该ipynb文件结合作业提供的image_dataset、datas_v2.csv和utils文件, 生成以图片md5为文件名的数据集, 以及mongodb数据库wsm

In [22]:
def str_2_feature_numpy(feature_str:str) -> "np.ndarray":
    """
    Parse the text form of a numeric array into a float32 numpy array.

    The input is expected to be bracketed and to separate its elements with
    spaces and/or newlines rather than commas, e.g. "[ 0.1 -0.2  0.3]".
    Nested brackets (multi-dimensional arrays) are handled the same way.

    Args:
        feature_str: the textual array.

    Returns:
        np.ndarray of dtype float32 holding the parsed values.

    Raises:
        ValueError or SyntaxError: if, after normalisation, the text is not a
            plain Python literal (for example if it contains a function call).
    """
    # Whitespace outside the brackets would otherwise turn into stray commas
    # (a leading one is a syntax error, a trailing one wraps the list in a tuple).
    normalized = feature_str.strip()
    # Every run of whitespace becomes one comma, giving a list literal.
    normalized = re.sub(r'\s+', ',', normalized)
    # Padding right after an opening bracket has become "[,": drop that comma.
    normalized = re.sub(r'\[,', '[', normalized)
    # literal_eval accepts literals only, so text coming from the CSV cannot
    # execute code the way eval() would.
    return np.array(ast.literal_eval(normalized), dtype=np.float32)

In [23]:
def import_single_image(image_path: str, feature: np.ndarray,
                        config: dict, mongo_collection: Collection, copy=False):
    """
    Store one image's metadata and its model-generated feature in a collection.

    Args:
        image_path: path of the image file to import.
        feature: feature array produced by the model for this image.
        config: settings dict. 'storage-type' is the dtype the feature is cast
            to before it is serialised; 'import-image-base' is only read when
            ``copy`` is true and is handed to ``utils.get_full_path`` as the
            base of the destination path.
        mongo_collection: collection that receives the new document.
        copy: when true, the file is first copied to "<md5>.<type>" (resolved
            through ``utils.get_full_path``) and the document refers to that
            copy; when false the document refers to ``image_path`` itself.

    Returns:
        The result of ``insert_one``, or None when nothing was inserted: either
        ``utils.get_file_type`` returned None for the file, or ``copy`` is true
        and the destination file already exists.

    Note:
        The 'date' field is the file's modification time expressed in UTC,
        matching the literal "Z" suffix of its format.
    """
    image_type = utils.get_file_type(image_path)
    if image_type is None:
        print("skip file:", image_path)
        return

    image_size = utils.get_image_size(image_path)

    # Cast to the configured storage dtype (e.g. float32) before serialising.
    feature = feature.astype(config['storage-type'])

    if copy:
        md5hash = utils.calc_md5(image_path)
        new_basename = md5hash + '.' + image_type  # content-addressed name
        new_full_path = utils.get_full_path(config['import-image-base'], new_basename)
        # An existing destination means the same content was copied before.
        if os.path.isfile(new_full_path):
            print("duplicate file:", image_path)
            return

        # copy2 also carries over file metadata such as the modification time.
        shutil.copy2(image_path, new_full_path)
        stat = os.stat(new_full_path)
    else:
        stat = os.stat(image_path)
        new_full_path = image_path

    # The format string ends in a literal "Z" (the UTC designator), so the
    # timestamp has to be converted in UTC; fromtimestamp() without a tz
    # returns local time and would mislabel it.
    image_mtime = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
    image_datestr = image_mtime.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    # save to mongodb
    document = {
        'filename': new_full_path,  # md5-named copy, or the original path when copy is false
        'extension': image_type,
        'height': image_size[1],
        'width': image_size[0],
        'filesize': stat.st_size,
        'date': image_datestr,
        'feature': feature.tobytes()  # the feature is stored as raw bytes
    }

    x = mongo_collection.insert_one(document)
    return x

In [4]:
# Load the CSV consumed by path_feature_generator: its first column is used as
# an image file name and its second column as that image's feature string.
df = pd.read_csv("datas_v2.csv")

In [5]:
# No host/port/URI is passed, so pymongo's default connection settings apply.
client = pymongo.MongoClient()

In [6]:
# Handle to the `images` collection of the `wsm` database.
images_collection = client.wsm.images

In [7]:
# Directory prefix (with trailing slash) prepended to each file name in the CSV.
image_raw_set_path = "../image_dataset/"
def path_feature_generator(df, base_path=None):
    """
    Yield an (image path, feature array) pair for every row of ``df``.

    Args:
        df: DataFrame whose first column holds image file names and whose
            second column holds the feature vector in string form.
        base_path: prefix prepended to each file name by plain string
            concatenation; defaults to the module-level ``image_raw_set_path``.

    Yields:
        Tuples of (base_path + file name, feature parsed by
        ``str_2_feature_numpy``).
    """
    if base_path is None:
        base_path = image_raw_set_path
    # Select both columns by position explicitly. Integer keys on a
    # label-indexed row Series rely on a positional fallback that pandas has
    # deprecated.
    for file_name, feature_str in zip(df.iloc[:, 0], df.iloc[:, 1]):
        yield base_path + file_name, str_2_feature_numpy(feature_str)

In [8]:
# Project settings; import_single_image reads the 'storage-type' and
# 'import-image-base' entries from this dict.
config = utils.get_config()

In [24]:
# A generator can only be consumed once: re-run this cell to get a fresh one
# before re-running the import loop.
pf_generator = path_feature_generator(df)

In [25]:
# Import every (path, feature) pair; copy=True makes import_single_image copy
# each file to its MD5-based name before inserting the document.
for p, f in pf_generator:
    import_single_image(
        image_path=p,
        feature=f,
        config=config,
        mongo_collection=images_collection,
        copy=True,
    )

duplicate file: ../image_dataset/000001x2.png
duplicate file: ../image_dataset/000002x2.png


In [26]:
# The empty filter matches every document, so this is the collection's total size.
images_collection.count_documents({})

3077

导出数据库的集合 wsm.images

In [28]:
!mongodump --host localhost:27017 --db wsm --collection images --out imagesdb.dump
# A leading "!" makes the notebook run the line as a shell command.

2023-05-28T14:12:23.943+0800	writing wsm.images to imagesdb.dump/wsm/images.bson
2023-05-28T14:12:23.951+0800	done dumping wsm.images (3077 documents)


如果要将导出的集合恢复到某个mongodb实例, 使用下面的命令

In [None]:
# Template only: replace the <...> placeholders with real values before running.
!mongorestore --host <hostname>:<port> --db wsm <directory or file to restore>