In [1]:
import os
import shutil
from glob import glob
from PIL import Image,ImageOps
from IPython.display import display
import matplotlib.pyplot as plt

In [2]:
# Root folder of the raw dataset (Windows-style relative path); it must already exist.
all_dataset_path=r".\Dataset"
assert os.path.exists(all_dataset_path), "dataset folder not found: %s" % all_dataset_path

# Root folder for the processed pictures.
post_dataset_path=os.path.join(all_dataset_path,"After")
os.makedirs(post_dataset_path,exist_ok=True)

# One output folder per class; the list index is the class index used by hash_output.
sub_0=os.path.join(post_dataset_path,"Other")
sub_1=os.path.join(post_dataset_path,"こぶいち")
sub_2=os.path.join(post_dataset_path,"むりりん")
sub_3=os.path.join(post_dataset_path,"こもわた遥華")
# NOTE: despite its name this is a directory path, not a stream. It is the
# folder that pre_select_fn moves rejected pictures into.
stderr=os.path.join(post_dataset_path,"OtherResolution")
output_dir=[sub_0,sub_1,sub_2,sub_3,stderr]
for path in output_dir:
    # exist_ok avoids the separate "does it exist" check and makes re-runs safe.
    os.makedirs(path,exist_ok=True)

# Target (width, height) of every generated picture.
size=(640,360)
# When False, the *_parse_fn helpers keep only the first file of each
# 5-character file-name prefix (i.e. they drop the variants of a CG).
keep_differ=False

In [3]:
def filter_fn(path,image=None):
    """Decide whether a picture should be kept in the dataset.

    A picture is accepted when its resolution is exactly 16:9, or when its
    file name starts with "SD" (per the original note those pictures are 5:3
    and are recognised by name instead of by ratio).

    Args:
        path: Path of the picture. Both the backslash and the forward slash
            are treated as directory separators when the file name is
            extracted, so the name test does not depend on the platform.
        image: Optional already-opened image exposing ``.size`` as
            ``(width, height)``. When omitted, the file at ``path`` is opened
            only long enough to read its size.

    Returns:
        True if the picture passes the filter, False otherwise.
    """
    if image is None:
        # Only the dimensions are needed, so release the file handle at once.
        with Image.open(path) as opened:
            width,height=opened.size
    else:
        width,height=image.size

    if width*9 == height*16:
        return True

    # Look at the file name only, never at a directory component.
    file_name=path.replace('\\','/').rsplit('/',1)[-1]
    return file_name.startswith("SD")

def pre_select_fn(paths,filter_fn=None,stderr=None):
    """Keep the paths accepted by ``filter_fn``; optionally move the rest away.

    Args:
        paths: Iterable of file paths.
        filter_fn: Predicate called with one path. When None, ``paths`` is
            returned unchanged.
        stderr: Destination passed to ``shutil.move`` for every rejected
            path. When None, rejected files are left where they are.

    Returns:
        ``paths`` itself when no predicate is given, otherwise a new list
        with the accepted paths in their original order.
    """
    if filter_fn is None:
        return paths

    kept=[]
    for candidate in paths:
        if not filter_fn(candidate):
            # Rejected: relocate it only when a destination was supplied.
            if stderr is not None:
                shutil.move(candidate,stderr)
            continue
        kept.append(candidate)

    return kept

def CG_process_fn(paths,dest,override=False,name_header=None):
    """Resize pictures to the module-level ``size`` and save them as .jpg in ``dest``.

    Args:
        paths: Iterable of source picture paths.
        dest: Output directory. When None the function does nothing.
        override: When True, existing output files are rewritten; otherwise
            a source whose output file already exists is skipped.
        name_header: Optional prefix; the output name becomes
            ``<name_header>_<source stem>.jpg``.

    Returns:
        None.
    """
    if dest is None:
        return

    for path in paths:
        stem,extension=os.path.splitext(os.path.basename(path))

        if name_header:
            full_name=os.path.join(dest,name_header)+'_'+stem+'.jpg'
        else:
            full_name=os.path.join(dest,stem)+'.jpg'

        if os.path.exists(full_name) and not override:
            continue

        # The context manager releases the source file handle deterministically.
        with Image.open(path) as source:
            # Image.LANCZOS is the same filter as the old Image.ANTIALIAS
            # alias, which Pillow 10 removed.
            resized=source.resize(size,resample=Image.LANCZOS)
            if extension != '.jpg':
                # Non-jpg sources are converted to RGB before saving as .jpg.
                resized=resized.convert('RGB')
            resized.save(full_name)
            
def get_output(inputs,hash_table,pre_hash=None,path_hash=False):
    """Map ``inputs`` to an output class index or an output directory.

    The value goes through up to three lookups:
      1. ``pre_hash[inputs]`` when ``pre_hash`` is given (and non-empty);
      2. ``hash_table[...]``, falling back to ``hash_table["Other"]`` when
         the key is absent;
      3. ``output_dir[...]`` (module-level list) when ``path_hash`` is True.

    Returns:
        The ``hash_table`` value (an index) when ``path_hash`` is False,
        otherwise the matching entry of ``output_dir``.
    """
    key=pre_hash[inputs] if pre_hash else inputs
    result=hash_table[key] if key in hash_table else hash_table["Other"]

    if not path_hash:
        return result

    assert output_dir
    return output_dir[result]

In [4]:
def shrink(img,size,upper=True):
    """Resize ``img`` proportionally with respect to a bounding ``size``.

    Args:
        img: Image exposing ``.size`` and ``.resize``.
        size: ``(width, height)`` bound, or a single int used for both sides.
        upper: When True, both sides end up within ``size`` (an image that
            is already strictly smaller on both sides is returned as is).
            When False, one side is scaled exactly to its bound and the
            other may exceed it.

    Returns:
        The original image (upper mode, already small enough) or a resized copy.
    """
    if isinstance(size,int):
        size=(size,size)

    width,height=img.size
    w,h=size
    if upper and (width < w) and (height < h):
        return img

    if (width/w > height/h) ^ upper:
        # Height is pinned to its bound; width follows the aspect ratio.
        w=max(1,width*h//height)
    else:
        # Width is pinned to its bound; height follows the aspect ratio.
        h=max(1,height*w//width)
    # max(1, ...) above: integer division can yield 0 for extreme aspect
    # ratios, and a zero-sized target is rejected by resize.

    # Image.LANCZOS replaces the Image.ANTIALIAS alias removed in Pillow 10.
    return img.resize((w,h),resample=Image.LANCZOS)

def expand(img,size):
    """Pad ``img`` with white to ``size`` using ``ImageOps.pad``.

    Args:
        img: Source image.
        size: ``(width, height)`` target, or a single int used for both sides.

    Returns:
        The image returned by ``ImageOps.pad``.
    """
    if isinstance(size,int):
        size=(size,size)
    # Image.LANCZOS replaces the Image.ANTIALIAS alias removed in Pillow 10.
    return ImageOps.pad(img,size,method=Image.LANCZOS,color='white')

def stand_process_fn(paths,dest,override=False,name_header=None,data_augs=()):
    """Pad pictures to the module-level ``size`` and save them as RGB .jpg in ``dest``.

    Compared with CG_process_fn this variant pads instead of stretching and
    can emit augmented copies; the original note describes it as slower.

    Args:
        paths: Iterable of source picture paths.
        dest: Output directory. When None the function does nothing.
        override: When True, existing output files are rewritten.
        name_header: Optional prefix; output names become
            ``<name_header>_<source stem>``.
        data_augs: Collection of augmentation names. Only ``'mirror'`` is
            recognised; it adds a horizontally mirrored ``*_mirror.jpg`` copy.

    Returns:
        None.
    """
    if dest is None:
        return

    want_mirror='mirror' in data_augs

    for path in paths:
        stem=os.path.splitext(os.path.basename(path))[0]
        if name_header:
            base=os.path.join(dest,name_header)+'_'+stem
        else:
            base=os.path.join(dest,stem)

        plain_name=base+'.jpg'
        mirror_name=base+'_mirror.jpg'

        need_plain=override or not os.path.exists(plain_name)
        need_mirror=want_mirror and (override or not os.path.exists(mirror_name))
        if not (need_plain or need_mirror):
            # Every requested output already exists: skip decoding entirely.
            continue

        # Release the source handle as soon as the padded copy is built.
        with Image.open(path) as source:
            img=expand(source,size).convert('RGB')

        if need_plain:
            img.save(plain_name)
        if need_mirror:
            ImageOps.mirror(img).save(mirror_name)

In [5]:
# Artist name -> class index (used by get_output as an index into output_dir).
# "Other" is the fallback class for unknown names.
hash_output={
    "こぶいち":1,
    "むりりん":2,
    "こもわた遥華":3,
    "Other":0,
}

# File-name prefix -> artist, for the pictures under the 'RJ' folder.
hash_RJ={
    'ev1':"むりりん",     # Ayase Mitsukasa
    'ev2':"こぶいち",     # Nanami Arihara
    'ev3':"むりりん",     # Mayu Shikibu
    'ev4':"こぶいち",     # Hazuki Nijouin
    'SD':"こもわた遥華",
}

# File-name prefix -> artist, for the pictures under the '千恋万花' folder.
hash_Seren={
    'ev1':"こぶいち",     # Yoshino Tomotake
    'ev2':"むりりん",     # Mako Hitachi
    'ev3':"むりりん",     # Murasame
    'ev4':"こぶいち",     # Lena
    'SD':"こもわた遥華",
}

# Character / category key -> artist, used by witch_parse_fn.
hash_witch={
    '宁宁':"むりりん",
    '美咕噜':"むりりん",
    '紬':"こぶいち",
    '憧子':"こぶいち",
    'SD':"こもわた遥華",
}

In [6]:
def RJ_parse_fn(paths):
    """Process the 'RJ' pictures, one file-name prefix of ``hash_RJ`` at a time.

    For each prefix the matching .jpg files are globbed under
    ``<all_dataset_path>/RJ/*/``, optionally reduced to one file per
    5-character name prefix, and handed to CG_process_fn with the output
    directory resolved through get_output.

    Args:
        paths: Currently not consulted; the files are re-discovered with
            glob. Kept for interface compatibility.

    Returns:
        None.
    """
    pre=os.path.join(all_dataset_path,'RJ','*')
    for head in hash_RJ.keys():
        name=os.path.join(pre,head)+'*.jpg'
        print(name)

        matches=glob(name)
        if keep_differ:
            # Keep every variant. Without this branch ``items`` would be
            # undefined on the first prefix or stale from the previous one.
            items=matches
        else:
            # Drop variants: keep only the first file of each 5-char prefix.
            items=[]
            exist=set()
            for item in matches:
                identity=os.path.basename(item)[:5]
                if identity not in exist:
                    exist.add(identity)
                    items.append(item)

        dest=get_output(head,pre_hash=hash_RJ,hash_table=hash_output,path_hash=True)
        CG_process_fn(items,dest,name_header="RJ")
        
# Build the pattern from components: the former 'RJ\*\*.jpg' literal relied on
# the invalid escape sequence '\*', which newer Python versions warn about.
RJ_path=os.path.join(all_dataset_path,'RJ','*','*.jpg')
print(RJ_path)
RJ_paths=glob(RJ_path)
# Pictures rejected by filter_fn are moved to the `stderr` folder.
RJ_paths=pre_select_fn(RJ_paths,filter_fn=filter_fn,stderr=stderr)

RJ_parse_fn(RJ_paths)

.\Dataset\RJ\*\*.jpg
.\Dataset\RJ\*\ev1*.jpg
.\Dataset\RJ\*\ev2*.jpg
.\Dataset\RJ\*\ev3*.jpg
.\Dataset\RJ\*\ev4*.jpg
.\Dataset\RJ\*\SD*.jpg


In [7]:
def Seren_parse_fn(paths):
    """Process the '千恋万花' pictures, one file-name prefix of ``hash_Seren`` at a time.

    For each prefix the matching files are globbed under
    ``<all_dataset_path>/千恋万花/*/``, optionally reduced to one file per
    5-character name prefix, and handed to CG_process_fn with the output
    directory resolved through get_output.

    Args:
        paths: Previously selected paths. Only the extension of the first
            entry is used (to build the glob pattern); the files themselves
            are re-discovered with glob. An empty list is a no-op.

    Returns:
        None.
    """
    if not paths:
        # Nothing was selected, so there is no extension to derive a pattern from.
        return

    pre=os.path.join(all_dataset_path,'千恋万花','*')
    extension=os.path.splitext(paths[0])[1]
    for head in hash_Seren.keys():
        name=os.path.join(pre,head)+'*'+extension
        print(name)

        matches=glob(name)
        if keep_differ:
            # Keep every variant. Without this branch ``items`` would be
            # undefined on the first prefix or stale from the previous one.
            items=matches
        else:
            # Drop variants: keep only the first file of each 5-char prefix.
            items=[]
            exist=set()
            for item in matches:
                identity=os.path.basename(item)[:5]
                if identity not in exist:
                    exist.add(identity)
                    items.append(item)

        dest=get_output(head,pre_hash=hash_Seren,hash_table=hash_output,path_hash=True)
        CG_process_fn(items,dest,name_header="Seren")

# Build the pattern from components: the former '千恋万花\*\*.png' literal relied
# on the invalid escape sequence '\*', which newer Python versions warn about.
Seren_path=os.path.join(all_dataset_path,'千恋万花','*','*.png')
print(Seren_path)
Seren_paths=glob(Seren_path)
# Pictures rejected by filter_fn are moved to the `stderr` folder.
Seren_paths=pre_select_fn(Seren_paths,filter_fn=filter_fn,stderr=stderr)

Seren_parse_fn(Seren_paths)

.\Dataset\千恋万花\*\*.png
.\Dataset\千恋万花\*\ev1*.png
.\Dataset\千恋万花\*\ev2*.png
.\Dataset\千恋万花\*\ev3*.png
.\Dataset\千恋万花\*\ev4*.png
.\Dataset\千恋万花\*\SD*.png


In [8]:
def witch_parse_fn(paths):
    """Route each 'Sanoba_NNNN.jpg' picture to an output folder by file name.

    File names are compared as strings against inclusive lower bounds,
    checked from the highest bound down; the first bound that the name
    reaches selects a ``hash_witch`` key (or "skip"). Names below the lowest
    bound are skipped.

    NOTE(review): the original tested the 0746 bound before the 0940 bound,
    which made the 0940 branch unreachable. The bounds are now checked in
    descending order with their original labels. Both labels currently map
    to the same artist in hash_witch; confirm the intended ranges.

    Args:
        paths: Iterable of picture paths.

    Returns:
        None.
    """
    # (inclusive lower bound, hash_witch key or None to skip), highest first.
    ranges=(
        ("Sanoba_1439.jpg",'SD'),
        ("Sanoba_1245.jpg",None),
        ("Sanoba_0940.jpg",'憧子'),
        ("Sanoba_0746.jpg",'紬'),
        ("Sanoba_0427.jpg",'美咕噜'),
        ("Sanoba_0015.jpg",'宁宁'),
    )

    for path in paths:
        name=os.path.basename(path)

        # Decide per file; nothing is carried over from the previous one.
        key=None
        for lower_bound,candidate in ranges:
            if name >= lower_bound:
                key=candidate
                break
        if key is None:
            continue

        dest=get_output(key,pre_hash=hash_witch,hash_table=hash_output,path_hash=True)
        CG_process_fn([path],dest,name_header=None)

# Build the pattern from components: the former '魔女的夜宴\*\*.jpg' literal relied
# on the invalid escape sequence '\*', which newer Python versions warn about.
witch_path=os.path.join(all_dataset_path,'魔女的夜宴','*','*.jpg')
print(witch_path)
witch_paths=glob(witch_path)
# The filter_fn pre-selection is not applied to this set (call left disabled):
# witch_CG_paths=pre_select_fn(witch_paths,filter_fn=filter_fn,stderr=None)

witch_parse_fn(witch_paths)

.\Dataset\魔女的夜宴\*\*.jpg


In [9]:
# Delete previously generated 'Sanoba*' pictures from every output folder
# (useful after switching keep_differ from True to False).
reset=False
if reset:
    for stale in glob(os.path.join(post_dataset_path,'*','Sanoba*')):
        os.remove(stale)

In [10]:
# Add miscellaneous pictures: everything inside the first three sub-folders
# of <dataset>/Other goes to the output folder of the "Other" class.
other_pattern=os.path.join(all_dataset_path,"Other",'*')
folders=glob(other_pattern)[0:3]
dest=get_output('Other',hash_output,path_hash=True)
print(dest)
for folder in folders:
    print(folder)
    items=glob(os.path.join(folder,'*'))
    CG_process_fn(items,dest)

.\Dataset\After\Other
.\Dataset\Other\MA2立绘
.\Dataset\Other\Saru
.\Dataset\Other\SD


In [11]:
# Add one solid-colour picture per colour name to the "Other" class.
dest=get_output('Other',hash_output,path_hash=True)
names=['white','grey','black','red','orange','yellow','green','blue','purple']
for color in names:
    target=os.path.join(dest,color)+'.jpg'
    Image.new('RGB',size,color).save(target)

In [12]:
# Final pass: pad (with white) every generated picture whose resolution
# differs from `size`, overwriting the file in place.
names=glob(os.path.join(post_dataset_path,'*/*'))

cnt=0
for name in names:
    # Close the source handle before the same file is overwritten.
    with Image.open(name) as img:
        if img.size == size:
            continue
        # Image.LANCZOS replaces the Image.ANTIALIAS alias removed in Pillow 10.
        padded=ImageOps.pad(img,size,method=Image.LANCZOS,color='white')
    padded.save(name)
    cnt+=1
    print('\rFound %d' % cnt,end='')