In [1]:
import numpy as np
import os
import shutil
from random import shuffle
from tqdm import tqdm

def pre_process(train_path,data_path,n=1250,ratio=0.2,seed=None):
    """Sample a balanced cats/dogs subset of ``train_path`` into ``data_path``.

    The resulting layout is::

        data_path/train/cats        data_path/train/dogs
        data_path/validation/cats   data_path/validation/dogs

    Any existing ``data_path`` is deleted and rebuilt, but only after the
    inputs have been validated, so a bad call never destroys a previous run.

    Args:
        train_path: Directory holding the source images. Files whose name
            starts with ``cat`` / ``dog`` are treated as that class.
        data_path: Output directory (removed and recreated).
        n: Total number of images to sample; ``int(n/2)`` are taken per class.
        ratio: Fraction of ``n`` assigned to the validation split.
        seed: Optional seed for the shuffle. With a seed the listing is
            shuffled by a private ``random.Random(seed)``, so the same files
            land in the same split on every run; with ``None`` the global
            ``random.shuffle`` is used as before.

    Raises:
        ValueError: If either class has fewer than ``int(n/2)`` files.
    """
    per_class = int(n/2)

    # os.listdir order is arbitrary; sort first so that a seeded shuffle is
    # reproducible across machines and file systems.
    filenames = sorted(os.listdir(train_path))
    if seed is None:
        shuffle(filenames)
    else:
        from random import Random
        Random(seed).shuffle(filenames)

    cat_files = [name for name in filenames if name[:3] == 'cat']
    dog_files = [name for name in filenames if name[:3] == 'dog']

    # Fail fast, before anything is deleted, instead of hitting an IndexError
    # halfway through the copy loop.
    for label, files in (('cat', cat_files), ('dog', dog_files)):
        if len(files) < per_class:
            raise ValueError(
                'need {} {} images in {!r} but found only {}'.format(
                    per_class, label, train_path, len(files)))

    # Rebuild the output tree from scratch. Errors from rmtree are allowed to
    # propagate: swallowing them only defers the failure to os.mkdir below.
    if os.path.exists(data_path):
        shutil.rmtree(data_path)
    os.mkdir(data_path)
    for split in ('train', 'validation'):
        for kind in ('cats', 'dogs'):
            os.makedirs(os.path.join(data_path, split, kind))

    # m is the per-class size of the training split. It can end in .5 when the
    # training total is odd; ``i < m`` then rounds the training share up.
    m = int(n*(1-ratio)) /2

    for i in tqdm(range(per_class)):
        split = 'train' if i < m else 'validation'
        for kind, files in (('cats', cat_files), ('dogs', dog_files)):
            shutil.copyfile(os.path.join(train_path, files[i]),
                            os.path.join(data_path, split, kind, files[i]))
            
def pre_test(test_path,test_gen_path,n=1000):
    """Copy up to ``n`` files from ``test_path`` into ``test_gen_path/test``.

    An existing ``test_gen_path`` is removed first and then recreated with a
    single ``test`` sub-directory; the extra level gives the images a parent
    folder inside the output root.

    Args:
        test_path: Directory containing the source test images.
        test_gen_path: Output root (removed and recreated).
        n: Maximum number of files to copy, taken from the front of the
            ``os.listdir`` result.
    """
    # Start from a clean output root.
    stale = os.path.exists(test_gen_path)
    if stale:
        shutil.rmtree(test_gen_path,True)
    os.mkdir(test_gen_path)
    os.mkdir('{}/test'.format(test_gen_path))

    # Keep only the first n directory entries.
    selected = os.listdir(test_path)[:n]

    for picture in tqdm(selected):
        origin = '{}/{}'.format(test_path,picture)
        target = '{}/test/{}'.format(test_gen_path,picture)
        shutil.copyfile(origin,target)

In [2]:
# Build data_gen/ from train/: 25000 images in total, 20% of them for validation.
pre_process(train_path='train', data_path='data_gen', n=25000, ratio=0.2)

100%|██████████| 12500/12500 [02:53<00:00, 72.17it/s]


In [4]:
# Stage up to 12500 files from test/ under test_gen/test/.
pre_test(test_path='test', test_gen_path='test_gen', n=12500)

100%|██████████| 12500/12500 [00:47<00:00, 261.17it/s]


In [5]:
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.layers import Dense,Input,GlobalAveragePooling2D,Dropout,Lambda
from keras.optimizers import SGD,RMSprop
from keras.applications.xception import Xception
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.image import ImageDataGenerator

from keras.applications import xception,inception_v3


import h5py

def gap_pred(MODEL, image_size, pool=None,lambda_func=None,
             train_dir="data_gen/train", valid_dir='data_gen/validation',
             test_dir="test_gen", batch_size=32):
    """Extract features with a pre-trained backbone and save them to HDF5.

    The backbone is built without its classification head, run over the
    train / validation / test image folders, and the predictions are written
    to ``gap_pred_<model.name>.h5`` in the current working directory.

    Args:
        MODEL: Keras application constructor (e.g. ``ResNet50``); it is called
            with ``input_tensor``, ``weights='imagenet'``, ``include_top=False``
            and ``pooling``.
        image_size: ``(width, height)`` of the network input.
        pool: Forwarded as the constructor's ``pooling`` argument.
        lambda_func: Optional preprocessing function; when given it is wrapped
            in a ``Lambda`` layer placed in front of the backbone.
        train_dir: Directory with one sub-folder per class (training split).
        valid_dir: Directory with one sub-folder per class (validation split).
        test_dir: Directory whose sub-folder(s) hold the unlabeled test images.
        batch_size: Batch size used by all three directory iterators.

    Returns:
        None. The output file contains the datasets ``train``, ``valid``,
        ``test``, ``train_label`` and ``valid_label``.
    """
    width = image_size[0]
    height = image_size[1]
    input_tensor = Input((height, width, 3))
    x = input_tensor

    # Apply the model-specific input preprocessing inside the graph.
    if lambda_func:
        x = Lambda(lambda_func)(x)

    # Load the pre-trained backbone together with its ImageNet weights.
    model = MODEL(input_tensor=x, weights='imagenet', include_top=False,pooling = pool)

    # flow_from_directory takes target_size as (height, width). Pass it in that
    # order so the images match the Input shape for non-square sizes as well.
    target_size = (height, width)

    # shuffle=False keeps the prediction order aligned with `.classes`.
    image_gen = ImageDataGenerator()
    train_gen = image_gen.flow_from_directory(train_dir, target_size, shuffle=False, batch_size=batch_size)
    valid_gen = image_gen.flow_from_directory(valid_dir, target_size, shuffle=False, batch_size=batch_size)
    test_gen  = image_gen.flow_from_directory(test_dir, target_size, shuffle=False, batch_size=batch_size, class_mode=None)

    # Run the backbone over each split to obtain the feature vectors.
    train = model.predict_generator(train_gen)
    valid = model.predict_generator(valid_gen)
    test = model.predict_generator(test_gen)

    # Open with an explicit 'w' mode: the h5py default is read-only in h5py>=3
    # (the file could not be created) and append in older releases (a re-run
    # would fail because the datasets already exist).
    with h5py.File("gap_pred_%s.h5"%model.name, 'w') as h:
        h.create_dataset("train", data=train)
        h.create_dataset("valid",data=valid)
        h.create_dataset("test", data=test)

        h.create_dataset("train_label", data=train_gen.classes)
        h.create_dataset("valid_label",data = valid_gen.classes)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  (fname, cnt))
  (fname, cnt))


In [6]:
# ResNet50's ImageNet weights expect the model's own input preprocessing, the
# same way the Xception / InceptionV3 extractions below pass theirs. Without it
# the backbone is fed raw 0-255 RGB pixels.
# NOTE(review): needs a Keras version whose preprocess_input accepts symbolic
# tensors (it runs inside a Lambda layer) - confirm for the installed release.
from keras.applications import resnet50
gap_pred(ResNet50,(224,224),pool='avg',lambda_func=resnet50.preprocess_input)

Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [7]:
# Xception features at 299x299 with average pooling and Xception's own preprocessing.
gap_pred(
    MODEL=Xception,
    image_size=(299, 299),
    pool='avg',
    lambda_func=xception.preprocess_input,
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [8]:
# InceptionV3 features at 299x299 with average pooling and InceptionV3's own preprocessing.
gap_pred(
    MODEL=InceptionV3,
    image_size=(299, 299),
    pool='avg',
    lambda_func=inception_v3.preprocess_input,
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.
