# 导出特征向量

In [8]:
from keras.models import Model
from keras.layers import Input, GlobalAveragePooling2D
from keras.applications.resnet50 import ResNet50, decode_predictions
from keras.applications import *
import cv2, os, itertools
import numpy as np
from sklearn.utils import shuffle


from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *


In [2]:
# Root folders of the train and test images. Both end with '/' because the
# file names are appended by plain string concatenation below.
TRAIN_DIR = 'data/train/'
TEST_DIR = 'data/test/'

# One path per directory entry, in lexicographic order, so that the position
# of an image in these lists is stable from run to run.
train_dir = sorted(TRAIN_DIR + file_name for file_name in os.listdir(TRAIN_DIR))
test_dir = sorted(TEST_DIR + file_name for file_name in os.listdir(TEST_DIR))

# For a quick smoke test, truncate both lists here, e.g. train_dir = train_dir[:50].

# save feature vector to file

In [None]:

# Width of one concatenated feature row:
# ResNet50 (2048) + Xception (2048) + VGG16 (512) + VGG19 (512).
CONCAT_FEATURE_DIM = 2048 + 2048 + 512 + 512

# Uninitialised float32 buffers with one row per image; the rows are written
# by get_and_save_feature_vector().
concat_train_feature_vector = np.empty((len(train_dir), CONCAT_FEATURE_DIM), dtype = np.float32)
concat_test_feature_vector = np.empty((len(test_dir), CONCAT_FEATURE_DIM), dtype = np.float32)

def get_feature_vector_list(image_path_list, MODEL, input_size, preprocess_fun = None):
    """Extract one globally average-pooled feature vector per image.

    Args:
        image_path_list: paths of the images to process; the result keeps
            this order.
        MODEL: model constructor, called as
            ``MODEL(input_tensor=..., weights='imagenet', include_top=False)``.
        input_size: ``(rows, cols, channels)`` shape of the network input.
        preprocess_fun: optional function applied to the input tensor through
            a ``Lambda`` layer before it reaches the backbone.

    Returns:
        A list holding the ``model.predict`` result of every image (each
        image is predicted as a batch of one).
    """
    image_input = Input(input_size)
    # Note: the preprocessing function is mandatory for Xception and
    # InceptionV3, otherwise their predictions come out wrong.
    if preprocess_fun:
        backbone_input = Lambda(preprocess_fun)(image_input)
    else:
        backbone_input = image_input

    # The feature model is built from base_model.input, not from the tensor
    # handed to the backbone.
    # NOTE(review): an earlier variant that used the (possibly Lambda-wrapped)
    # tensor directly as the Model input gave wrong results -- presumably
    # because that tensor is not an Input layer; confirm before simplifying.
    base_model = MODEL(input_tensor = backbone_input, weights = 'imagenet', include_top = False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    print('start {} prediction:'.format(base_model.name))
    # Roughly 100 progress marks per run. The step is clamped to 1 so lists
    # with fewer than 100 images no longer trigger a modulo by zero.
    progress_step = max(1, len(image_path_list) // 100)
    feature_vector_list = []
    for i, image_path in enumerate(image_path_list):
        input_image = prepare_data(image_path, input_size)
        # predict() expects a batch, so add a leading axis of length 1.
        input_image = np.expand_dims(input_image, axis = 0)
        feature_vector = model.predict(input_image, verbose = 0)
        feature_vector_list.append(feature_vector)
        if i % progress_step == 0:
            print('>', end = '')
    print('finish {} prediction'.format(base_model.name))

    return feature_vector_list

        
def prepare_data(image_path, input_size):
    """Load an image from disk and resize it to the network input size.

    Args:
        image_path: path of the image file.
        input_size: ``(rows, cols, channels)``; only rows and cols are used.

    Returns:
        The image read with ``cv2.IMREAD_COLOR`` and resized with bicubic
        interpolation, exactly as returned by ``cv2.resize``.

    Raises:
        IOError: if OpenCV cannot read the file.
    """
    rows, cols = input_size[0], input_size[1]

    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('cannot read image: {}'.format(image_path))

    # NOTE(review): OpenCV loads colour images in BGR channel order -- confirm
    # this is what the downstream backbones expect.
    # cv2.resize takes dsize as (width, height), i.e. (cols, rows).
    return cv2.resize(image, (cols, rows), interpolation = cv2.INTER_CUBIC)


def _fill_concat_features(image_path_list, target):
    """Write the concatenated backbone features of every image into ``target`` rows."""
    # (constructor, input size, preprocessing) per backbone, in column order.
    # TODO: fix the incorrect use of the InceptionV3 model before adding it:
    # (InceptionV3, (299, 299, 3), inception_v3.preprocess_input)
    backbone_specs = (
        (ResNet50, (224, 224, 3), None),                       # outputs (1, 2048)
        (Xception, (299, 299, 3), xception.preprocess_input),  # outputs (1, 2048)
        (VGG16, (224, 224, 3), None),                          # outputs (1, 512)
        (VGG19, (224, 224, 3), None),                          # outputs (1, 512)
    )
    per_backbone = [
        get_feature_vector_list(image_path_list, backbone, size, preprocess)
        for backbone, size, preprocess in backbone_specs
    ]
    # zip(*...) groups the four vectors belonging to the same image.
    for row, parts in enumerate(zip(*per_backbone)):
        target[row] = np.concatenate(parts, axis=1)


def get_and_save_feature_vector():
    """Compute the concatenated features for train and test images and save them.

    Fills the module-level ``concat_train_feature_vector`` and
    ``concat_test_feature_vector`` arrays in place, then writes each one as
    comma-separated text with ``np.savetxt``. Returns ``None``.
    """
    _fill_concat_features(train_dir, concat_train_feature_vector)
    _fill_concat_features(test_dir, concat_test_feature_vector)

    # The files carry a ".npy" suffix but contain comma-separated text.
    np.savetxt("concat_train_feature_vector.npy", concat_train_feature_vector, delimiter = ',')
    np.savetxt("concat_test_feature_vector.npy", concat_test_feature_vector, delimiter = ',')


# Run the extraction: four backbones over every train and test image, then
# write both feature matrices to disk as text files.
get_and_save_feature_vector()
    

# read feature vector from file

In [3]:
# Reload the feature matrices written by get_and_save_feature_vector().
# Despite the ".npy" suffix the files are comma-separated text, so np.loadtxt
# (not np.load) is the matching reader. Passing the file name lets NumPy open
# and close the file itself, so no file handle is left open.
concat_test_feature_vector = np.loadtxt("concat_test_feature_vector.npy", delimiter=",").astype(np.float32)
concat_train_feature_vector = np.loadtxt("concat_train_feature_vector.npy", delimiter=",").astype(np.float32)

In [4]:
# Sanity check: expect (number of training images, 2048 + 2048 + 512 + 512 = 5120).
concat_train_feature_vector.shape

(25000, 5120)

In [5]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline

# Binary label per training image: 1 if the file name contains 'dog', else 0.
# Only the base name is inspected, so a 'dog' somewhere in the directory part
# of the path cannot turn every label into 1.
labels = [1 if 'dog' in os.path.basename(train_image_dir) else 0
          for train_image_dir in train_dir]

y_train = np.array(labels)
# Shuffle features and labels with the same permutation. The seed is fixed so
# that re-running the notebook gives the same order.
x_train, y_train = shuffle(concat_train_feature_vector, y_train, random_state = 0)

In [6]:
# Peek at the shuffled labels (1 = 'dog' in the path, 0 = otherwise).
y_train

array([1, 1, 0, ..., 1, 0, 1])

# define new model and predict

In [9]:
# Classifier head on the precomputed features: dropout followed by a single
# sigmoid unit, trained with binary cross-entropy on the 0/1 labels.
inputs = Input(shape = (x_train.shape[1], ))
x = Dropout(0.5)(inputs)
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs = inputs, outputs = predictions)
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
# validation_split=0.2 holds out 20% of the rows for validation.
model.fit(x_train, y_train, batch_size=128, epochs=8, validation_split=0.2)
# One sigmoid output per test image, in the order of the test feature rows.
y_pre = model.predict(concat_test_feature_vector)
y_pre



Train on 20000 samples, validate on 5000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


array([[  1.00000000e+00],
       [  8.01979768e-05],
       [  1.04454593e-05],
       ..., 
       [  1.00000000e+00],
       [  9.30272051e-07],
       [  2.91424931e-06]], dtype=float32)

In [36]:
# Keep every prediction inside [0.005, 0.995].
# NOTE(review): presumably this bounds the penalty of a confidently wrong
# answer under a log-loss style metric -- confirm against the scoring rule.
y_pre = y_pre.clip(min=0.005, max=0.995)
import csv

def sort_y_pre_by_id(y_pre, test_dir):
    """Reorder predictions so that position ``k`` holds the image with id ``k + 1``.

    Args:
        y_pre: predictions, one per entry of ``test_dir``, given either as a
            flat sequence or as an ``(N, 1)`` array.
        test_dir: image paths whose file name without extension is the
            1-based integer id of the image (e.g. ``'data/test/12.jpg'``).

    Returns:
        A 1-D float array of length ``len(y_pre)``; slots whose id never
        occurs in ``test_dir`` stay 0.

    Raises:
        ValueError: if a file name (without extension) is not an integer.
    """
    # Flatten first so that each assignment below stores a scalar rather than
    # a length-1 array.
    flat_pre = np.ravel(y_pre)
    y_pre_order_by_id = np.zeros((len(y_pre),))
    for i, image_path in enumerate(test_dir):
        # Take the id from the file name itself, not from fixed string
        # offsets, so any directory prefix or extension length works.
        file_stem = os.path.splitext(os.path.basename(image_path))[0]
        y_pre_order_by_id[int(file_stem) - 1] = flat_pre[i]
    return y_pre_order_by_id

y_pre_order_by_id = sort_y_pre_by_id(y_pre, test_dir)

# The csv module expects files it writes to be opened with newline=''.
# Without it, extra blank rows can appear on platforms that translate '\n'.
with open("submission.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    # Write the column names first, then one row per image with its 1-based id.
    writer.writerow(["id","label"])
    writer.writerows(enumerate(y_pre_order_by_id, start=1))