导入库

In [3]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import time
import datetime
from sklearn.model_selection import train_test_split

import cv2

import h5py
%matplotlib inline

random_state = 424
dataset_path = 'dataset'

这个数据集包含26位司机在各种驾驶状态下的照片。同一位司机的照片彼此非常相似，如果按图片随机划分，验证集里会出现训练时见过的司机，验证结果会偏乐观。所以我按照司机来划分训练集和验证集。

In [9]:
# Read the image index; the columns used below are 'subject' (driver id),
# 'classname' and 'img'.
dataset = pd.read_csv('dataset/driver_imgs_list.csv')

# Split by driver: every driver except the last two listed is used for
# training, the last two are held out for validation.
driver_ids = dataset['subject'].unique()
drive_id_train = driver_ids[:-2]
drive_id_test = driver_ids[-2:]

# Relative image path "<classname>/<img>" (vectorised instead of a row-wise apply).
dataset['path'] = dataset['classname'] + '/' + dataset['img']

# Plain lists of relative paths for each side of the split.
train_path = dataset.loc[dataset['subject'].isin(drive_id_train), 'path'].tolist()
test_path = dataset.loc[dataset['subject'].isin(drive_id_test), 'path'].tolist()

In [10]:
import os
import shutil


# Materialise the driver-wise split on disk once. The copy is skipped only
# when BOTH target trees exist, so a missing 'valid2' is rebuilt as well.
# NOTE: an interrupted copy still leaves both directories behind; delete
# them by hand to force a full rebuild.
if os.path.exists('dataset/train2') and os.path.exists('dataset/valid2'):
    print('split dataset exist')
else:
    # One sub-directory per class (c0..c9) in each split; tolerate leftovers.
    for split_dir in ('dataset/train2', 'dataset/valid2'):
        for i in range(10):
            class_dir = split_dir + '/c' + str(i)
            if not os.path.isdir(class_dir):
                os.makedirs(class_dir)

    # Copy the images into the new layout.
    print('train dataset copy begain')
    for filename in tqdm(train_path):
        shutil.copyfile('dataset/train/' + filename, 'dataset/train2/' + filename)

    print('test dataset copy begain')
    for filename in tqdm(test_path):
        shutil.copyfile('dataset/train/' + filename, 'dataset/valid2/' + filename)




  0%|          | 0/20787 [00:00<?, ?it/s]

train dataset copy begain


100%|██████████| 20787/20787 [06:06<00:00, 56.70it/s] 
  0%|          | 0/1637 [00:00<?, ?it/s]

test dataset copy begain


100%|██████████| 1637/1637 [00:02<00:00, 612.65it/s]


预训练模型模板

In [16]:
from keras.models import Sequential, Model
from keras.applications import *
from keras.layers import Dropout, Flatten, Dense, Input, GlobalAveragePooling2D
from keras.layers.normalization import *
from keras.optimizers import *
from keras.preprocessing.image import *
from keras.callbacks import ModelCheckpoint

# Directories holding the training and validation images.
train_dir, val_dir = 'dataset/train2', 'dataset/valid2'

# Number of classes = number of entries directly under the training directory.
nb_classes = len(glob.glob(train_dir + "/*"))

# Mini-batch size and epoch count.
batch_size, epochs = 128, 5
# def VGG16_model(X_train, y_train, X_test, y_test):
def run_a_model(BASE_MODEL, input_shape, fine_tune_layer, preprocessing=None, model_name_option='',
                optimizer='adam'):
    """Fine-tune an ImageNet-pretrained backbone on the driver images and save it.

    The backbone (without its top) is followed by global average pooling,
    batch normalisation and a 10-way softmax. Training runs in two phases on
    the same generators: first with ``optimizer``, then again after
    re-compiling with ``Adam(lr=0.00001)``.

    Args:
        BASE_MODEL: Keras application constructor (e.g. ``VGG16``, ``ResNet50``);
            called with ``input_tensor``, ``weights``, ``include_top`` and
            ``input_shape``.
        input_shape: ``(height, width)`` of the network input.
        fine_tune_layer: number of leading layers of the assembled model that
            are frozen (``trainable = False``).
        preprocessing: optional callable used as ``preprocessing_function`` of
            both image generators.
        model_name_option: suffix appended to the saved model file name.
        optimizer: optimizer used for the first training phase. Added so the
            existing ``optimizer='adam'`` call sites work; defaults to the
            previously hard-coded value.

    Returns:
        None. The model is written to ``models/model_<name><suffix>.h5``.
    """
    input_height = input_shape[0]
    input_width = input_shape[1]

    # Build the input once and hand that same tensor to the backbone.
    input_tensor = Input((input_height, input_width, 3))
    base_model = BASE_MODEL(input_tensor=input_tensor,
                            weights='imagenet',
                            include_top=False,
                            input_shape=(input_height, input_width, 3))

    # Classification head: GAP -> BN -> softmax over the 10 classes.
    x = GlobalAveragePooling2D()(base_model.output)
    x = BatchNormalization()(x)
    x = Dense(10, activation='softmax')(x)
    model = Model(base_model.input, x)
    print("total layer count {}".format(len(base_model.layers)))

    # Freeze the first `fine_tune_layer` layers. Slicing tolerates a count
    # larger than the model; max() keeps a negative count a no-op as before.
    for layer in model.layers[:max(fine_tune_layer, 0)]:
        layer.trainable = False

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Light augmentation on the training side only.
    train_gen = ImageDataGenerator(
        rotation_range=10.,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.1,
        zoom_range=0.1,
        preprocessing_function=preprocessing
    )

    test_gen = ImageDataGenerator(
        preprocessing_function=preprocessing
    )

    train_generator = train_gen.flow_from_directory(train_dir, (input_height, input_width),
                                                    shuffle=True, batch_size=batch_size, class_mode='categorical')
    test_generator = test_gen.flow_from_directory(val_dir, (input_height, input_width),
                                                  shuffle=True, batch_size=batch_size, class_mode='categorical')

    # Ceiling division: one pass over the data per epoch, without the extra
    # (wrapped-around) batch that `samples // batch_size + 1` requests when
    # the sample count is an exact multiple of the batch size.
    steps_train_sample = max(1, (train_generator.samples + batch_size - 1) // batch_size)
    steps_valid_sample = max(1, (test_generator.samples + batch_size - 1) // batch_size)

    # Phase 1: train with the requested optimizer.
    model.fit_generator(
        train_generator,
        steps_per_epoch=steps_train_sample,
        epochs=epochs,
        validation_data=test_generator,
        validation_steps=steps_valid_sample)

    # Phase 2: continue with a much smaller learning rate.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.00001), metrics=['accuracy'])
    model.fit_generator(
        train_generator,
        steps_per_epoch=steps_train_sample,
        epochs=epochs,
        validation_data=test_generator,
        validation_steps=steps_valid_sample)

    # Make sure the output directory exists so the trained model is not lost
    # at save time.
    if not os.path.isdir('models'):
        os.makedirs('models')
    # `__name__` works on Python 2 and 3; `func_name` exists on Python 2 only.
    model.save("models/model_{}{}.h5".format(BASE_MODEL.__name__, model_name_option))

    return

In [29]:
# VGG16, 224x224 input, first 18 layers frozen.
# `optimizer` is not a parameter of run_a_model as defined above (the first
# phase already uses 'adam'), so it is not passed here.
run_a_model(VGG16, (224, 224), 18, preprocessing=None, model_name_option='')

total layer count 19
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [3]:
# ResNet50, 224x224 input, first 152 layers frozen.
# `optimizer` is not a parameter of run_a_model as defined above (the first
# phase already uses 'adam'), so it is not passed here.
run_a_model(ResNet50, (224, 224), 152, preprocessing=None, model_name_option='')

total layer count 175
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
# InceptionV3, 299x299 input, first 172 layers frozen, with its own input preprocessing.
# `optimizer` is not a parameter of run_a_model as defined above (the first
# phase already uses 'adam'), so it is not passed here.
run_a_model(InceptionV3, (299, 299), 172, preprocessing=inception_v3.preprocess_input, model_name_option='')

total layer count 311
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [5]:
# Xception, 299x299 input, first 116 layers frozen, with its own input preprocessing.
# `optimizer` is not a parameter of run_a_model as defined above (the first
# phase already uses 'adam'), so it is not passed here.
run_a_model(Xception, (299, 299), 116, preprocessing=xception.preprocess_input, model_name_option='')

total layer count 132
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# ResNet50 with input_shape (240, 360) and the first 154 layers frozen;
# saved with the '_154' suffix.
run_a_model(
    BASE_MODEL=ResNet50,
    input_shape=(240, 360),
    fine_tune_layer=154,
    preprocessing=None,
    model_name_option='_154',
)

total layer count 175
Found 21601 images belonging to 10 classes.
Found 823 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# ResNet50 with the BN head, 224x224 input and the first 160 layers frozen;
# saved with the '_BN_160' suffix.
run_a_model(
    BASE_MODEL=ResNet50,
    input_shape=(224, 224),
    fine_tune_layer=160,
    preprocessing=None,
    model_name_option='_BN_160',
)

total layer count 175
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# ResNet50 with the BN head, first 165 layers frozen; saved with the '_BN_165' suffix.
run_a_model(
    BASE_MODEL=ResNet50,
    input_shape=(224, 224),
    fine_tune_layer=165,
    preprocessing=None,
    model_name_option='_BN_165',
)
# Original note read "two generations 50" — NOTE(review): meaning unclear,
# presumably a remark about this run; confirm with the author.

In [19]:
# VGG16 with the BN head, 224x224 input and the first 15 layers frozen;
# saved with the '_BN_15' suffix.
run_a_model(
    BASE_MODEL=VGG16,
    input_shape=(224, 224),
    fine_tune_layer=15,
    preprocessing=None,
    model_name_option='_BN_15',
)

total layer count 19
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# InceptionV3 with the BN head, 299x299 input and the first 180 layers frozen;
# saved with the '_BN_180' suffix.
run_a_model(
    BASE_MODEL=InceptionV3,
    input_shape=(299, 299),
    fine_tune_layer=180,
    preprocessing=inception_v3.preprocess_input,
    model_name_option='_BN_180',
)

total layer count 311
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# InceptionV3 with the BN head, 299x299 input and the first 200 layers frozen;
# saved with the '_BN_200' suffix.
run_a_model(
    BASE_MODEL=InceptionV3,
    input_shape=(299, 299),
    fine_tune_layer=200,
    preprocessing=inception_v3.preprocess_input,
    model_name_option='_BN_200',
)

total layer count 311
Found 20787 images belonging to 10 classes.
Found 1637 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


基础预测

In [31]:
from keras.models import load_model
# Reload the model saved by run_a_model for VGG16 with an empty name suffix
# ("models/model_<name><suffix>.h5").
model = load_model('models/model_VGG16.h5')

In [33]:
# Un-augmented, un-shuffled generator over the images to predict, so that
# prediction rows follow the generator's file order.
gen = ImageDataGenerator()
path_test_data = 'dataset/to_prediction'
test_generator = gen.flow_from_directory(
    path_test_data,
    target_size=(224, 224),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

Found 79726 images belonging to 1 classes.


In [35]:
def load_test(img_rows, img_cols):
    """Read every ``*.jpg`` under ``<dataset_path>/test`` into memory.

    Args:
        img_rows: first element of the size tuple handed to ``get_img``.
        img_cols: second element of the size tuple handed to ``get_img``.

    Returns:
        ``(X_test, X_test_id)``: the list of values returned by ``get_img``
        and the parallel list of base file names, in the same order.
    """
    # NOTE(review): `get_img` is not defined in the visible part of this
    # notebook — confirm it is defined before this function is called.
    print('Read test images')
    pattern = os.path.join(dataset_path, 'test', '*.jpg')
    # glob order depends on the file system; sort so the row order is reproducible.
    files = sorted(glob.glob(pattern))

    X_test = []
    X_test_id = []
    for fl in tqdm(files):
        X_test.append(get_img(fl, (img_rows, img_cols)))
        X_test_id.append(os.path.basename(fl))

    return X_test, X_test_id


def make_predictions(MODEL, image_size, batch_size):
    """Predict class probabilities for the images under 'dataset/to_prediction'.

    Args:
        MODEL: model exposing ``predict_generator``.
        image_size: target size passed to ``flow_from_directory``.
        batch_size: generator batch size.

    Returns:
        The predictions clipped to the range [0.005, 0.995].
    """
    gen = ImageDataGenerator()
    path_test_data = 'dataset/to_prediction'
    test_generator = gen.flow_from_directory(path_test_data, image_size, shuffle=False,
                                             batch_size=batch_size, class_mode=None)

    # Ceiling division: `samples // batch_size + 1` asks for one batch too many
    # when the sample count is an exact multiple of the batch size, which would
    # yield more prediction rows than there are files.
    steps = max(1, (test_generator.samples + batch_size - 1) // batch_size)
    y_predictions = MODEL.predict_generator(test_generator, steps=steps, verbose=1)
    y_predictions = y_predictions.clip(min=0.005, max=0.995)

    # Show the first six file names as a quick check of the generator order.
    for file_name in test_generator.filenames[:6]:
        print(file_name)

    return y_predictions

def create_submission(predictions, test_id):
    """Write the predictions to a time-stamped CSV under ``subm/``.

    Args:
        predictions: 2-D data with ten columns, labelled 'c0'..'c9' in the file.
        test_id: one identifier per prediction row, stored in the 'img' column.

    The file is named ``submission_<YYYY-mm-dd-HH-MM>.csv`` and is written
    without the index; the ``subm`` directory is created when missing.
    """
    class_columns = ['c0', 'c1', 'c2', 'c3', 'c4',
                     'c5', 'c6', 'c7', 'c8', 'c9']
    submission = pd.DataFrame(predictions, columns=class_columns)
    # Attach the identifiers row by row, aligned on the frame's own index.
    submission['img'] = pd.Series(test_id, index=submission.index)

    timestamp = str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
    out_dir = 'subm'
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    sub_file = os.path.join(out_dir, 'submission_' + timestamp + '.csv')
    submission.to_csv(sub_file, index=False)

In [25]:
def gen_kaggle_csv(model, model_image_size, csv_name):
    """Predict on the images under 'dataset/test/' and print a preview.

    Args:
        model: model exposing ``predict_generator``.
        model_image_size: target size passed to ``flow_from_directory``.
        csv_name: currently unused.
            NOTE(review): no CSV is written by this function; presumably the
            export step was never finished — confirm before relying on it.

    Returns:
        None. Prints the prediction shape and the first three clipped rows.
    """
    data_root = "dataset/"  # renamed from `dir`, which shadowed the builtin

    gen = ImageDataGenerator()
    test_generator = gen.flow_from_directory(data_root + "test/", model_image_size, shuffle=False,
                                             batch_size=batch_size, class_mode=None)

    # Ceiling division: avoids the extra batch that `samples // batch_size + 1`
    # requests when the sample count is an exact multiple of the batch size.
    steps = max(1, (test_generator.samples + batch_size - 1) // batch_size)
    y_pred = model.predict_generator(test_generator, steps=steps, verbose=1)
    print("y_pred shape {}".format(y_pred.shape))
    y_pred = y_pred.clip(min=0.005, max=0.995)
    print(y_pred[:3])


In [None]:
# Load the test images here so this cell does not depend on leftover kernel
# state: X_test_to_pred and X_test_id were otherwise never defined in the
# notebook. X_test_id is kept for the submission step.
X_test_images, X_test_id = load_test(224, 224)
X_test_to_pred = np.array(X_test_images)

print("pred begin")
start = time.time()
y_pred = model.predict(X_test_to_pred, verbose=1)
end = time.time()
# Elapsed prediction time in seconds.
print(end - start)

In [114]:
# Write the predictions and their file names to a time-stamped CSV under subm/.
create_submission(predictions=y_pred, test_id=X_test_id)