In [4]:
import os
import shutil

%store -r 

#drop rows where hair_color = -1 to improve training accuracy
train = train[train.hair_color != -1]
validate = validate[validate.hair_color != -1]
test = test[test.hair_color != -1]

#restore train, validate, and test dataframes from other notebook
def reset_images_to_main_dir(delete_dirs=True):
    dest_dir = './images/'
    walker = os.walk(dest_dir, followlinks=False)
    rem_dirs = walker.__next__()[1]
 
    for data in walker:
        for files in data[2]:
            try:
                shutil.move(data[0] + os.sep + files, dest_dir)
            except shutil.Error:
                print(shutil.Error)
                continue
    if delete_dirs:
        for dirs in rem_dirs:
            shutil.rmtree(dest_dir + os.sep + dirs)

def symlink_classes_images(train, validate, test, reset_images=False, 
                           delete_dirs=False):
    if reset_images:
        reset_images_to_main_dir(delete_dirs=delete_dirs)
    imagesPath = os.path.abspath('./images/')
    trainPath = os.path.abspath('./images/train/haircolor')
    validatePath = os.path.abspath('./images/validate/haircolor')
    testPath = os.path.abspath('./images/test/haircolor')
    newPaths = [trainPath, validatePath, testPath]
    dfList = [train, validate, test]
    for path in newPaths:
        if os.path.isdir(path)==False:
            os.mkdir(path)
    
    for df in dfList:
        for idx, row in df.iterrows():
            hair_color = int(row['hair_color'])
            pathAppend = str(hair_color)+'/'
            
            fileName = str(int(row['file_name']))+'.png'
            filePath = os.path.join(imagesPath, fileName)
            trainClassPath = os.path.join(trainPath, pathAppend)
            trainFilePath = os.path.join(trainClassPath, fileName)
            validateClassPath = os.path.join(validatePath, pathAppend)
            validateFilePath = os.path.join(validateClassPath, fileName)
            testClassPath = os.path.join(testPath, pathAppend)
            testFilePath = os.path.join(testClassPath, fileName)
            classPaths = [trainClassPath, validateClassPath, testClassPath]
            for path in classPaths:
                if os.path.isdir(path)==False:
                    os.mkdir(path)
            if(df.equals(test)):
                newFilePath = testFilePath
            if(df.equals(train)):
                newFilePath = trainFilePath
            if(df.equals(validate)):
                newFilePath = validateFilePath
            if os.path.isfile(filePath):
                os.symlink(filePath, newFilePath)
            elif os.path.isfile(testFilePath):
                os.symlink(testFilePath, newFilePath)
            elif os.path.isfile(validateFilePath):
                os.symlink(validateFilePath, newFilePath)
            elif os.path.isfile(trainFilePath):
                os.symlink(trainFilePath, newFilePath)
            else:
                print("File missing: ", fileName)
                
symlink_classes_images(train, validate, test, reset_images=False, delete_dirs=False)

In [7]:
train_hair = train[['file_name', 'hair_color']]
train_hair = train_hair.reset_index(drop=True)
validate_hair = validate[['file_name', 'hair_color']]
validate_hair = validate_hair.reset_index(drop=True)
test_hair = test[['file_name', 'hair_color']]
test_hair = test_hair.reset_index(drop=True)

In [9]:
n_hair = train_hair['hair_color'].value_counts()
print(n_hair)

 1    602
 3    567
 4    483
-1    387
 5    322
 2    320
 0     58
Name: hair_color, dtype: int64


In [11]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU

K.set_image_dim_ordering('th')

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(3, 128, 128)))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
#model.add(LeakyReLU(alpha=0.3))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(7, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [12]:
import os
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

def get_ImageDataGenerator(shear_range=1, 
                           zoom_range=[-1,1], 
                           horizontal_flip=True,
                           rotation_range=30,
                           zca_whitening=True):
    datagen = ImageDataGenerator(
                rescale=1./255,
                shear_range=shear_range,
                zoom_range=zoom_range,
                zca_whitening=zca_whitening,
                horizontal_flip=horizontal_flip,
                rotation_range = rotation_range,
                data_format='channels_first')
    
    return datagen

def get_flow_from_directory(ImageDataGenerator,
                            img_dir,
                            target_size=(128,128),
                            class_mode='binary',
                            batch_size=32,
                            shuffle=True,
                            seed=123):
    gen = ImageDataGenerator.flow_from_directory(
                            directory=img_dir,
                            target_size=target_size,
                            class_mode=class_mode,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            seed=seed,
                            follow_links=True)
    return gen


# this is the augmentation configuration we will use for training
train_datagen = get_ImageDataGenerator(shear_range=0,
                                       zoom_range=0,
                                       horizontal_flip=False)

# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = get_ImageDataGenerator()
#Takes the dataframe and the path to a directory and 
#generates batches of augmented/normalized data.

trainPath = './images/train/haircolor/'
validatePath = './images/validate/haircolor/'
batch_size = 32
target_size=(128,128)
class_mode='categorical'
shuffle=True
seed=123

train_gen = get_flow_from_directory(train_datagen,
                                    trainPath,
                                    target_size,
                                    class_mode,
                                    batch_size,
                                    shuffle,
                                    seed)

validation_gen = get_flow_from_directory(test_datagen,
                                    validatePath,
                                    target_size,
                                    class_mode,
                                    batch_size,
                                    shuffle,
                                    seed)

Found 2739 images belonging to 7 classes.
Found 913 images belonging to 7 classes.


In [13]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from sklearn.utils import class_weight

#class_weights = class_weight.compute_class_weight('balanced',
                                                 #['cartoon', 'human'],
                                                 #train_human.head(2720)['human'])

earlyStopping = EarlyStopping(monitor='val_loss',
                              min_delta=0.001,
                              patience=0,
                              verbose=1, 
                              mode='auto',
                              restore_best_weights=True)

LR = ReduceLROnPlateau(monitor='val_loss', 
                       factor=0.2, 
                       patience=0, 
                       verbose=1, 
                       mode='auto', 
                       min_delta=0.01, 
                       cooldown=0, 
                       min_lr=0.001)
train_gen.reset()
validation_gen.reset()

train_steps=train_gen.n/train_gen.batch_size
valid_steps=validation_gen.n/validation_gen.batch_size

In [14]:
print(train_steps, valid_steps)
print(train_steps*train_gen.batch_size, valid_steps*validation_gen.batch_size)
model.summary()

85.59375 28.53125
2739.0 913.0
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 32, 128, 128)      896       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 32, 126, 126)      9248      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 32, 63, 63)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 32, 63, 63)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 32, 63, 63)        9248      
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 64, 61, 61)        18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 64, 30, 3

In [15]:
model.fit_generator(generator=train_gen,
                    steps_per_epoch=train_steps,
                    validation_data=validation_gen,
                    validation_steps=valid_steps,
                    epochs=50,
                    callbacks=[earlyStopping, LR],
                    class_weight = 'auto')

#model.save_weights('.h5')
model.save_weights('hair_detection_from_dir.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 6/50
Epoch 7/50
Restoring model weights from the end of the best epoch

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 00007: early stopping


In [16]:
validation_gen = get_flow_from_directory(test_datagen,
                                        validatePath,
                                        target_size,
                                        class_mode,
                                        1,
                                        False)

validation_gen.reset()

model.evaluate_generator(generator=validation_gen, steps=913, verbose=1)

Found 913 images belonging to 7 classes.


[0.5078581957352798, 0.7929901423877328]

In [17]:
testPath = './images/test/haircolor'


test_gen = get_flow_from_directory(test_datagen,
                                    testPath,
                                    target_size,
                                    class_mode,
                                    1,
                                    False)

test_gen.reset()

model.evaluate_generator(generator=test_gen, steps=913, verbose=1)

Found 913 images belonging to 7 classes.


[0.5042252017561488, 0.7995618838992333]

In [18]:
test_gen.reset()
probabilities = model.predict_generator(test_gen,verbose=1,steps=len(test_gen))



In [31]:
import pandas as pd
import numpy as np
output_df = pd.DataFrame(
    {'filename': test_gen.filenames,
     'y_true': test_gen.classes,
     'y_pred': probabilities.argmax(axis=-1).flatten()
    })

In [30]:
probabilities.argmax(axis=-1)

array([5, 6, 4, 0, 5, 4, 0, 0, 0, 0, 5, 2, 0, 0, 5, 5, 4, 0, 0, 0, 0, 5,
       0, 0, 5, 4, 5, 0, 0, 5, 5, 4, 0, 0, 0, 0, 6, 5, 0, 0, 5, 0, 0, 5,
       5, 5, 0, 4, 2, 0, 0, 0, 4, 4, 4, 0, 5, 2, 5, 4, 5, 0, 4, 4, 4, 0,
       0, 5, 0, 4, 0, 0, 4, 0, 0, 4, 0, 0, 0, 2, 4, 5, 0, 5, 0, 5, 6, 0,
       6, 2, 0, 0, 5, 4, 0, 0, 0, 5, 0, 0, 4, 5, 0, 0, 0, 5, 2, 2, 0, 0,
       0, 6, 0, 0, 0, 6, 0, 0, 4, 0, 0, 5, 5, 0, 2, 5, 0, 0, 4, 0, 4, 0,
       0, 0, 0, 0, 0, 0, 4, 5, 0, 4, 0, 0, 5, 0, 0, 1, 6, 2, 2, 5, 0, 4,
       4, 2, 0, 0, 4, 2, 2, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [32]:
output_df_wrong = output_df.query('y_true != y_pred')
output_df_wrong

Unnamed: 0,filename,y_true,y_pred
0,-1/101.png,0,5
1,-1/102.png,0,6
2,-1/1044.png,0,4
4,-1/1058.png,0,5
5,-1/1088.png,0,4
10,-1/1185.png,0,5
11,-1/1295.png,0,2
14,-1/1343.png,0,5
15,-1/1372.png,0,5
16,-1/1388.png,0,4


In [34]:
from sklearn.metrics import confusion_matrix
from keras.utils.np_utils import to_categorical
y_true = test_gen.classes
y_pred = probabilities.argmax(axis=-1)
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[ 78,   0,   8,   0,  24,  31,   6],
       [  3,   1,   4,   0,   3,   1,   1],
       [  5,   1, 188,   1,   1,   0,   3],
       [  0,   0,   1, 115,   1,   0,   0],
       [ 29,   0,   7,   0, 138,   5,   1],
       [ 20,   0,   1,   0,  13, 109,   0],
       [  8,   0,   3,   0,   2,   0, 101]])