In [11]:
import glob
import os.path
import random

import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile

from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit

In [21]:
# NOTE(review): not referenced by the code visible here; presumably the length
# of one cached bottleneck vector -- confirm.
BOTTLENECK_TENSOR_SIZE = 2048
# Directory of cached bottleneck files; main() reads both the training and the
# validation bottlenecks from here.
TRAIN_BOTTLENECK_CACHE_DIR = 'tmp/bottleneck/'
# NOTE(review): not referenced by the code visible here; presumably the cache
# for the final test set -- confirm.
FINAL_CACHE_DIR = 'tmp/final_bottleneck/'
# Alternative cache locations, currently disabled:
#svm_bottleneck_dir = 'svm_cache_data/bottleneck'
#final_test_bottleneck_cache_dir = 'svm_cache_data/final_bottleneck/'

### Hyperparameters

In [13]:
# NOTE(review): LEARNING_RATE and EPOCHS are not referenced by any code visible
# in this section of the notebook -- confirm they are still needed.
LEARNING_RATE = 0.001
EPOCHS = 20
# Number of samples per batch.
# NOTE(review): presumably the intended default for get_batches -- confirm.
BATCH_SIZE = 1000

In [14]:
def shuffle_and_split_bottleneck(cache_dir=None, test_size=0.2, random_state=None):
    '''
    :Shuffle the cached bottleneck file names and split them into stratified
    :training and validation sets
    :cache_dir -- directory to list; None (default) means TRAIN_BOTTLENECK_CACHE_DIR
    :test_size -- share of the files held out for validation (default 0.2)
    :random_state -- optional seed forwarded to StratifiedShuffleSplit so the
    :                split can be reproduced; None keeps it random
    :Return train_x, train_y, val_x, val_y as numpy ndarrays: *_x hold file
    :names, *_y hold the first three characters of each name as the class label
    '''
    # Resolved at call time so that re-running the configuration cell is honoured.
    if cache_dir is None:
        cache_dir = TRAIN_BOTTLENECK_CACHE_DIR

    # os.listdir returns entries in arbitrary order; sorting makes the split
    # depend only on random_state and the directory contents.
    codes = np.array(sorted(os.listdir(cache_dir)))
    # The label is the three-character file name prefix (e.g. 'dog').
    labels = np.array([code[:3] for code in codes])

    ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, val_idx = next(ss.split(codes, labels))

    train_x, train_y = codes[train_idx], labels[train_idx]
    val_x, val_y = codes[val_idx], labels[val_idx]
    return train_x, train_y, val_x, val_y

In [5]:
def get_bottleneck_value_from_file(x,y,cache_dir):
    """
    : Load cached bottleneck vectors and build their binary labels
    : x is a sequence of bottleneck file names located inside cache_dir
    : y is a sequence of class names aligned with x
    : each file is read as one line of comma-separated floats
    : return (bottlenecks, labels) as np.array; every label is a one-element
    : row holding 1 when the class name equals 'dog' and 0 otherwise
    """
    feature_rows, label_rows = [], []
    for idx, file_name in enumerate(x):
        file_path = os.path.join(cache_dir, file_name)
        with open(file_path, 'r') as handle:
            raw_values = handle.read().split(',')
        feature_rows.append(list(map(float, raw_values)))
        # y is indexed by position so a too-short y still fails loudly.
        if y[idx] == 'dog':
            label_rows.append([1])
        else:
            label_rows.append([0])
    return np.array(feature_rows), np.array(label_rows)

In [7]:
def get_batches(x, y, cache_dir, batch_size=BATCH_SIZE):
    """
    : Yield (bottlenecks, labels) batches loaded from cached bottleneck files
    : x is a list of bottleneck file names, e.g. 'dog.1023.txt'
    : y is the list of class names matching x, e.g. 'dog'
    : cache_dir is the directory that holds the files named in x
    : batch_size is the number of samples per batch (defaults to BATCH_SIZE)
    : the last batch also carries the remainder, so it can hold up to
    : 2 * batch_size - 1 samples; an input shorter than batch_size is
    : yielded as one single batch, and an empty input yields nothing
    : raises ValueError (on first iteration) when batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    total = len(x)
    if total == 0:
        return

    # At least one batch, so short inputs are not silently dropped.
    batch_num = max(total // batch_size, 1)
    for batch_idx in range(batch_num):
        start = batch_idx * batch_size
        # The final batch runs to the end of the data to absorb the remainder.
        stop = total if batch_idx == batch_num - 1 else start + batch_size
        bottlenecks, labels = get_bottleneck_value_from_file(x[start:stop], y[start:stop], cache_dir)
        yield bottlenecks, labels

In [46]:
def main():
    """
    : Train an RBF-kernel SVM (C=0.9) on the training split of the cached
    : bottlenecks, then print the validation predictions, the validation
    : labels and the validation accuracy
    """
    train_names, train_classes, val_names, val_classes = shuffle_and_split_bottleneck()

    # training
    train_features, train_targets = get_bottleneck_value_from_file(
        train_names, train_classes, TRAIN_BOTTLENECK_CACHE_DIR)
    # Labels are loaded as one-element rows; squeeze them to a flat vector for fit.
    train_targets = np.squeeze(train_targets)
    classifier = svm.SVC(C=0.9, kernel='rbf')
    classifier.fit(train_features, train_targets)

    # validation
    val_features, val_targets = get_bottleneck_value_from_file(
        val_names, val_classes, TRAIN_BOTTLENECK_CACHE_DIR)
    predictions = classifier.predict(val_features)
    print(predictions)
    print(val_targets)

    # Share of validation samples whose prediction matches the flattened label.
    hits = np.equal(predictions, np.squeeze(val_targets))
    accuracy = np.mean(hits.astype('float32'))
    print('Accuracy: {:.4f}'.format(accuracy))

In [47]:
# Run the train/validate pipeline only when this code executes as the main module.
if __name__ == '__main__':
    main()

[0 0 1 ..., 1 0 0]
[[0]
 [0]
 [1]
 ..., 
 [1]
 [0]
 [0]]
Accuracy: 0.9952
