In [1]:
import numpy as np
import visual_bow as bow
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.externals import joblib
import glob
import random

OpenCV VERSION (should be 3.1.0 or later, with nonfree modules installed!): 3.1.0


In [2]:
# Build the pool of negative examples: every path returned by
# bow.neg_img_cal101 for the positive category, each paired with the label False.
positive_folder = 'panda'
all_negs = []
for path in bow.neg_img_cal101(positive_folder):
    all_negs.append((path, False))

# Sanity check: size of the pool and a peek at the first few (path, label) pairs.
print('%i total negative imgs to choose from' % len(all_negs))
print(all_negs[:5])

9106 total negative imgs to choose from
[('101_ObjectCategories/rooster/image_0014.jpg', False), ('101_ObjectCategories/rooster/image_0023.jpg', False), ('101_ObjectCategories/rooster/image_0040.jpg', False), ('101_ObjectCategories/rooster/image_0013.jpg', False), ('101_ObjectCategories/rooster/image_0038.jpg', False)]


In [3]:
# Collect everything in the panda_rip folder as positive examples,
# pairing each path with the label True.
panda_paths = glob.glob('panda_rip/*')
positive_imgs = list(zip(panda_paths, [True] * len(panda_paths)))

print('%i positive images' % len(positive_imgs))
print(positive_imgs[:5])

675 positive images
[('panda_rip/image_0014.jpg', True), ('panda_rip/75.JPEG', True), ('panda_rip/345.JPEG', True), ('panda_rip/30.JPEG', True), ('panda_rip/106.JPEG', True)]


In [4]:
# Balance the classes: draw N random negative images, where N is the number of
# positive images, then concatenate N pos + N neg and shuffle.
#
# Both RNGs used below are seeded first so that a "Restart & Run All" picks the
# same negatives and produces the same ordering every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# random.sample raises ValueError if there are fewer negatives than positives.
chosen_negs = random.sample(all_negs, len(positive_imgs))
imgs = chosen_negs + positive_imgs

# Reorders the list in place so positives and negatives are interleaved.
np.random.shuffle(imgs)

print('%i total images (1:1 positive:negative)' % len(imgs))
print(imgs[:5])

1350 total images (1:1 positive:negative)
[('panda_rip/825.JPEG', True), ('101_ObjectCategories/Faces/image_0013.jpg', False), ('101_ObjectCategories/BACKGROUND_Google/image_0090.jpg', False), ('101_ObjectCategories/octopus/image_0009.jpg', False), ('101_ObjectCategories/joshua_tree/image_0001.jpg', False)]


In [18]:
%%time
# Size of the visual-word codebook (number of k-means clusters).
K_CLUSTERS = 250

# MiniBatchKMeans annoyingly throws tons of deprecation warnings that fill up
# the notebook. Silence them only for the duration of this call:
# catch_warnings() restores the previous filter configuration on exit, even if
# feature generation raises, instead of overriding it with a blanket 'default'.
import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Produces the feature sets and labels for the train / test / validation
    # splits (15% test, 15% validation) plus the clustering model that was used.
    X_train, X_test, X_val, y_train, y_test, y_val, cluster_model = bow.gen_bow_features(
        labeled_img_paths=imgs,
        percent_test=0.15,
        percent_val=0.15,
        cluster_model=MiniBatchKMeans(n_clusters=K_CLUSTERS)
    )

generating SIFT keypoints for 1350 images
Train-test-val split: 946 training rows, 202 test rows, 202 validation rows
390833 descriptors before clustering
Using clustering model MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10,
        n_clusters=250, n_init=3, random_state=None,
        reassignment_ratio=0.01, tol=0.0, verbose=0)...
Clustering on training set to get codebook of 250 words
done clustering
CPU times: user 10min 41s, sys: 22.8 s, total: 11min 3s
Wall time: 3min 37s


## Pickle the SIFT features (comment out this cell to skip)

In [19]:
# Cache every split under pickles/feature_data/<name>.pickle so the slow
# feature-generation cell does not have to be rerun.
feature_sets = [
    ('X_train', X_train), ('X_test', X_test), ('X_val', X_val),
    ('y_train', y_train), ('y_test', y_test), ('y_val', y_val),
]
for obj_name, obj in feature_sets:
    joblib.dump(obj, 'pickles/feature_data/%s.pickle' % obj_name)

## Uncomment to UNpickle the SIFT features

In [20]:
# Disabled by default. When uncommented, this reloads each cached split from
# pickles/feature_data/<name>.pickle and binds it to a variable of the same name,
# then prints the name and length of each loaded object.
# It relies on exec() to create the variables from their string names.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files that
# this notebook itself wrote.
# for obj_name in ['X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val']:
#     exec("{obj_name} = joblib.load('pickles/feature_data/{obj_name}.pickle')".format(obj_name=obj_name))
#     exec("print obj_name, len({0})".format(obj_name))

In [21]:
%%time
# Candidate values for C, shared by both kernels.
# (A wider sweep that was tried: [0.0001, 0.01, 0.1, 1, 10, 100, 1000]; a quick
# single-value run: [1].)
c_vals = [0.1, 1, 5, 10]

# Candidate gamma values, searched for the RBF kernel only.
# (Shorter alternatives for quick runs: [0.5, 0.1] or [0.1].)
gamma_vals = [0.5, 0.1, 0.01, 0.0001, 0.00001]

# Two sub-grids: the linear kernel has no gamma, so it only sweeps C.
linear_grid = {'kernel': ['linear'], 'C': c_vals}
rbf_grid = {'kernel': ['rbf'], 'C': c_vals, 'gamma': gamma_vals}
param_grid = [linear_grid, rbf_grid]

# Despite its name, `svc` is the grid-search wrapper; the winning model is
# available as svc.best_estimator_.
svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)
svc.fit(X_train, y_train)

print('train score (mean accuracy): %s' % svc.score(X_train, y_train))
print('test score (mean accuracy): %s' % svc.score(X_test, y_test))
print(svc.best_estimator_)

train score (mean accuracy): 0.918604651163
test score (mean accuracy): 0.861386138614
SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
CPU times: user 1.49 s, sys: 82.5 ms, total: 1.58 s
Wall time: 11.5 s


# We have our estimator. This is how it classifies a random sample of negative images

In [22]:
# Spot-check the tuned classifier on negatives it has not seen before.
# chosen_negs was drawn from all_negs and fed into the train/test/validation
# splits, so those images are excluded here; sampling straight from all_negs
# could end up re-scoring images the model was trained on.
seen_negs = set(chosen_negs)
unseen_negs = [neg for neg in all_negs if neg not in seen_negs]

# Every sampled image is a negative, so any 'True' prediction is a false positive.
# (The recorded output shows predictions come back as the strings 'True'/'False'.)
for img_path, _label in random.sample(unseen_negs, min(10, len(unseen_negs))):
    print('%s %s' % (img_path, svc.predict(bow.img_to_vect(img_path, cluster_model))))

101_ObjectCategories/BACKGROUND_Google/image_0021.jpg ['True']
101_ObjectCategories/gerenuk/image_0015.jpg ['False']
101_ObjectCategories/Faces_easy/image_0098.jpg ['False']
101_ObjectCategories/Faces/image_0215.jpg ['False']
101_ObjectCategories/octopus/image_0005.jpg ['False']
101_ObjectCategories/Motorbikes/image_0369.jpg ['False']
101_ObjectCategories/starfish/image_0041.jpg ['False']
101_ObjectCategories/BACKGROUND_Google/image_0161.jpg ['False']
101_ObjectCategories/Motorbikes/image_0533.jpg ['False']
101_ObjectCategories/airplanes/image_0006.jpg ['False']


## Pickle the best SVC classifier & kmeans (comment out this cell to skip)

In [23]:
# Persist the two artefacts needed to classify new images later: the best SVC
# found by the grid search and the clustering model that defines the codebook.
# The second call is the cell's last expression, so the list of files joblib
# wrote for the cluster model is what shows up as the cell output.
joblib.dump(svc.best_estimator_, 'pickles/svc/svc.pickle')
joblib.dump(cluster_model, 'pickles/cluster_model/cluster_model.pickle')

['pickles/cluster_model/cluster_model.pickle',
 'pickles/cluster_model/cluster_model.pickle_01.npy',
 'pickles/cluster_model/cluster_model.pickle_02.npy',
 'pickles/cluster_model/cluster_model.pickle_03.npy']

# Try AdaBoost: it's a common choice for SIFT features

In [24]:
%%time

# Value passed to AdaBoostClassifier as n_estimators.
MAX_ESTIMATORS = 200

# Fit AdaBoost with its default base learner on the same training features
# that the SVC grid search used.
ada = AdaBoostClassifier(n_estimators=MAX_ESTIMATORS)
ada.fit(X_train, y_train)

# Report mean accuracy on the training split, then on the test split.
for split_name, features, labels in [('train', X_train, y_train), ('test', X_test, y_test)]:
    print('%s score (mean accuracy): %s' % (split_name, ada.score(features, labels)))

train score (mean accuracy): 0.998942917548
test score (mean accuracy): 0.831683168317
CPU times: user 2.66 s, sys: 3.91 ms, total: 2.66 s
Wall time: 2.63 s


## Pickle the AdaBoostClassifier (comment out this cell to skip)

In [26]:
# Persist the fitted AdaBoost model, then confirm on stdout.
# (The dump is not the cell's last expression, so no trailing ';' is needed to
# suppress its return value.)
joblib.dump(ada, 'pickles/ada/ada.pickle')
print('pickled adaboost')

picked adaboost


# TODO

* Separate out the clustering from the feature generation. They should be 2 different functions, the clustering should take the SIFT **training** data as an argument. It has labels already, right? Then you can save the SIFT data before clustering. Finally, you can do a grid search across K_CLUSTERS.

* Also it would be cool to graph the above.