In [71]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Importing and Formatting Data

In [3]:
# Image data stored in the data column. 
# Stored as a list of 6400*3 ints. First 6400 are red, next are green, the blue. 
# Images are 80x80px
data_json = pd.read_json('shipsnet.json')
data_json.head()

Unnamed: 0,data,labels,locations,scene_ids
0,"[82, 89, 91, 87, 89, 87, 86, 86, 86, 86, 84, 8...",1,"[-118.2254694333423, 33.73803725920789]",20180708_180909_0f47
1,"[76, 75, 67, 62, 68, 72, 73, 73, 68, 69, 69, 6...",1,"[-122.33222866289329, 37.7491755586813]",20170705_180816_103e
2,"[125, 127, 129, 130, 126, 125, 129, 133, 132, ...",1,"[-118.14283073363218, 33.736016066914175]",20180712_211331_0f06
3,"[102, 99, 113, 106, 96, 102, 105, 105, 103, 10...",1,"[-122.34784341495181, 37.76648707436548]",20170609_180756_103a
4,"[78, 76, 74, 78, 79, 79, 79, 82, 86, 85, 83, 8...",1,"[-122.34852408322172, 37.75878462398653]",20170515_180653_1007


In [19]:
rgbs = data_json['data']
rgbs.head()

0    [82, 89, 91, 87, 89, 87, 86, 86, 86, 86, 84, 8...
1    [76, 75, 67, 62, 68, 72, 73, 73, 68, 69, 69, 6...
2    [125, 127, 129, 130, 126, 125, 129, 133, 132, ...
3    [102, 99, 113, 106, 96, 102, 105, 105, 103, 10...
4    [78, 76, 74, 78, 79, 79, 79, 82, 86, 85, 83, 8...
Name: data, dtype: object

In [38]:
rgbs = np.stack(rgbs) # Make 2d np array
r = rgbs[:,:6400]
g = rgbs[:,6400:6400*2]
b = rgbs[:,6400*2:]
r.shape, g.shape, b.shape

((4000, 6400), (4000, 6400), (4000, 6400))

$$Black\ and\ white = \frac{r + g + b}{3}$$

In [81]:
black_white = (r+g+b)/3
# black_white = black_white.reshape([4000, 80, 80])
black_white.shape

(4000, 6400)

# Viewing Images

In [82]:
from ipywidgets import interact
import matplotlib.pyplot as plt
def browse_images(images, labels, categories):
    n = len(images)
    def view_image(i):
        plt.imshow(images[i], cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('%s' % categories[labels[i]])
        plt.axis('off')
        plt.show()
    interact(view_image, i=(0,n-1))
browse_images(black_white.reshape([4000, 80, 80]), data_json['labels'], ['Not Boat', 'Boat'])

# Applying Model

In [85]:
# Split data
test_vectors, train_vectors, test_labels, train_labels = train_test_split(black_white, data_json['labels'],test_size=0.8, train_size=0.2)


In [86]:
import time
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pickle

start = time.time()

#make some temporary variables so you can change this easily
tmp_vectors = train_vectors
tmp_labels = train_labels

print("Fitting the classifier to the training set")
# a dictionary of hyperparameters: key is the name of the parameter, value is a list of values to test
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
              'kernel': ['linear']}
# make a classifier by searching over a classifier and the parameter grid
clf = GridSearchCV(SVC(class_weight='balanced'), param_grid)

# we have a "good" classifier (according to GridSearchCV), how's it look
clf = clf.fit(tmp_vectors, tmp_labels)
print("Best estimator found by grid search:")
print(clf.best_estimator_)
print("Best parameters found by grid search:")
print(clf.best_params_)

end = time.time()
print("Runtime",end - start)

pickle.dump(clf, open('best_classifier.p', 'wb'))
# Opposite is clf = pickle.load(open(filename_pca, 'rb'))

Fitting the classifier to the training set
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best parameters found by grid search:
{'C': 1000.0, 'kernel': 'linear', 'gamma': 0.0001}
Runtime 3358.4540390968323


In [88]:
predictions = clf.predict(test_vectors)

In [91]:
sum(predictions == test_labels)/len(test_labels) # First run was 86% accurate.

0.8625

In [98]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print(classification_report(test_labels, predictions))
print(confusion_matrix(test_labels, predictions, labels=range(2)))

             precision    recall  f1-score   support

          0       0.95      0.87      0.90       601
          1       0.68      0.85      0.76       199

avg / total       0.88      0.86      0.87       800

[[520  81]
 [ 29 170]]
