# Test feature extraction code in new `image_classifier.py`, and extract features for 3 species

Sunday, May 13, 2018

In [1]:
import numpy as np
import pandas as pd
import imagehash
from keras.preprocessing import image

Using TensorFlow backend.


In [2]:
#Enable IPython's autoreload extension in mode 2, so that edits to the project's
#.py modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys

#Keep both the relative location of the project's home directory and its
#absolute form (resolved against the current working directory)
project_directory = '../..'
project_path = os.path.abspath(project_directory)

#Register the project path on the module search path, if necessary, so that
#modules in its subdirectories can be imported
already_importable = project_path in sys.path
if not already_importable:
    sys.path.append(project_path)

In [6]:
#Import ImageManager to load our database, and ImageClassifier to extract features
from src.image_manager import ImageManager
from src.image_classifier import ImageClassifier

In [7]:
#Location of the photo folders (the trailing slash is kept as-is)
photo_directory = os.path.join(project_path, 'tree_photos/')

#Locations of the two .csv files that make up the saved image database,
#both of which live in the project's 'data' directory
image_df_path, syncs_df_path = (
    os.path.join(project_path, 'data', csv_name)
    for csv_name in ('image_log_20180511.csv', 'image_syncs_20180511.csv')
)

## Load our image database by creating an ImageManager from the saved .csv's

In [9]:
#Create an image manager from our stored image database: the photo directory
#plus the saved image-log and image-syncs .csv files
manager = ImageManager(photo_directory, image_df_path, syncs_df_path)
#Preview the first rows of the loaded image dataframe
manager.image_df.head()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,2018-05-11 18:13:24.267713,2018-05-11 18:13:24.267713
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,2018-05-11 18:13:24.276322,2018-05-11 18:13:24.276322
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,2018-05-11 18:13:24.284686,2018-05-11 18:13:24.284686
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,2018-05-11 18:13:24.292620,2018-05-11 18:13:24.292620
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,2018-05-11 18:13:24.298834,2018-05-11 18:13:24.298834


In [11]:
#Take an independent copy of the image dataframe without the timestamp columns.
#We will use this dataframe to store extracted features, so the explicit .copy()
#guarantees that later writes never touch manager.image_df and do not trigger
#pandas' SettingWithCopyWarning.
image_df = manager.image_df[['p_hash', 'filename', 'folder']].copy()
image_df.head()

Unnamed: 0,p_hash,filename,folder
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum


## Create an ImageClassifier to extract features, and test the feature extraction code

This will load the pretrained Inception V3 network.

In [14]:
#Instantiate the project's ImageClassifier, which is used below to extract features
classifier = ImageClassifier()

In [40]:
#Smoke-test the extraction on just the first five images before running it at scale
feature_df = classifier.extract_features_from_path_df(
    image_df.iloc[:5],
    photo_directory,
)
feature_df

0 images processed. Time = 2018-05-14 15:17:06.264983.
4 images processed. Time = 2018-05-14 15:17:08.452745.


Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,0.180042,0.057599,0.632024,0.021612,0.297033,0.011966,0.903157,...,0.36049,0.009282,0.055258,0.110056,0.058888,0.201237,0.198077,1.394304,0.155102,0.0
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,0.667066,0.249269,0.074132,0.177577,0.52058,0.167225,0.233005,...,1.446164,0.920353,0.823285,0.617185,0.121126,0.065866,0.876259,0.226926,0.274808,0.0
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,0.159188,0.011783,0.187351,0.211521,0.466182,0.387081,0.217815,...,1.198391,0.460702,0.157247,0.267633,0.006349,0.466702,0.641492,1.066418,0.145887,0.223863
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,0.658278,0.010192,0.344359,0.192652,0.342696,0.362848,0.562928,...,0.257585,0.077631,0.057183,0.411228,0.417154,0.048589,0.421282,0.057896,0.325496,0.145896
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,0.056695,0.255299,0.407071,0.066481,7.6e-05,0.568202,0.362414,...,1.289151,0.578341,0.963826,0.028785,0.310883,0.001386,1.154405,0.034431,0.768076,0.217323


Seems to be working, after fixing some bugs with the array dimensions.

## Create a dataframe containing just the 3 original species, and extract features from all images of those species

In [35]:
#Keep only the rows whose folder is one of the three original species
three_species_df = image_df[
    image_df['folder'].isin(
        ['acer_macrophyllum', 'picea_sitchensis', 'platanus_acerifolia']
    )
]
three_species_df.head()

Unnamed: 0,p_hash,filename,folder
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum


In [36]:
#Summarize the filtered dataframe: row count, columns, dtypes and non-null counts
three_species_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 5053
Data columns (total 3 columns):
p_hash      1639 non-null object
filename    1639 non-null object
folder      1639 non-null object
dtypes: object(3)
memory usage: 51.2+ KB


In [41]:
#Extract features for every image of the three species. This is the slow step:
#the full run over 1639 images took roughly 12 minutes (see the timing note below).
feature_df = classifier.extract_features_from_path_df(three_species_df, photo_directory)
#Preview the first rows of the extracted features
feature_df.head()

0 images processed. Time = 2018-05-14 15:17:56.338002.
100 images processed. Time = 2018-05-14 15:18:40.234191.
200 images processed. Time = 2018-05-14 15:19:24.429318.
300 images processed. Time = 2018-05-14 15:20:08.951134.
400 images processed. Time = 2018-05-14 15:20:53.560533.
500 images processed. Time = 2018-05-14 15:21:38.687357.
600 images processed. Time = 2018-05-14 15:22:23.300011.
700 images processed. Time = 2018-05-14 15:23:07.649072.
800 images processed. Time = 2018-05-14 15:23:52.205023.
900 images processed. Time = 2018-05-14 15:24:36.184868.
1000 images processed. Time = 2018-05-14 15:25:20.327171.
1100 images processed. Time = 2018-05-14 15:26:09.168987.
1200 images processed. Time = 2018-05-14 15:26:56.805052.
1300 images processed. Time = 2018-05-14 15:27:41.682170.
1400 images processed. Time = 2018-05-14 15:28:28.623232.
1500 images processed. Time = 2018-05-14 15:29:12.992759.
1600 images processed. Time = 2018-05-14 15:29:57.737628.
1638 images processed. Tim

Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,0.180042,0.057599,0.632024,0.021612,0.297033,0.011966,0.903157,...,0.36049,0.009282,0.055258,0.110056,0.058888,0.201237,0.198077,1.394304,0.155102,0.0
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,0.667066,0.249269,0.074132,0.177577,0.52058,0.167225,0.233005,...,1.446164,0.920353,0.823285,0.617185,0.121126,0.065866,0.876259,0.226926,0.274808,0.0
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,0.159188,0.011783,0.187351,0.211521,0.466182,0.387081,0.217815,...,1.198391,0.460702,0.157247,0.267633,0.006349,0.466702,0.641492,1.066418,0.145887,0.223863
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,0.658278,0.010192,0.344359,0.192652,0.342696,0.362848,0.562928,...,0.257585,0.077631,0.057183,0.411228,0.417154,0.048589,0.421282,0.057896,0.325496,0.145896
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,0.056695,0.255299,0.407071,0.066481,7.6e-05,0.568202,0.362414,...,1.289151,0.578341,0.963826,0.028785,0.310883,0.001386,1.154405,0.034431,0.768076,0.217323


In [42]:
#Check the last rows too, to confirm features were extracted through the end of the dataframe
feature_df.tail()

Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
5049,d3bba6834b0a4b69,image_platanus_acerifolia_leaves_28.png,platanus_acerifolia,0.19323,0.089365,0.376612,0.072012,0.537932,0.566757,0.042445,...,0.686257,0.146267,0.367862,0.310418,0.0,0.84856,0.349558,1.125117,1.056942,0.336927
5050,efdee087d8d01815,image_london_plane_flowers_12.png,platanus_acerifolia,0.325498,0.377808,0.286472,0.040422,0.046696,0.154635,0.075962,...,0.171902,0.227308,0.19737,0.245082,0.028123,0.172103,1.110501,0.096858,0.241172,0.617346
5051,ffaa20f9d5455540,image_platanus_acerifolia_bark_19.png,platanus_acerifolia,0.282451,0.002418,0.184036,0.002895,1.042674,0.301266,0.418488,...,0.268578,0.239549,0.079454,0.007353,0.392959,0.35031,0.696242,1.62186,0.17626,0.003389
5052,c2c975f21a796946,image_platanus_acerifolia_bark_31.png,platanus_acerifolia,0.237729,0.048624,0.024316,0.182223,0.125612,0.178989,0.489525,...,0.742501,0.673016,0.585756,0.522944,0.116697,0.301665,0.259144,0.110905,0.536671,0.615156
5053,fed96202cf7a068c,image_platanus_acerifolia_bark_25.png,platanus_acerifolia,0.126663,0.012661,0.024963,0.006389,0.260473,0.147539,0.708515,...,1.24744,0.080645,0.027731,0.005829,0.090226,0.003723,0.008948,1.050955,0.737645,0.834034


It took about 45 seconds per 100 images to extract the features on my MacBook Pro. The overall time was 15:30:15 - 15:17:56 = 12 minutes 19 seconds, or 739 seconds, for 1639 images. That's about 0.4509 seconds per image.

In [49]:
#Seconds per image: total extraction time (12 min 19 s, in seconds) divided by the 1639 images
(12*60+19)/1639

0.45088468578401464

In [48]:
#Summarize the feature dataframe: row count, column count, dtypes and memory usage
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 5053
Columns: 2051 entries, p_hash to incv3_out_2047
dtypes: float64(2048), object(3)
memory usage: 25.7+ MB


## Export the extracted features to a .csv