# Extract features from additional species

Friday, May 18, 2018

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project's source files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import imagehash
from keras.preprocessing import image

Using TensorFlow backend.


In [3]:
import os
import sys

# The project's home directory, kept both as the relative path from this
# notebook and as its absolute equivalent.
project_directory = '../..'
project_path = os.path.abspath(project_directory)

# Add the project path to the system path if necessary (only once), so that
# modules in the project's subdirectories can be imported.
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

In [59]:
#Import ImageManager to load/create our database, and ImageClassifier to extract features
import src.image_manager as im
from src.image_manager import ImageManager
from src.image_classifier import ImageClassifier

In [5]:
# Locations of the photo tree and of the dated (2018-05-11) CSV files that
# back the existing image database.
data_directory = os.path.join(project_path, 'data')
photo_directory = os.path.join(project_path, 'tree_photos/')
old_image_df_path = os.path.join(data_directory, 'image_log_20180511.csv')
old_syncs_df_path = os.path.join(data_directory, 'image_syncs_20180511.csv')

## Test syncing updates into old image manager vs. creating one from scratch...

In [6]:
# old_manager is built from the photo directory plus the 2018-05-11 image and
# sync log files; new_manager is given only the photo directory, so it starts
# without those logs.
old_manager = ImageManager(photo_directory, old_image_df_path, old_syncs_df_path)
new_manager = ImageManager(photo_directory)

In [7]:
# Run a sync on the manager that was loaded from the 2018-05-11 logs.
old_manager.sync_images()

In [8]:
# Summarize the synced image table: row count, columns, and dtypes.
old_manager.image_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5720 entries, 0 to 5719
Data columns (total 5 columns):
p_hash           5720 non-null object
filename         5720 non-null object
folder           5720 non-null object
time_added       5720 non-null datetime64[ns]
time_verified    5720 non-null datetime64[ns]
dtypes: datetime64[ns](2), object(3)
memory usage: 268.1+ KB


In [9]:
# Display the duplicates old_manager reports. Judging by the output, each entry
# pairs an image hash with the list of file paths that share it.
old_manager.get_duplicates()

{array([[ True,  True,  True,  True,  True,  True, False,  True],
        [False,  True,  True, False,  True,  True,  True, False],
        [ True,  True,  True,  True, False, False,  True, False],
        [False, False,  True,  True, False,  True, False,  True],
        [ True, False, False, False,  True,  True,  True, False],
        [ True, False, False, False, False, False, False, False],
        [False, False, False, False, False,  True,  True, False],
        [False, False,  True,  True, False,  True, False,  True]], dtype=bool): ['/Users/ndbs/tree-logic/tree_photos/bad_images/image_betula_pendula_671.png',
  '/Users/ndbs/tree-logic/tree_photos/betula_pendula/image_betula_pendula_671.png'],
 array([[ True, False,  True,  True,  True,  True,  True, False],
        [False, False, False,  True,  True, False,  True, False],
        [ True,  True,  True,  True, False, False,  True,  True],
        [ True,  True,  True, False,  True, False, False, False],
        [False, False, False, 

In [14]:
# Count the duplicate entries reported for old_manager.
len(old_manager.get_duplicates())

129

I'm wary of calling `remove_duplicates()`, for fear of deleting files that shouldn't be deleted. In this case, what **should** happen is that it just removes outdated paths from the dictionary without actually deleting any files, but I'm not sure how to easily verify that this is what will actually happen, given that there are 129 files to check...

In [10]:
# Shorthand reference to old_manager's image table (the same object, not a copy),
# followed by a preview of its first rows.
old_df = old_manager.image_df
old_df.head()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,2018-05-11 18:13:24.267713,2018-05-18 19:16:21.868071
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,2018-05-11 18:13:24.276322,2018-05-18 19:16:21.922157
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,2018-05-11 18:13:24.284686,2018-05-18 19:16:21.968920
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,2018-05-11 18:13:24.292620,2018-05-18 19:16:22.015716
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,2018-05-11 18:13:24.298834,2018-05-18 19:16:22.058224


In [13]:
# Last few rows whose folder is 'bad_images'.
old_df[old_df['folder']=='bad_images'].tail()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
5685,ea4e83e5c8d98d25,image_alnus_rubra_216.png,bad_images,2018-05-11 18:14:09.762272,2018-05-18 19:20:44.923528
5696,9899638654f0e3af,image_alnus_rubra_362.png,bad_images,2018-05-11 18:14:09.854753,2018-05-18 19:20:52.603519
5697,ff8c9f0e801e9c1c,image_alnus_rubra_404.png,bad_images,2018-05-11 18:14:09.859830,2018-05-18 19:20:50.553448
5699,81d43e2a4474eef3,image_alnus_rubra_376.png,bad_images,2018-05-11 18:14:09.875807,2018-05-18 19:20:48.697751
5715,e3d1a51ade1ed221,image_alnus_rubra_161.png,bad_images,2018-05-11 18:14:10.009734,2018-05-18 19:20:43.238080


Ok, good, it looks like the newly added images are the ones I moved to the `bad_images` folder, which is the expected behavior. These **should** be the only images that were counted as duplicates because the old dataframe would have had the old paths stored, and running `sync_images()` would have found the new path instead, but there should still be only one copy of each image.

Ok, I temporarily commented out the code that deletes images in `remove_duplicates()` and replaced it with a print statement so I can test whether this worked without doing permanent damage:

In [34]:
# NOTE: for this run the file-deleting code inside remove_duplicates() had been
# temporarily replaced with a print statement (see the preceding text), so no
# output here means no file would have been deleted.
old_manager.remove_duplicates()

Woo hoo! It didn't print anything, so it didn't actually find any duplicate files, just files with the wrong path. So it seems to be working correctly.

In [35]:
# Re-check after remove_duplicates(): an empty result means no duplicates remain.
old_manager.get_duplicates()

{}

In [46]:
# Number of images in the acer_macrophyllum folder. This is one more than there
# used to be, because one file was moved into this folder.
len(old_df[old_df['folder']=='acer_macrophyllum'])

527

### Let's create a new image database and compare it to the old one

Good, also no duplicates:

In [15]:
# Run a sync on the manager that was created without any existing log files.
new_manager.sync_images()

In [16]:
# Check the freshly built database for duplicates (an empty result means none).
new_manager.get_duplicates()

{}

In [17]:
# Preview the first rows of the freshly built image table.
new_manager.image_df.head()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,2018-05-18 19:28:48.351610,2018-05-18 19:28:48.351610
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,2018-05-18 19:28:48.376517,2018-05-18 19:28:48.376517
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,2018-05-18 19:28:48.401984,2018-05-18 19:28:48.401984
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,2018-05-18 19:28:48.425450,2018-05-18 19:28:48.425450
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,2018-05-18 19:28:48.448109,2018-05-18 19:28:48.448109


In [32]:
# Summarize the freshly built image table: row count, columns, and dtypes.
new_manager.image_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5720 entries, 0 to 5719
Data columns (total 5 columns):
p_hash           5720 non-null object
filename         5720 non-null object
folder           5720 non-null object
time_added       5720 non-null datetime64[ns]
time_verified    5720 non-null datetime64[ns]
dtypes: datetime64[ns](2), object(3)
memory usage: 428.1+ KB


In [29]:
# Identifying columns shared by both tables; `columns` is reused by a later cell.
columns = ['p_hash', 'filename', 'folder']
# Element-wise equality of the two tables at the same index label, showing the
# last 30 rows. False means the row at that label differs between old and new.
(new_manager.image_df[columns] == old_df[columns]).tail(30)

Unnamed: 0,p_hash,filename,folder
5690,False,False,True
5691,False,False,True
5692,False,False,True
5693,False,False,True
5694,False,False,True
5695,False,False,True
5696,False,False,False
5697,False,False,False
5698,False,False,True
5699,False,False,False


In [24]:
# Rows labelled 5690 through 5719 of the new table (.loc slicing is label-based
# and includes both endpoints).
new_manager.image_df.loc[5690:5719,:]

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
5690,aab2196d29996ab6,image_alnus_rubra_570.png,alnus_rubra,2018-05-18 19:29:50.570651,2018-05-18 19:29:50.570651
5691,d1ee70104ef801ff,image_alnus_rubra_564.png,alnus_rubra,2018-05-18 19:29:50.579092,2018-05-18 19:29:50.579092
5692,a3dc685e9758824f,image_alnus_rubra_202.png,alnus_rubra,2018-05-18 19:29:50.587786,2018-05-18 19:29:50.587786
5693,9cc0f86948b3e679,image_alnus_rubra_148.png,alnus_rubra,2018-05-18 19:29:50.596089,2018-05-18 19:29:50.596089
5694,b4034ad838bd95cf,image_alnus_rubra_606.png,alnus_rubra,2018-05-18 19:29:50.605367,2018-05-18 19:29:50.605367
5695,e5314d6c76d58632,image_alnus_rubra_160.png,alnus_rubra,2018-05-18 19:29:50.614693,2018-05-18 19:29:50.614693
5696,d508ca8f1c7f11d3,image_alnus_rubra_174.png,alnus_rubra,2018-05-18 19:29:50.623825,2018-05-18 19:29:50.623825
5697,ae97d7486b6b7100,image_alnus_rubra_612.png,alnus_rubra,2018-05-18 19:29:50.632406,2018-05-18 19:29:50.632406
5698,c4b0f80abfa16fc8,image_alnus_rubra_438.png,alnus_rubra,2018-05-18 19:29:50.641443,2018-05-18 19:29:50.641443
5699,bf9929a434e162b3,image_alnus_rubra_76.png,alnus_rubra,2018-05-18 19:29:50.650410,2018-05-18 19:29:50.650410


In [25]:
# The same label range (5690 through 5719, inclusive) from the old table, for a
# side-by-side comparison with the new one.
old_df.loc[5690:5719,:]

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
5690,b4034ad838bd95cf,image_alnus_rubra_606.png,alnus_rubra,2018-05-11 18:14:09.803820,2018-05-18 19:20:41.297036
5691,e5314d6c76d58632,image_alnus_rubra_160.png,alnus_rubra,2018-05-11 18:14:09.812651,2018-05-18 19:20:41.338771
5692,d508ca8f1c7f11d3,image_alnus_rubra_174.png,alnus_rubra,2018-05-11 18:14:09.821449,2018-05-18 19:20:41.380738
5693,ae97d7486b6b7100,image_alnus_rubra_612.png,alnus_rubra,2018-05-11 18:14:09.829841,2018-05-18 19:20:41.429445
5694,c4b0f80abfa16fc8,image_alnus_rubra_438.png,alnus_rubra,2018-05-11 18:14:09.838400,2018-05-18 19:20:41.490577
5695,bf9929a434e162b3,image_alnus_rubra_76.png,alnus_rubra,2018-05-11 18:14:09.846906,2018-05-18 19:20:41.551005
5696,9899638654f0e3af,image_alnus_rubra_362.png,bad_images,2018-05-11 18:14:09.854753,2018-05-18 19:20:52.603519
5697,ff8c9f0e801e9c1c,image_alnus_rubra_404.png,bad_images,2018-05-11 18:14:09.859830,2018-05-18 19:20:50.553448
5698,d1ea6b8d651624da,image_alnus_rubra_410.png,alnus_rubra,2018-05-11 18:14:09.868094,2018-05-18 19:20:41.609942
5699,81d43e2a4474eef3,image_alnus_rubra_376.png,bad_images,2018-05-11 18:14:09.875807,2018-05-18 19:20:48.697751


Ahh, so it looks like the order of the files is different in the new dataframe vs. the old one. This could potentially be problematic if I use the indices to access data during feature extraction or model training, so I need to be careful...

## Let's save old_manager's database to file, to have an ongoing record of when syncs occur

Let's remove the date from the filename and just keep reusing the same files by default. I may want to change ImageManager to use the new names as the default, and you'd have to explicitly pass in `None` if you want to create a new database from scratch. Except that I need the full path, not just the filenames...

In [30]:
# Sync history of old_manager: one row per sync, with start/end times and folders.
old_manager.syncs_df

Unnamed: 0,time_started,time_completed,folders
0,2018-05-11 18:13:24.261373,2018-05-11 18:14:10.044293,"['acer_macrophyllum', 'betula_pendula', 'thuja..."
1,2018-05-18 19:16:21.762036,2018-05-18 19:20:54.404174,"['acer_macrophyllum', 'betula_pendula', 'thuja..."


In [31]:
# Sync history of new_manager, for comparison with old_manager's.
new_manager.syncs_df

Unnamed: 0,time_started,time_completed,folders
0,2018-05-18 19:28:48.322951,2018-05-18 19:29:50.824053,"['acer_macrophyllum', 'betula_pendula', 'thuja..."


In [36]:
# Undated log file names, intended to be reused on every sync from now on.
data_directory = os.path.join(project_path, 'data')
image_df_path, syncs_df_path = (
    os.path.join(data_directory, log_name)
    for log_name in ('image_log.csv', 'image_syncs.csv')
)

In [37]:
# Write old_manager's logs out to the undated image-log and sync-log paths.
old_manager.export_logs(image_df_path, syncs_df_path)

In [38]:
# Shell listing of the data directory, to confirm the log files were written.
!ls -l ../../data

total 214048
-rw-r--r--  1 ndbs  staff  64733256 May 15 22:05 features_3species_20180514.csv
-rw-r--r--  1 ndbs  staff     25651 Feb  7 16:01 gb_confusion.png
-rw-r--r--  1 ndbs  staff   3105147 Feb  1 16:44 gb_model_jan30.pkl
-rw-r--r--  1 ndbs  staff    706729 May 18 20:15 image_log.csv
-rw-r--r--  1 ndbs  staff    488281 Feb  5 00:30 image_log_20180204.csv
-rw-r--r--  1 ndbs  staff    711665 Feb  5 22:03 image_log_20180205.csv
-rw-r--r--  1 ndbs  staff    707031 May 11 18:34 image_log_20180511.csv
-rw-r--r--  1 ndbs  staff       611 May 18 20:15 image_syncs.csv
-rw-r--r--  1 ndbs  staff       141 Feb  4 22:51 image_syncs_20180204.csv
-rw-r--r--  1 ndbs  staff       427 Feb  5 22:03 image_syncs_20180205.csv
-rw-r--r--  1 ndbs  staff       324 May 11 18:34 image_syncs_20180511.csv
-rw-r--r--  1 ndbs  staff     82211 May 18 16:51 predictions_3species_test_20180518.csv
-rw-r--r--  1 ndbs  staff    328985 May 18 16:51 predictions_3species_train_20180518.csv
-rw-r--r--  1 nd

## Ok, let's get to some feature extraction!

In [48]:
# First batch of species to extract features for. 'cedrus_libani' and
# 'pseudotsuga_menziesii' are handled as a separate batch later in the notebook.
species_names = ['thuja_plicata', 'alnus_rubra']

In [43]:
# Keep only the identifying columns (p_hash, filename, folder) of the old
# manager's table; `columns` comes from the earlier comparison cell.
image_df = old_df[columns]
image_df.head()

Unnamed: 0,p_hash,filename,folder
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum


In [50]:
# Boolean mask of the rows whose folder is one of the selected species,
# then the matching subset and a preview of it.
in_selected_species = image_df['folder'].isin(species_names)
two_species_df = image_df[in_selected_species]
two_species_df.head()

Unnamed: 0,p_hash,filename,folder
1262,c5c3866ab4dc8637,image_western_red_cedar_tree_17.png,thuja_plicata
1263,83955f6a27e8417c,image_western_red_cedar_27.png,thuja_plicata
1264,d3d39896b086f45a,image_western_red_cedar_33.png,thuja_plicata
1265,8d889a9666a9d99b,image_thuja_plicata_branches_58.png,thuja_plicata
1266,9d00e21bd27c4fb5,image_thuja_plicata_cones_58.png,thuja_plicata


In [51]:
# Number of images selected for this batch of species.
len(two_species_df)

1142

In [52]:
# Create the feature extractor with its default arguments.
classifier = ImageClassifier()

In [54]:
# Extract features for every row of two_species_df, locating the images under
# photo_directory. Progress is printed every 100 images.
# NOTE(review): slow step, about 8.5 minutes for 1142 images per the timestamps below.
feature_df = classifier.extract_features_from_path_df(two_species_df, photo_directory)
feature_df.head()

0 images processed. Time = 2018-05-18 20:43:12.682196.
100 images processed. Time = 2018-05-18 20:43:58.083745.
200 images processed. Time = 2018-05-18 20:44:43.942056.
300 images processed. Time = 2018-05-18 20:45:29.582039.
400 images processed. Time = 2018-05-18 20:46:14.847800.
500 images processed. Time = 2018-05-18 20:47:00.004316.
600 images processed. Time = 2018-05-18 20:47:45.247356.
700 images processed. Time = 2018-05-18 20:48:29.845546.
800 images processed. Time = 2018-05-18 20:49:14.631303.
900 images processed. Time = 2018-05-18 20:49:59.263198.
1000 images processed. Time = 2018-05-18 20:50:44.165270.
1100 images processed. Time = 2018-05-18 20:51:28.896780.
1141 images processed. Time = 2018-05-18 20:51:47.649298.


Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
1262,c5c3866ab4dc8637,image_western_red_cedar_tree_17.png,thuja_plicata,1.336279,0.011943,0.39557,0.218817,0.317935,0.056648,0.087848,...,0.433737,0.159791,0.323293,0.009519,0.02845,0.134239,0.136767,0.110591,0.925633,0.000594
1263,83955f6a27e8417c,image_western_red_cedar_27.png,thuja_plicata,0.043319,0.085838,0.34609,0.26924,0.104244,0.059988,0.322002,...,0.215635,0.03855,0.12375,0.001528,0.20711,0.146056,0.085234,0.05356,0.083258,0.380264
1264,d3d39896b086f45a,image_western_red_cedar_33.png,thuja_plicata,1.537653,0.012991,0.189,0.226903,0.465502,0.050164,0.166596,...,0.50862,0.23328,0.289102,0.243015,0.178973,0.02032,0.253384,0.242,0.159114,0.119313
1265,8d889a9666a9d99b,image_thuja_plicata_branches_58.png,thuja_plicata,0.777048,0.043122,0.167806,0.864698,0.690728,0.504618,0.055242,...,0.283768,0.297945,0.192668,0.231448,0.173066,0.011833,0.944232,0.390534,0.715745,0.0
1266,9d00e21bd27c4fb5,image_thuja_plicata_cones_58.png,thuja_plicata,0.117833,0.227984,0.097914,0.286688,0.123003,0.693201,0.113581,...,0.804989,0.391352,0.54805,0.849486,0.280309,0.712661,0.956842,0.001279,0.351022,0.132506


### Test how time deltas work to incorporate into verbose feature extraction code

In [55]:
# First timestamp of the time-delta experiment.
time1 = pd.Timestamp.now()

In [56]:
# Second timestamp, taken when this cell runs (some seconds after the first).
time2 = pd.Timestamp.now()

In [57]:
# Subtracting two Timestamps yields a Timedelta.
time2-time1

Timedelta('0 days 00:00:14.007531')

In [58]:
# Check how a Timedelta renders inside an f-string, for use in progress messages.
f"diff={time2-time1}"

'diff=0 days 00:00:14.007531'

### Export first feature dataframe to file

In [60]:
# Output CSV for the first batch of features (alnus_rubra and thuja_plicata).
feature_df_path = os.path.join(project_path, 'data', 'features_alnus_thuja.csv')

In [61]:
# Write the first batch's feature table to disk with the image_manager helper.
im.export_df(feature_df, feature_df_path)

### Ok, let's extract features for two more species

In [62]:
# Second batch: select the rows for the remaining two species.
# Note that species_names and two_species_df from the first batch are overwritten here.
species_names = ['cedrus_libani', 'pseudotsuga_menziesii']
in_selected_species = image_df['folder'].isin(species_names)
two_species_df = image_df[in_selected_species]
two_species_df.head()

Unnamed: 0,p_hash,filename,folder
2379,cd3b73689096b9e0,image_cedrus_libani_348.png,cedrus_libani
2380,f20e69b613f10ed1,image_cedrus_libani_360.png,cedrus_libani
2381,b3e3541ceb6c049b,image_cedrus_libani_406.png,cedrus_libani
2382,a9a5564e99a75649,image_cedrus_libani_412.png,cedrus_libani
2383,80e1431edceb17b9,image_cedrus_libani_374.png,cedrus_libani


In [63]:
# Summarize the second batch's selection: row count, index range, and dtypes.
two_species_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1152 entries, 2379 to 3556
Data columns (total 3 columns):
p_hash      1152 non-null object
filename    1152 non-null object
folder      1152 non-null object
dtypes: object(3)
memory usage: 36.0+ KB


In [64]:
# Extract features for the second batch. This overwrites feature_df, whose
# first-batch contents were already exported to CSV above.
# NOTE(review): slow step, about 9 minutes per the "Total time elapsed" line below.
feature_df = classifier.extract_features_from_path_df(two_species_df, photo_directory)
feature_df.head()

0 images processed. Time = 2018-05-18 21:14:20.712064.
100 images processed. Time = 2018-05-18 21:15:09.305318.
200 images processed. Time = 2018-05-18 21:15:58.159089.
300 images processed. Time = 2018-05-18 21:16:47.540436.
400 images processed. Time = 2018-05-18 21:17:35.582893.
500 images processed. Time = 2018-05-18 21:18:21.954327.
600 images processed. Time = 2018-05-18 21:19:08.814283.
700 images processed. Time = 2018-05-18 21:19:55.589557.
800 images processed. Time = 2018-05-18 21:20:46.189072.
900 images processed. Time = 2018-05-18 21:21:33.559724.
1000 images processed. Time = 2018-05-18 21:22:20.878959.
1100 images processed. Time = 2018-05-18 21:23:08.849265.
1151 images processed. Time = 2018-05-18 21:23:32.287101.
Total time elapsed = 0 days 00:09:11.575060.


Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
2379,cd3b73689096b9e0,image_cedrus_libani_348.png,cedrus_libani,0.570395,0.295041,0.399557,0.004093,0.430144,0.026111,0.006597,...,0.502348,0.2221,0.707991,0.529502,0.252576,0.491637,2.030203,0.000779,0.107565,0.007273
2380,f20e69b613f10ed1,image_cedrus_libani_360.png,cedrus_libani,0.176714,0.108836,0.533615,0.166447,0.191445,0.017257,0.125915,...,0.375534,0.103528,0.125849,0.621964,0.002043,0.141729,0.545091,0.020497,0.2989,0.669948
2381,b3e3541ceb6c049b,image_cedrus_libani_406.png,cedrus_libani,0.487009,0.031866,0.456367,0.148815,0.159038,0.029393,0.447176,...,0.153956,0.309752,0.12938,1.057084,0.505882,0.234342,0.525729,0.027071,0.318918,0.368638
2382,a9a5564e99a75649,image_cedrus_libani_412.png,cedrus_libani,0.645351,0.089985,0.035995,0.025797,0.121042,0.167556,0.389564,...,0.394514,0.123905,0.516291,0.033663,0.513957,0.057169,0.286617,0.010999,0.282762,0.342531
2383,80e1431edceb17b9,image_cedrus_libani_374.png,cedrus_libani,0.374031,0.127293,0.0944,0.032075,0.289255,0.012815,0.27176,...,0.185907,0.65304,0.160685,0.339927,0.861876,0.123766,0.1387,0.0,0.366671,0.825225


### Export second feature dataframe to file

In [65]:
# Output CSV for the second batch of features (cedrus_libani and pseudotsuga_menziesii).
feature_df_path = os.path.join(project_path, 'data', 'features_cedrus_pseudotsuga.csv')

In [66]:
# Write the second batch's feature table to disk with the image_manager helper.
im.export_df(feature_df, feature_df_path)