# Update the image paths for the extracted features from Jan 29, 2018
March 27, 2018

Continue the work from `find_original_images.ipynb`. Use Keras to recompute the hashes of all my images, and see if I can find all of the original 1014 images that I trained the model on.

In [1]:
import numpy as np
import pandas as pd
import imagehash
from keras.preprocessing import image

Using TensorFlow backend.


In [2]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make modules that live in the parent directory importable from this notebook.
# Approach taken from StackOverflow:
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im
import os
import sys

# Absolute path of the directory one level above the current working directory.
module_path = os.path.abspath(os.pardir)
# Register the path only if it is absent, so re-running the cell adds no duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
from src.image_manager import ImageManager

## Create an ImageManager from my previously saved file database (saved as CSV files), then get a copy of its data frame
See `more_image_manager_and_scraper.ipynb` from Feb 5, 2018

In [28]:
# Root of the photo tree; image files are addressed as <base_directory>/<folder>/<filename>
base_directory = '../tree_photos/'
# Saved image log and sync log (CSV) from Feb 5, 2018, passed to ImageManager below
images_path = '../data/image_log_20180205.csv'
syncs_path = '../data/image_syncs_20180205.csv'

In [30]:
# Rebuild the manager from the photo directory and the two saved CSV logs
manager = ImageManager(base_directory, images_path, syncs_path)

In [31]:
# Work on a copy of the manager's data frame (new columns are added to it below)
image_df = manager.image_df.copy()
image_df.head()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,2018-02-05 15:00:09.809644,2018-02-05 15:00:09.809644
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,2018-02-05 15:00:09.819089,2018-02-05 15:00:09.819089
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,2018-02-05 15:00:09.827989,2018-02-05 15:00:09.827989
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,2018-02-05 15:00:09.836027,2018-02-05 15:00:09.836027
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,2018-02-05 15:00:09.842604,2018-02-05 15:00:09.842604


## Open each image in `image_df` using Keras, compute its p_hash, and store the results in `image_df`

See `feature_extraction.ipynb` from Jan 29, 2018 to see how I originally computed the image hashes.

Method of applying a function to rows based on StackOverflow:
https://stackoverflow.com/questions/26886653/pandas-create-new-column-based-on-values-from-other-columns

In [12]:
def p_hash_keras299(row, base_dir=None):
    """Compute the p_hash of one image after loading it with Keras.

    The image is read with a target size of (299, 299) before hashing.

    Parameters
    ----------
    row : mapping (e.g. a row of image_df)
        Must provide 'folder' and 'filename' entries.
    base_dir : str, optional
        Directory that contains the image folders. When omitted, the
        notebook-level `base_directory` is used, so
        `image_df.apply(p_hash_keras299, axis=1)` keeps working unchanged.

    Returns
    -------
    The value returned by `imagehash.phash` for the loaded image.
    """
    if base_dir is None:
        # Looked up at call time so the function follows the notebook's current setting.
        base_dir = base_directory
    path = os.path.join(base_dir, row['folder'], row['filename'])
    img = image.load_img(path, target_size=(299, 299))
    return imagehash.phash(img)

In [13]:
# Compute the keras299 p-hash of every image in image_df.
# axis=1 makes apply() call the function once per row rather than per column.
# From the pandas documentation:
# "Objects passed to functions are Series objects having index either the
# DataFrame’s index (axis=0) or the columns (axis=1)."
keras299_hashes = image_df.apply(p_hash_keras299, axis = 1)
keras299_hashes

0       9bb759cb29301aa5
1       99c20b3b74d53de0
2       a1e1e0d4e646e765
3       abee94c19cb28b0e
4       d3c161631f2b4b4e
5       c891b35fed643432
6       b34194e34c0bf5d3
7       db7431cef6104de0
8       d00f7c177d248d27
9       9f62153773335318
10      e36eff01f861c023
11      81973f6f2589a3b0
12      fdbcc03dd2218971
13      917b2bc1a3c342f5
14      d58b0b94dc6d49b4
15      8352aed9ac3b9239
16      dbc7820a759066f5
17      acf8e107b714d217
18      b78b4a52e9b2d245
19      d81ab7cc5c308adb
20      a2b3d32bd20cb659
21      c15f3f30a64fa21c
22      e54c9673cd643949
23      ae8cbc09d42ad33b
24      82c178cd64b13f9b
25      e835a8e557075716
26      c2569b9d61578a65
27      d3942c6b323d3d34
28      92a66f9f24c1832f
29      fdbf4e100ea10778
              ...       
5730    aab2196d29996ab6
5731    d1ee70104ef801ff
5732    a3dc685e9758824f
5733    9cc0f86948b3e679
5734    b4034ad838bd95cf
5735    e5314d6c76d58632
5736    d508ca8f1c7f11d3
5737    ae97d7486b6b7100
5738    c4b0f80abfa16fc8


In [14]:
# Store the recomputed hashes next to the original p_hash column for comparison
image_df['p_hash_keras299'] = keras299_hashes
image_df.head()

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,1517842809809644000,1517842809809644000,9bb759cb29301aa5
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,1517842809819089000,1517842809819089000,99c20b3b74d53de0
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,1517842809827989000,1517842809827989000,a1e1e0d4e646e765
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,1517842809836027000,1517842809836027000,abee94c19cb28b0e
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,1517842809842604000,1517842809842604000,d3c161631f2b4b4e


## Find images where the hashes don't match

It looks like there are 418 such images, or 7.26% of all 5760 images

In [15]:
# Rows where the stored p_hash differs from the hash recomputed via Keras
image_df[image_df['p_hash'] != image_df['p_hash_keras299']]

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,1517842809809644000,1517842809809644000,9bb759cb29301aa5
7,db7431cef4184de0,image_acer_macrophylum_tree_75.png,acer_macrophyllum,1517842809866679000,1517842809866679000,db7431cef6104de0
66,e2bc1e96d70632b1,image_acer_macrophylum_tree_62.png,acer_macrophyllum,1517842810339740000,1517842810339740000,e2bc1e86d70636b1
75,f69c29b94e4b9c24,image_bigleaf_maple_forest_33.png,acer_macrophyllum,1517842810412334000,1517842810412334000,f69c29b94e4f9424
78,b790d34b9e69c18a,image_acer_macrophylum_seeds_78.png,acer_macrophyllum,1517842810433250000,1517842810433250000,b790d34b9e69858a
108,88557b1f5ed910b1,image_acer_macrophylum_tree_67.png,acer_macrophyllum,1517842810676443000,1517842810676443000,88557b1f5ec911b1
119,f2bcbf6929a602b0,image_bigleaf_maple_forest_22.png,acer_macrophyllum,1517842810760778000,1517842810760778000,f2bcbf6929a602a1
164,a49b5825679b1a75,image_bigleaf_maple_branches_66.png,acer_macrophyllum,1517842811132863000,1517842811132863000,e49b5825679a1a75
173,ebf4e6d7605b200a,image_acer_macrophylum_seeds_95.png,acer_macrophyllum,1517842811205398000,1517842811205398000,ebf4e6d7604b220a
179,99bca3b3d440c177,image_acer_macrophylum_leaves_49.png,acer_macrophyllum,1517842811248722000,1517842811248722000,9bb8a3b3d440c177


## Load the previously extracted features and convert the hash strings to imagehash objects
See `build_a_model.ipynb` from Jan 30, 2018

In [16]:
# The feature file is pipe-delimited; its first column is used as the index
tree_features_df = pd.read_csv('../data/tree_images.csv', sep='|', index_col=0)
tree_features_df.head()

Unnamed: 0,p_hash,filename,species,tags,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,99a7465a1e99acb4,image_picea_sitchensis_27.png,picea_sitchensis,,13.995308,1.316715,0.279829,67.839417,43.733177,38.206165,...,160.387985,1.176492,45.98373,10.465877,22.118896,3.226861,29.401524,0.486684,0.0,15.634432
1,93d12f8e712ef068,image_picea_sitchensis_33.png,picea_sitchensis,,7.449574,0.417233,0.447665,6.178374,2.092985,6.251201,...,2.649559,0.523397,0.39252,7.703988,13.842188,12.90754,1.112756,0.0,5.436511,1.154409
2,cce3482eb991533b,image_picea_sitchensis_6.png,picea_sitchensis,,0.708871,0.640444,1.305804,9.565918,15.157275,1.787687,...,15.114614,0.0,0.066235,2.280773,8.082823,1.939496,0.041136,0.0,0.0,0.034437
3,c119991c4fcf1cda,image_picea_sitchensis_in_winter_12.png,picea_sitchensis,,7.138457,1.225646,15.940742,42.644821,23.604031,13.818824,...,49.895081,0.0,7.600242,3.260593,13.135204,12.249097,17.58807,0.0,0.0,0.0
4,e3c3dab100fdc29c,image_sitka_spruce_59.png,picea_sitchensis,,0.030419,2.616714,1.43536,52.145359,5.377345,5.631077,...,13.391465,21.970516,5.217728,8.282942,0.788009,0.591635,35.845409,0.0,0.0,2.780651


In [17]:
# Convert the hex strings in 'p_hash' to imagehash objects. Values that are
# already ImageHash instances are passed through unchanged, so re-running this
# cell does not fail by feeding hash objects back into hex_to_hash.
tree_features_df.loc[:,'p_hash'] = tree_features_df['p_hash'].apply(
    lambda h: h if isinstance(h, imagehash.ImageHash) else imagehash.hex_to_hash(h)
)
tree_features_df.loc[0,'p_hash']

array([[ True, False, False,  True,  True, False, False,  True],
       [ True, False,  True, False, False,  True,  True,  True],
       [False,  True, False, False, False,  True,  True, False],
       [False,  True, False,  True,  True, False,  True, False],
       [False, False, False,  True,  True,  True,  True, False],
       [ True, False, False,  True,  True, False, False,  True],
       [ True, False,  True, False,  True,  True, False, False],
       [ True, False,  True,  True, False,  True, False, False]], dtype=bool)

In [19]:
# Check the frame's size and dtypes (the original feature set should have 1014 rows)
tree_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1014 entries, 0 to 1013
Columns: 2052 entries, p_hash to incv3_out_2047
dtypes: float64(2049), object(3)
memory usage: 15.9+ MB


## Find images in image_df whose keras299 hashes match those of my original 1014 images

In [20]:
# Flag rows whose keras299 hash appears among the original feature hashes;
# describe() on the boolean result reports the most common value and its frequency
image_df['p_hash_keras299'].isin(tree_features_df['p_hash']).describe()

count      5760
unique        2
top       False
freq       4745
Name: p_hash_keras299, dtype: object

In [21]:
#See how many images match
#(counted from the data rather than subtracting totals copied from an earlier output)
image_df['p_hash_keras299'].isin(tree_features_df['p_hash']).sum()

1015

### Great, now I have one extra instead of 64 missing! WTF?!
Perhaps there are two (or more...) images in image_df that have the same keras299 p_hash even though they have different PIL p_hashes?

Um, yes, it looks like there are 42 repeats (5760 rows but only 5718 unique hashes):

In [22]:
# Compare 'count' with 'unique': any gap means some keras299 hashes occur more than once
image_df['p_hash_keras299'].describe()

count                 5760
unique                5718
top       97b2784de2cc6139
freq                     2
Name: p_hash_keras299, dtype: object

In [23]:
# Show the rows that share the most frequent keras299 hash reported by describe()
image_df[image_df['p_hash_keras299'] == imagehash.hex_to_hash('97b2784de2cc6139')]

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299
2410,d3f79d0836e20dd0,image_cedrus_libani_49.png,cedrus_libani,1517842829658666000,1517842829658666000,97b2784de2cc6139
5063,97b2784de2cc6139,image_cedrus_libani_49.png,cedrus_libani,1517866167829710000,1517867726950389000,97b2784de2cc6139


### Ugh, why does the same file have two different p_hashes?!
Is there a bug in my ImageManager code?

In [26]:
# Search the keras299 hashes for the stored p_hash of row 2410 (d3f79d0836e20dd0)
image_df[image_df['p_hash_keras299'] == imagehash.hex_to_hash('d3f79d0836e20dd0')]
# Empty result: none of the keras299 hashes match the original p_hash of this image

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299


In [24]:
image_df['filename'].describe()
# 5760 filenames but only 5720 unique ones, so it looks like there are 40 repeated filenames

count                           5760
unique                          5720
top       image_cedrus_libani_94.png
freq                               2
Name: filename, dtype: object

In [25]:
# Show the rows that share the most frequent filename reported by describe()
image_df[image_df['filename'] == 'image_cedrus_libani_94.png']

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299
2330,e9e9c1178d1c965a,image_cedrus_libani_94.png,cedrus_libani,1517842828981828000,1517842828981828000,b293257039ce197e
4604,b293257039ce197e,image_cedrus_libani_94.png,cedrus_libani,1517866154904047000,1517867698086015000,b293257039ce197e


In [27]:
# Search the keras299 hashes for the stored p_hash of row 2330 (e9e9c1178d1c965a)
image_df[image_df['p_hash_keras299'] == imagehash.hex_to_hash('e9e9c1178d1c965a')]
# Empty result: none of the keras299 hashes match the original p_hash of this image

Unnamed: 0,p_hash,filename,folder,time_added,time_verified,p_hash_keras299
