First, navigate to your chosen working directory.

In [1]:
# %cd ~/..
# %pwd

%matplotlib inline

In [2]:
# Seed shared by Python's `random` module (seeded here) and the `random_state`
# arguments of the train/val/test splits further down.
# NOTE(review): NumPy and TensorFlow are not seeded in this cell.
random_seed = 34
import random
random.seed(random_seed)
import dataset

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from stuf import stuf

import os
# Index of the physical GPU exposed to TensorFlow. This is the single source of
# truth: the value exported below is always derived from `gpu` (the notebook
# previously declared gpu=5 but exported a hard-coded "6").
gpu = 6
# Enumerate devices by PCI bus id so the index above refers to a stable card.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Must be set before TensorFlow is imported/initialised further down this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

from tensorflow.keras import backend as K
import tensorflow as tf;
# Switch TF 1.x into eager mode so datasets can be iterated directly in Python
# (the scan loop near the end of the notebook calls `.numpy()` on batch tensors).
tf.enable_eager_execution()

# Session configuration shared with Keras via K.set_session below.
# log_device_placement=True presumably produces the "Device mapping" log that
# appears in this cell's output — confirm.
tf_config=tf.ConfigProto(log_device_placement=True)
tf_config.gpu_options.allocator_type = 'BFC'
tf_config.gpu_options.allow_growth = True
tf_config.allow_soft_placement = True
sess = tf.Session(config=tf_config)
# Register the configured session as the one the Keras backend uses.
K.set_session(sess)

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


print(get_available_gpus())

import pyleaves

from pyleaves.analysis.img_utils import convert_to_png
from pyleaves import leavesdb
from pyleaves.data_pipeline.tensorpack_loaders import get_multiprocess_dataflow

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: TITAN X (Pascal), pci bus id: 0000:88:00.0, compute capability: 6.1
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device

['/device:GPU:0']
OpenCV is built with OpenMP support. This usually results in poor performance. For details, see https://github.com/tensorpack/benchmarks/blob/master/ImageNet/benchmark-opencv-resize.py
1 Physical GPUs, 1 Logical GPU


In [3]:
# Name of the dataset of interest. In the code visible here it is only
# referenced by the commented-out single-dataset loader in the data-loading cell.
DATASET = "Fossil"
# tf.test.gpu_device_name()
# dir(tf.test)

#### **1.** Initialize and connect to database in local filesystem

In [4]:
# local_db = os.path.join(os.getcwd(),'pyleaves','leavesdb','resources','leavesdb.db')

# Obtain the local database file; the returned value is printed below and
# interpolated into the sqlite URL.
local_db = leavesdb.init_local_db()

print(local_db)

# Open the SQLite file through `dataset`, using `stuf` as the row type.
db = dataset.connect(f'sqlite:///{local_db}', row_type=stuf)

Proceeding with sql db at location /home/jacob/scripts/leavesdb.db
/home/jacob/scripts/leavesdb.db


#### **2.** Print a summary of the database's contents

In [5]:
# Summarise the database. The summary dict shown in the cell output appears to
# be printed by summarize_db itself, since this cell only assigns the result.
db_summary = leavesdb.summarize_db(db)

{'Database column keys': ['id', 'specie', 'genus', 'path', 'family', 'dataset'], 'distinct datasets': ['Fossil', 'Leaves', 'PNAS', 'plant_village'], 'Number of distinct families': [('Fossil', 27), ('Leaves', 376), ('PNAS', 19), ('plant_village', 3)], 'Number of rows in db': 119084}


#### **3.** Select a subset of datasets
##### Here we load every dataset and group the rows by dataset name (the single-dataset loaders, e.g. for Fossil, are left commented out)

In [6]:
# data = leavesdb.db_query.load_Fossil_data(db)

# data = leavesdb.db_query.load_data(db, dataset=DATASET)

# data = leavesdb.db_query.load_Leaves_data(db)


# Pull every row from the database, then bucket the rows by source dataset.
data = leavesdb.db_query.load_all_data(db)

data_by_dataset = data.groupby(by='dataset')
# Iterating a groupby yields (dataset_name, sub-frame) pairs.
data_by_dataset_dict = dict(iter(data_by_dataset))


In [7]:
# Preview the first rows of the combined table.
print(data.head(5))
# `time` is used by the timing code in the PNG-conversion cell that follows.
import time

                                                path   family        dataset
0  /media/data_cifs/jacob/data/plantvillage/Apple...  Rosacea  plant_village
1  /media/data_cifs/jacob/data/plantvillage/Apple...  Rosacea  plant_village
2  /media/data_cifs/jacob/data/plantvillage/Apple...  Rosacea  plant_village
3  /media/data_cifs/jacob/data/plantvillage/Apple...  Rosacea  plant_village
4  /media/data_cifs/jacob/data/plantvillage/Apple...  Rosacea  plant_village


In [9]:
# Convert a sample of every dataset to PNG, timing each conversion and recording
# where the converted files were written.

# Per-dataset cap on the number of files converted (kept small for a quick
# smoke test). Set to None to convert every file.
MAX_FILES_PER_DATASET = 10

new_data_locations = {}
for dataset_name, rows in data_by_dataset_dict.items():
    filepaths = list(rows['path'].values)
    labels = list(rows['family'].values)
    print(dataset_name, len(filepaths), len(labels))

    # Slicing with None keeps the full list, so the cap can be disabled.
    filepaths = filepaths[:MAX_FILES_PER_DATASET]
    labels = labels[:MAX_FILES_PER_DATASET]

    num_files = len(filepaths)

    start_time = time.perf_counter()
    # Materialise the result inside the timed region: if convert_to_png returns
    # a lazy iterable, the work would otherwise happen after the clock stopped.
    new_dataset_paths = list(convert_to_png(filepaths, labels, dataset_name = dataset_name))
    total_time = time.perf_counter() - start_time
    # Guard against a zero elapsed time (e.g. an empty dataset on a coarse clock).
    rate = num_files/total_time if total_time > 0 else float('inf')
    print(f'Finished copying {num_files} from {dataset_name} in {total_time:.3f} at a rate of {rate:.3f} images/sec')
    new_data_locations[dataset_name] = new_dataset_paths
    
    

Fossil 6122 6122
Converted image 4 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0335 Sambucus newtoni.png
True
Converted image 0 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0141 Sambucus newtoni.png
True
Converted image 8 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0338cu Sambucus newtoni.png
True
Converted image 6 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0338 Sambucus newtoni.png
True
Converted image 7 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0338cpt Sambucus newtoni.png
True
Converted image 3 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0224 Sambucus newtoni.png
Converted image 1 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0141cpt Sambucus newtoni.png
Converted image 5 and saved at /media/data/jacob/Fossil_Project/Fossil/Adoxaceae/CU_0335cu Sambucus newtoni.png
True
TrueTrue

Converted image 2 and saved at /media/data/jacob/Foss

#### **4.** Encode labels as integers for feeding into model

In [None]:
from pyleaves.data_pipeline.preprocessing import encode_labels

# Encode the labels of the combined table. Presumably this adds the 'label'
# column that later cells read from data_df — confirm against encode_labels.
data_df = encode_labels(data)

# Display 10 rows in random order (unseeded shuffle; the rows shown differ per run).
data_df.sample(frac=1).head(10)

In [None]:
# Number of rows before low-count classes are filtered out in the next cell.
print(len(data_df))

In [None]:
from pyleaves.data_pipeline.preprocessing import filter_low_count_labels, one_hot_encode_labels, one_hot_decode_labels

# Split fractions. val_size is applied to what remains after the test split
# (see the split cell below), i.e. 0.25 * 0.75 of the filtered rows.
test_size = 0.25
val_size = 0.25

# Drop classes according to the threshold=3 rule implemented by filter_low_count_labels.
data_df = filter_low_count_labels(data_df, threshold=3, verbose = True)

data_df = encode_labels(data_df) #Re-encode numeric labels after removing sub-threshold classes (presumably so the label ids stay contiguous — confirm against encode_labels)

# Paths are reshaped to a column vector of shape (-1, 1); later cells ravel() them back to 1-D.
image_paths = data_df['path'].values.reshape((-1,1))
one_hot_labels = one_hot_encode_labels(data_df['label'].values)

In [None]:
# Stage 1: hold out the test set, stratified on the integer label column.
train_paths, test_paths, train_labels, test_labels = train_test_split(
    image_paths,
    one_hot_labels,
    test_size=test_size,
    random_state=random_seed,
    shuffle=True,
    stratify=data_df['label'],
)

# Stage 2: split the remainder into train/val, stratified on the one-hot rows.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_paths,
    train_labels,
    test_size=val_size,
    random_state=random_seed,
    shuffle=True,
    stratify=train_labels,
)

# Bundle each split as {'path': ..., 'label': ...} and index the bundles by name.
train_data = dict(path=train_paths, label=train_labels)
val_data = dict(path=val_paths, label=val_labels)
test_data = dict(path=test_paths, label=test_labels)

data_splits = dict(train=train_data, val=val_data, test=test_data)

# train_gen = get_multiprocess_dataflow(train_data['path'], train_data['label'], size=(299,299), batch_size=32, num_prefetch=25, num_proc=5)

## **Let's set up our model**

In [None]:
plot_class_frequencies = leavesdb.utils.plot_class_frequencies

# One class-frequency plot per split: train first, then validation.
for split in (train_data, val_data):
    decoded_labels = one_hot_decode_labels(split['label'])
    plot_class_frequencies(labels=decoded_labels.ravel().tolist())

In [None]:
# Number of distinct label ids remaining in data_df.
num_classes = len(np.unique(data_df['label']))
# Target size passed to tf.image.resize in the parse functions below.
img_size = [299,299]
# Number of colour channels requested from the image decoder.
channels = 3
# Batch size used by the tf.data pipelines and the step-count arithmetic.
batch_size = 32
learning_rate=0.01
# Only referenced by the commented-out training calls visible in this notebook.
num_epochs = 1

def parse_function(filename, label):
    """Read one image file, decode and resize it, and pair it with its label.

    Uses the notebook-level `channels` and `img_size` settings.

    Returns:
        (image, label), where image is the decoded file resized to `img_size`.
    """
    raw_bytes = tf.io.read_file(filename)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=channels)
    resized = tf.image.resize(decoded, img_size)
    return resized, label

# def train_preprocess(img, label):
#     img = tf.image.resize(img, img_size)
#     return {'image':img, 'label':label}
    

def get_tf_dataset(filenames, labels):
    """Build the input pipeline: slice -> shuffle -> parse -> batch -> prefetch.

    Args:
        filenames: sequence of image paths; its length is the shuffle buffer size.
        labels: labels aligned element-wise with `filenames`.

    Returns:
        A tf.data.Dataset yielding (image_batch, label_batch) in batches of `batch_size`.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((filenames, labels))
        .shuffle(len(filenames))
        .map(parse_function, num_parallel_calls=autotune)
        .batch(batch_size)
        .prefetch(autotune)
    )

##############################


def debug_parse_function(filename, label):
    """Debug variant of parse_function that also passes the filename through.

    Returns:
        (image, label, filename), so a failing batch can be traced back to files.
    """
    raw_bytes = tf.io.read_file(filename)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=channels)
    resized = tf.image.resize(decoded, img_size)
    return resized, label, filename



def debug_get_tf_dataset(filenames, labels):
    """Debug pipeline: slice -> shuffle -> debug parse -> batch -> prefetch -> cache.

    Same shape as get_tf_dataset, but each element also carries the filenames
    and the result is cached.

    NOTE(review): cache() is applied after shuffle/batch/prefetch, so later
    passes presumably replay the batches of the first pass — confirm intended.

    Returns:
        A tf.data.Dataset yielding (image_batch, label_batch, filename_batch).
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((filenames, labels))
        .shuffle(len(filenames))
        .map(debug_parse_function, num_parallel_calls=autotune)
        .batch(batch_size)
        .prefetch(autotune)
        .cache()
    )

##############################

# Flip to True to route dataset construction through the filename-carrying
# debug pipeline instead of the regular one.
debug = False  # True

if debug:
    get_tf_dataset = debug_get_tf_dataset






def decode_labels(data_df):
    """Build a lookup from integer label id to family name.

    The first row seen for each label supplies its family name, so the result
    is deterministic even if a label is paired with more than one family. The
    input frame is not modified and no random state is consumed.

    Args:
        data_df: DataFrame with at least 'label' and 'family' columns.

    Returns:
        dict mapping each distinct label to a family name, in ascending label
        order. An empty frame yields an empty dict.

    Raises:
        KeyError: if 'label' or 'family' is missing from `data_df`.
    """
    label_to_family = (
        data_df.loc[:, ['label', 'family']]
        .drop_duplicates(subset='label')  # keeps the first occurrence per label
        .sort_values(by='label')
        .set_index('label')
    )
    return label_to_family['family'].to_dict()


# train_dataset = get_tf_dataset(filenames = train_data['path'].values, labels = train_data['label'].values)
# val_dataset = get_tf_dataset(filenames = val_data['path'].values, labels = val_data['label'].values)

# The path arrays were reshaped to (-1, 1) before splitting, so ravel() flattens
# them back to 1-D before they are handed to the dataset builder.
train_dataset = get_tf_dataset(filenames = train_data['path'].ravel(), labels = train_data['label'])
val_dataset = get_tf_dataset(filenames = val_data['path'].ravel(), labels = val_data['label'])

# Lookup from integer label id to family name.
label_map = decode_labels(data_df=data_df)

# Sample counts per split.
num_samples_train = len(train_data['path'])
num_samples_val = len(val_data['path'])
num_samples_test = len(test_data['path'])
print(num_samples_train)
print(num_samples_val)
print(num_samples_test)

# Display the family name stored under label id 0.
label_map[0]

In [None]:
# for features in  train_dataset.take(1):
#     image_batch = features[0].numpy().astype(np.int)
#     label_batch = features[1].numpy().astype(np.int)

# plot_image_grid = pyleaves.analysis.img_utils.plot_image_grid
    
    
# plot_image_grid(image_batch, label_batch, x_plots = 4, y_plots = 8)

In [None]:
#############################################################################


# datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0/255.0)

# train_data['label'] = train_data['label'].astype(str)

# datagen_flow = datagen.flow_from_dataframe(train_data.iloc[:100,:], x_col='path', y_col='label', class_mode='sparse', batch_size=batch_size)

# a=next(datagen_flow)

# print(a[0].shape, a[1].shape)

In [None]:
# import time


# start_time = time.time()
# n=100
# total_time = 0
# try:
#     for i, features in enumerate(train_dataset.take(n)):
# #         print(i, features[0].shape, features[1].shape)
#         run_time = time.time()-start_time
#         total_time += run_time
#         print(f'Took {run_time:.2f} seconds')
#         start_time = time.time()
# except Exception as e:
#     print(e)
#     print(f'finished {i} iterations')

# avg_time = total_time / i+1

# rate = (i+1)*batch_size/total_time

# print(f'Avg time = {avg_time:.2f} | Ran {i+1} iterations using batch size = {batch_size} & {batch_size*n} samples')
# print(f'rate = {rate}')

In [None]:
# Build the model with pyleaves' inception_v3 builder, passing the class count
# and learning rate defined in the hyper-parameter cell.
model = pyleaves.models.inception_v3.build_model(num_classes, learning_rate=learning_rate)

In [None]:
# Show the structure of the model that was just built.
model.summary()

In [None]:
# history = pyleaves.models.inception_v3.train_model(model,
# 				train_dataset,
# 				validation_data=val_dataset, 
# 				steps_per_epoch=int(num_samples_train//batch_size),
# 				validation_steps=int(num_samples_val//batch_size),
# 				max_epochs=num_epochs,
# # 				callbacks=None,
# 				workers=5,
# 				initial_epoch=0)

In [None]:
# train_dataset = train_dataset.make_initializable_iterator().get_next()
# val_dataset = val_dataset.make_initializable_iterator().get_next()

In [None]:
# current_epoch = 0
# while current_epoch < 20:
#     try:
#         history = model.fit(
#                         train_dataset,
#                         validation_data=val_dataset, 
#         #                 steps_per_epoch=int(num_samples_train//batch_size),
#         #                 validation_steps=int(num_samples_val//batch_size),
#                         epochs=num_epochs,
#                         # 				callbacks=None,
#                         workers=10,
#                         initial_epoch=current_epoch,
#                         verbose=1)
#     except KeyboardInterrupt:
#         break
#     except Exception as e:
#         print(f'current epoch = {current_epoch}, error: {e}')
    

In [None]:
# read_data = tf.get_default_session().run(train_dataset)

In [None]:
import time
from tensorflow.python.framework.errors_impl import InvalidArgumentError

# Repeatedly iterate the training dataset, timing every batch and recording
# which files could be read, in order to hunt for inputs that break decoding.

# Upper bound on the number of passes. Every pass (successful or failed) counts
# towards it, so the loop always terminates.
MAX_SCAN_EPOCHS = 20

filename_ids = []
batch_log = []
invalid_filenames = []

start_time = time.time()
time_log = []
steps_per_epoch = num_samples_train//batch_size

valid_filenames = []  # flat list of str

reset_iter = True
current_epoch = 0
total_batches = 0  # batches read successfully across all passes
while current_epoch < MAX_SCAN_EPOCHS:
    if reset_iter == True:
        epoch_dataset = train_dataset.take(steps_per_epoch)
        reset_iter = False
    # Reset per pass so the handlers below never see stale or unbound values.
    i, filenames = -1, None
    try:
        for i, batch in enumerate(epoch_dataset):
            run_time = time.time()-start_time
            time_log.append(run_time)
            print(f'Took {run_time:.2f} seconds')
            total_batches += 1

            # Only the debug pipeline yields (imgs, labels, filenames); the
            # regular pipeline yields (imgs, labels) and carries no filenames.
            filenames = batch[2] if len(batch) > 2 else None
            if filenames is not None:
                valid_filenames.extend(fname.numpy().decode('utf-8') for fname in filenames)

            start_time = time.time()
    except InvalidArgumentError as e:
        # NOTE(review): if the error was raised while fetching the next batch,
        # `filenames` still refers to the last batch that was read successfully,
        # not the failing one — the existence check below can only inspect those.
        if filenames is not None:
            for j, fname in enumerate(filenames):
                fname = fname.numpy().decode('utf-8')
                filename_ids.append(i*batch_size+j)
                if os.path.isfile(fname):
                    valid_filenames.append(fname)
                else:
                    invalid_filenames.append(fname)
                    print(f'invalid filename = {fname}')
        print(f'current epoch = {current_epoch}, error: {e}', type(e))
    except KeyboardInterrupt:
        break
    except Exception as e:
        reset_iter = True
        print(f'current epoch = {current_epoch}, error: {e}', type(e))

    # Advance unconditionally; without this the while-condition never changes.
    current_epoch += 1

print(f'finished {total_batches*batch_size} samples over {total_batches} iterations in {np.sum(time_log):.2f} seconds')

In [None]:
# Show the filenames that the scan loop flagged as invalid.
print(invalid_filenames)

In [None]:
import cv2

plot_image_grid = pyleaves.analysis.img_utils.plot_image_grid


def _flatten_filenames(entries):
    """Yield every filename found in `entries` as a plain str.

    Accepts a mix of str, bytes, objects exposing `.numpy()` (e.g. eager
    tensors) and arrays/lists of those, so the cell works whether the scan loop
    stored decoded strings or raw tensor batches. Safe to re-run, and an empty
    input simply yields nothing.
    """
    for entry in entries:
        if hasattr(entry, 'numpy'):
            entry = entry.numpy()
        if isinstance(entry, (str, bytes)):
            items = [entry]
        else:
            items = np.ravel(entry).tolist()
        for item in items:
            yield item.decode('utf-8') if isinstance(item, bytes) else item


# Normalise to a flat list of str paths.
invalid_filenames = list(_flatten_filenames(invalid_filenames))
    
    

In [None]:
# Try to load every flagged file and plot whatever could actually be read.
invalid_imgs = []
unreadable_filenames = []
for fname in invalid_filenames:
    img = cv2.imread(fname)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None;
        # passing that to cv2.resize would raise.
        unreadable_filenames.append(fname)
        continue
    invalid_imgs.append(cv2.resize(img, tuple(img_size)))

if unreadable_filenames:
    print(f'Could not read {len(unreadable_filenames)} file(s): {unreadable_filenames}')

if invalid_imgs:
    invalid_images = np.stack(invalid_imgs)
    plot_image_grid(invalid_images, labels = np.ones(len(invalid_imgs)), x_plots = 4, y_plots = 8)
else:
    # np.stack raises on an empty list, so skip plotting when nothing loaded.
    print('No flagged images could be loaded for plotting')