# The Nature Conservancy Fisheries Monitoring

In [10]:
import numpy as np
import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
import pylab as plt
import tensorflow as tf
from sklearn.metrics import log_loss
#from sklearn.cross_validation import train_test_split
warnings.filterwarnings("ignore")



## Functions

In [11]:
def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (48, 48), interpolation = cv2.INTER_LINEAR)
    return resized


def load_train():
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    print('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for fld in folders:
        index = folders.index(fld)
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('..', '/notebooks/notebooks/Fish classification/', 'train', fld, '*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)
     
    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id


def load_test():
    path = os.path.join('..', '/notebooks/notebooks/Fish classification/', 'test_stg1', '*.jpg')
    files = sorted(glob.glob(path))

    X_test = []
    X_test_id = []
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_test.append(img)
        X_test_id.append(flbase)

    return X_test, X_test_id


def split_train_test(train_features, train_labels):
    np.random.seed(2017)
    rand = np.random.choice(train_target.shape[0], train_target.shape[0])

    #X_train_id = []
    #X_test_id = []

    X_train = []
    Y_train = []
    X_test = []
    Y_test = []

    for i in rand[0:2840]:
        #X_train_id.append(train_id[i])
        X_train.append(train_features[i])
        Y_train.append(train_target[i])

    for i in rand[2840:len(rand)]:
        #X_test_id.append(train_id[i])
        X_test.append(train_features[i])
        Y_test.append(train_target[i]) 
       
    print('Train set: {} images {}'.format(len(X_train),X_train[0].shape))
    print('Test set: {} images {}'.format(len(X_test),X_test[0].shape))
    
    return X_train, X_test, Y_train, Y_test


def next_batch(batch_size,X,Y):
    
    idx = np.random.permutation(len(X))
    X_shuffle =[X[i] for i in idx]
    Y_shuffle =[Y[i] for i in idx]
    
    X_batch = X_shuffle[0:batch_size]
    Y_batch = Y_shuffle[0:batch_size] 
    
    return X_batch, Y_batch


def create_submission(predictions, ID, name):
    sub = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
    sub.loc[:, 'image'] = pd.Series(ID, index=sub.index)
    sub.to_csv(name, index=False)

## Load training data

In [12]:
train_features, train_target, train_id = load_train()
train_target = pd.get_dummies(train_target).as_matrix()

Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 62.44 seconds


## Split training data OR load test data

In [51]:
X_train, X_test, Y_train, Y_test = split_train_test(train_features, train_target)
#test_features, test_id = load_test()

# Tensorflow run

In [52]:
X = tf.placeholder(tf.float32, [None, 48, 48, 3])
Y_ = tf.placeholder(tf.float32, [None, 8])

# 3 convolutional layers 
K = 4  # first 
L = 8  # second 
M = 12  # third 
N = 200  # fully connected layer

W1 = tf.Variable(tf.truncated_normal([5, 5, 3, K], stddev=0.1))  # 5x5 patch, 1 input channel, K output channels
B1 = tf.Variable(tf.ones([K])/8)
W2 = tf.Variable(tf.truncated_normal([5, 5, K, L], stddev=0.1))
B2 = tf.Variable(tf.ones([L])/8)
W3 = tf.Variable(tf.truncated_normal([4, 4, L, M], stddev=0.1))
B3 = tf.Variable(tf.ones([M])/8)

W4 = tf.Variable(tf.truncated_normal([12 * 12 * M, N], stddev=0.1))
B4 = tf.Variable(tf.ones([N])/8)
W5 = tf.Variable(tf.truncated_normal([N, 8], stddev=0.1))
B5 = tf.Variable(tf.ones([8])/8)


# The model
stride = 1  # output is 48*48
Y1 = tf.nn.relu(tf.nn.conv2d(X, W1, strides=[1, stride, stride, 1], padding='SAME') + B1)
stride = 2  # output is 24*24
Y2 = tf.nn.relu(tf.nn.conv2d(Y1, W2, strides=[1, stride, stride, 1], padding='SAME') + B2)
stride = 2  # output is 12*12
Y3 = tf.nn.relu(tf.nn.conv2d(Y2, W3, strides=[1, stride, stride, 1], padding='SAME') + B3)

# reshape the output from the third convolution for the fully connected layer
YY = tf.reshape(Y3, shape=[-1, 12 * 12 * M])

Y4 = tf.nn.relu(tf.matmul(YY, W4) + B4)
Ylogits = tf.matmul(Y4, W5) + B5
Y = tf.nn.softmax(Ylogits)

# Entropy
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(Ylogits, Y_)
cross_entropy = tf.reduce_mean(cross_entropy)*100

# accuracy 
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
                    
# Log loss
logloss = tf.contrib.losses.log_loss(Y, Y_)
loss = -tf.reduce_sum(Y_ * tf.log(Y)) / 937

# training step
learning_rate = 0.05

# training step, the learning rate is a placeholder
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

# init
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)


## Running Tensorflow : Train / Test data

In [50]:
ITERATION = 100
DISPLAY = True
DISPLAY_ITER = 10 
DISPLAY_STEP = ITERATION / DISPLAY_ITER
BATCH_SIZE = 100

# Training
for iteration in range(ITERATION):
    batch_X, batch_Y = next_batch(BATCH_SIZE, X_train,Y_train)    
    sess.run(train_step, feed_dict={X: batch_X, Y_: batch_Y})
    
    if DISPLAY and (iteration % DISPLAY_STEP == 0 and iteration != 0):
        print(iteration)
 
print('accuracy test',sess.run(accuracy, feed_dict={X: X_test, Y_: Y_test}))
print('logloss test',sess.run(logloss, feed_dict={X: X_test, Y_: Y_test}))
output = sess.run(Y, feed_dict={X: X_test})
print('loss test',sess.run(loss, feed_dict={X: X_test, Y_: Y_test}))


10
20
30
40
50
60
70
80
90
('accuracy test', 0.43970117)
('logloss test', 0.31252381)
('loss test', 1.6514047)


## Running Tensorflow : All data

In [53]:
ITERATION = 100
DISPLAY = True
DISPLAY_ITER = 10 
DISPLAY_STEP = ITERATION / DISPLAY_ITER
BATCH_SIZE = 100

# Training
for iteration in range(ITERATION):
    batch_X, batch_Y = next_batch(BATCH_SIZE, train_features,train_target)    
    sess.run(train_step, feed_dict={X: batch_X, Y_: batch_Y})
    
    if DISPLAY and (iteration % DISPLAY_STEP == 0 and iteration != 0):
        print(iteration)
 
print('accuracy test',sess.run(accuracy, feed_dict={X: train_features, Y_: train_target}))
print('logloss test',sess.run(logloss, feed_dict={X: train_features, Y_: train_target}))
# test_features, test_id
output = sess.run(Y, feed_dict={X: test_features})
print('loss test',sess.run(loss, feed_dict={X: train_features, Y_: train_target}))

10
20
30
40
50
60
70
80
90
('accuracy test', 0.45512313)
('logloss test', 0.30905148)
('loss test', 6.5570149)


# Create submission

In [156]:
create_submission(output,test_id,'sub1.csv')