In [2]:
# package
import sys
import os
import subprocess

from six import string_types

# Make sure you have all of these packages installed, e.g. via pip
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy
from skimage import io
from scipy import ndimage
from IPython.display import display
import cv2
%matplotlib inline

In [4]:
# Locations of the dataset, all resolved relative to the notebook's working directory.
PLANET_KAGGLE_ROOT = os.path.abspath("./")
PLANET_KAGGLE_JPEG_DIR, PLANET_KAGGLE_LABEL_CSV = (
    os.path.join(PLANET_KAGGLE_ROOT, _leaf) for _leaf in ('train-jpg', 'train_v2.csv')
)
# Fail fast if the image directory or the label CSV is missing.
for _required_path in (PLANET_KAGGLE_ROOT, PLANET_KAGGLE_JPEG_DIR, PLANET_KAGGLE_LABEL_CSV):
    assert os.path.exists(_required_path)

In [5]:
# Every tag name that gets its own one-hot column, in the column order used by the
# CSV files written later in this notebook.
label_list = [
    'agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down',
    'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation',
    'haze', 'partly_cloudy', 'primary', 'road', 'selective_logging',
    'slash_burn', 'water',
]

# Raw label table; the one-hot loop below reads its space-separated 'tags' column.
# (The former mid-cell `labels_df.head()` was dropped: a non-final expression in a
# cell is evaluated and discarded, so it never displayed anything.)
labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)

In [6]:
# Add one binary indicator column per label: 1 when the label appears as a whole
# space-separated token in the row's 'tags' string, 0 otherwise.
for label in label_list:
    labels_df[label] = labels_df['tags'].map(
        lambda tag_string: int(label in tag_string.split(' '))
    )
# Display head
labels_df.head()

Unnamed: 0,image_name,tags,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,train_0,haze primary,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,train_1,agriculture clear primary water,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,train_2,clear primary,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,train_3,clear primary,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,train_4,agriculture clear habitation primary road,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0


In [6]:
# Tags describing atmospheric conditions.
weather_labels = [
    'clear',
    'partly_cloudy',
    'haze',
    'cloudy',
]
# Remaining tags, describing land cover / land use.
land_labels = [
    'primary', 'agriculture', 'water', 'habitation', 'road',
    'cultivation', 'slash_burn', 'conventional_mine', 'bare_ground',
    'artisinal_mine', 'blooming', 'selective_logging', 'blow_down',
]

In [8]:
# float32 matrix of the weather indicator columns, transposed so that each row
# corresponds to one weather label and each column to one image.
weather_labels_value = labels_df[weather_labels].values.astype(np.float32).T

In [19]:
# validation set
# set 10% samples to validation set
from sklearn.model_selection import train_test_split
# Image names and their one-hot label rows, kept aligned by position.
img_set, labels_set = labels_df['image_name'].values, labels_df[label_list].values
# Fixed seed so the same 90/10 partition is produced on every run.
_split_parts = train_test_split(img_set, labels_set, test_size=0.1, random_state=42)
img_train, img_test, labels_train, labels_test = _split_parts

In [44]:
# Report the size of the training split, then of the validation split.
for _split_names in (img_train, img_test):
    print(len(_split_names))

36431
4048


In [37]:
# write train set
import csv
# Write the training split first, then the validation split. Both files share the
# same layout: a header of 'image_name' followed by every label in label_list, then
# one row per image holding its name and its one-hot label values.
# NOTE: 'wb' is the mode the csv module expects under Python 2, which this
# notebook targets.
for _csv_path, _names, _label_rows in (
        ('./train_validation_v2_bin.csv', img_train, labels_train),
        ('./validation_train_v2_bin.csv', img_test, labels_test)):
    with open(_csv_path, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['image_name'] + label_list)
        writer.writerows(
            [name] + row.tolist() for name, row in zip(_names, _label_rows)
        )

In [13]:
def _report_weather_counts(heading, frame):
    """Print `heading`, then one line per weather label with that column's sum in `frame`."""
    print(heading)
    for weather in weather_labels:
        print("%s tags num: %s" % (weather, np.sum(frame[weather])))


train_set = pd.read_csv('./train_validation_v2_bin.csv')
_report_weather_counts("train set:", train_set)

valid_set = pd.read_csv('./validation_train_v2_bin.csv')
_report_weather_counts("\nvalidation set:", valid_set)
# Observation: after the random split, the per-class sample counts in each subset
# roughly follow the class proportions of the original data.

train set:
clear tags num: 25551
partly_cloudy tags num: 6524
haze tags num: 2442
cloudy tags num: 1913

validation set:
clear tags num: 2880
partly_cloudy tags num: 737
haze tags num: 255
cloudy tags num: 176


In [1]:
#train weather label with validation set
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
import random
import math

import vgg16_trainable as vgg16
import read_data
import utils
import csv

from sklearn.metrics import fbeta_score

def f2_score(y_true, y_pred):
    """Return the sample-averaged F-beta score with beta=2.

    Args:
        y_true: ground-truth indicator rows (any array-like).
        y_pred: predicted indicator rows, same layout as `y_true`.

    Returns:
        The value of sklearn's `fbeta_score(..., beta=2, average='samples')`.
    """
    # Convert up front: fbeta_score throws a confusing error if inputs are not numpy arrays.
    truth = np.array(y_true)
    predicted = np.array(y_pred)
    # average='samples' is required here; any other averaging mode generates bogus results.
    return fbeta_score(truth, predicted, beta=2, average='samples')

def get_one_hot_pred(prob):
    """Turn each row of scores into a one-hot vector marking its arg-max.

    Args:
        prob: iterable of equal- or variable-length score rows.

    Returns:
        A list with one float numpy array per input row; each array is all zeros
        except for a 1 at the index of that row's largest score.
    """
    def _one_hot(row):
        encoded = np.zeros(len(row))
        encoded[np.argmax(row)] = 1
        return encoded

    return [_one_hot(row) for row in prob]
# Enumerate GPUs in PCI bus order (see issue #152) and expose only device "2" to
# this process.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="2",
)

class Config:
    """Settings for the weather-label training run with a held-out validation split.

    All values are class attributes; instances are handed to `read_data.Reader`
    and read by the training script below.
    """

    # --- batching and device ---
    batch_size = 8
    steps = "-1"
    gpu = '/gpu:0'

    # --- logging and checkpoints ---
    logdir = "./log"
    params_dir = "./params/"
    # Weights are loaded from and written back to the same .npy file.
    load_filename = save_filename = params_dir + "vgg16_weather.npy"

    # --- input data ---
    imgs_path = "./train-jpg/"
    labels_file = "./train_validation_v2_bin.csv"

    # --- iteration schedule ---
    max_iteration = 1000
    summary_iters = 50

    # Label columns to use; refer to synset.txt for the order of labels.
    # 6: clear, 7: cloudy, 11: haze, 12: partly_cloudy
    usecols = [6, 7, 11, 12]
# Training data source, configured by the defaults declared on Config.
config = Config()
reader = read_data.Reader(config)

# Validation data source: identical settings, pointed at the held-out label CSV.
validation_config = Config()
validation_config.labels_file = "./validation_train_v2_bin.csv"
validation_reader = read_data.Reader(validation_config)

# Graph inputs: a batch of 224x224 RGB images, the matching target rows (one column
# per entry in config.usecols), and a boolean train/eval switch handed to the network.
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
true_out = tf.placeholder(tf.float32, [None, len(config.usecols)])
train_mode = tf.placeholder(tf.bool)

vgg = vgg16.Vgg16(config.load_filename, output_size=len(config.usecols))
vgg.build(images, train_mode)
print("%s variables" % (vgg.get_var_count(),))
with tf.name_scope('loss'):
    # Cross-entropy between the targets and vgg.prob (presumably the softmax output
    # of the network -- confirm in vgg16_trainable). The probabilities are clipped
    # away from 0 first: tf.log(0) is -inf and 0 * -inf is NaN, which would poison
    # the mean and every gradient derived from it.
    safe_prob = tf.clip_by_value(vgg.prob, 1e-10, 1.0)
    cost = tf.reduce_mean(-tf.reduce_sum(true_out * tf.log(safe_prob), [1]))
    tf.summary.scalar('loss', cost)
    # Placeholder value until the first validation pass overwrites it. No graph-time
    # scalar summary is registered for it: tf.summary.scalar on a Python number
    # freezes that number into the graph, so it would only ever record 0.
    valid_f2_score = 0
with tf.name_scope('train'):
    rate = 1e-3
    train = tf.train.GradientDescentOptimizer(rate).minimize(cost)
    tf.summary.scalar('learning_rate', rate)
    tf.summary.scalar('batch_size', config.batch_size)

    # Single op that evaluates every summary registered above.
    merged = tf.summary.merge_all()

# NOTE(review): the model ops were built before entering this device scope, so the
# scope only applies to ops created inside it (e.g. the initializer) -- confirm
# whether explicit placement of the network was intended.
with tf.device(config.gpu):
    sess = tf.Session()
    writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(tf.global_variables_initializer())

    # Row count of validation_train_v2_bin.csv (the 10% split written earlier).
    num_valid_samples = 4048
    # True ceiling division. The previous np.ceil(4048/batch_size) floored first
    # under Python 2 integer division, silently dropping a partial final batch.
    num_valid_batches = (num_valid_samples + config.batch_size - 1) // config.batch_size
    # Validate and checkpoint every this many iterations.
    # NOTE(review): weights are only saved at these iterations, so training done
    # after the last multiple of 500 is not persisted.
    validation_iters = 500
    num_outputs = len(config.usecols)

    print("start training")
    # start training
    for idx in range(config.max_iteration):
        imgs, labels = reader.random_batch()
        # feed data into the model
        feed_dict = {
            images: imgs,
            true_out: labels,
            train_mode: True
        }
        sess.run(train, feed_dict=feed_dict)
        if idx % config.summary_iters == 0:
            # Fetch summaries and loss in ONE pass so the printed cost is exactly the
            # value recorded in the event file (two separate train-mode passes can
            # disagree and cost twice the compute).
            result, loss = sess.run([merged, cost], feed_dict=feed_dict)

            print("%s cost: %s" % (idx, loss))
            writer.add_summary(result, idx)
            if idx % validation_iters == 0:
                pred_chunks = []
                true_chunks = []
                for _ in range(num_valid_batches):
                    valid_img, valid_label = validation_reader.batch()
                    valid_feed_dict = {
                        images: valid_img,
                        true_out: valid_label,
                        train_mode: False
                    }
                    valid_prob = sess.run(vgg.prob, feed_dict=valid_feed_dict)
                    # Collect flat chunks and join once after the loop instead of
                    # re-copying the whole buffer with np.append on every batch.
                    pred_chunks.append(np.ravel(get_one_hot_pred(valid_prob)))
                    true_chunks.append(np.ravel(valid_label))
                valid_pred = np.reshape(
                    np.concatenate(pred_chunks).astype(np.float64), [-1, num_outputs])
                valid_true_out = np.reshape(
                    np.concatenate(true_chunks).astype(np.float64), [-1, num_outputs])
                valid_f2_score = f2_score(valid_true_out, valid_pred)
                print("validation_f2_score: %s" % (valid_f2_score,))
                # Record the real score in the event file as a hand-built summary.
                f2_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='validation/f2_score',
                                     simple_value=float(valid_f2_score))])
                writer.add_summary(f2_summary, idx)
                vgg.save_npy(sess, config.save_filename)

    # Push any buffered events to disk so the log is complete once the cell ends.
    writer.flush()


conv5_1 (3, 3, 512, 512) (512,)
fc6 (25088, 4096) (4096,)
conv5_3 (3, 3, 512, 512) (512,)
conv5_2 (3, 3, 512, 512) (512,)
fc8 (4096, 1000) (1000,)
fc9 (1000, 4) (4,)
fc7 (4096, 4096) (4096,)
conv4_1 (3, 3, 256, 512) (512,)
conv4_2 (3, 3, 512, 512) (512,)
conv4_3 (3, 3, 512, 512) (512,)
conv3_3 (3, 3, 256, 256) (256,)
conv3_2 (3, 3, 256, 256) (256,)
conv3_1 (3, 3, 128, 256) (256,)
conv1_1 (3, 3, 3, 64) (64,)
conv1_2 (3, 3, 64, 64) (64,)
conv2_2 (3, 3, 128, 128) (128,)
conv2_1 (3, 3, 64, 128) (128,)
138361548 variables
start training
0 cost: 0.115634
validation_f2_score: 0.908349802372
('file saved', './params/vgg16_weather.npy')
50 cost: 0.0792168
100 cost: 0.0519398
150 cost: 0.029727
200 cost: 0.0590279
250 cost: 0.16637
300 cost: 0.356356
350 cost: 0.15582
400 cost: 0.0381264
450 cost: 0.0455907
500 cost: 0.0641322
validation_f2_score: 0.935770750988
('file saved', './params/vgg16_weather.npy')
550 cost: 0.0907503
600 cost: 0.0607102
650 cost: 0.345959
700 cost: 0.0267451
750 cost: 0

In [1]:
#train without validation set
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
import random
import math

import vgg16_trainable as vgg16
import read_data
import utils
import csv

from sklearn.metrics import fbeta_score

def f2_score(y_true, y_pred):
    """Return the sample-averaged F-beta score with beta=2.

    Args:
        y_true: ground-truth indicator rows (any array-like).
        y_pred: predicted indicator rows, same layout as `y_true`.

    Returns:
        The value of sklearn's `fbeta_score(..., beta=2, average='samples')`.
    """
    # Convert up front: fbeta_score throws a confusing error if inputs are not numpy arrays.
    truth = np.array(y_true)
    predicted = np.array(y_pred)
    # average='samples' is required here; any other averaging mode generates bogus results.
    return fbeta_score(truth, predicted, beta=2, average='samples')

def get_one_hot_pred(prob):
    """Turn each row of scores into a one-hot vector marking its arg-max.

    Args:
        prob: iterable of equal- or variable-length score rows.

    Returns:
        A list with one float numpy array per input row; each array is all zeros
        except for a 1 at the index of that row's largest score.
    """
    def _one_hot(row):
        encoded = np.zeros(len(row))
        encoded[np.argmax(row)] = 1
        return encoded

    return [_one_hot(row) for row in prob]
# Enumerate GPUs in PCI bus order (see issue #152) and expose only device "2" to
# this process.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="2",
)

class Config:
    """Settings for the weather-label training run on the full label file.

    All values are class attributes; instances are handed to `read_data.Reader`
    and read by the training script below.
    """

    # --- batching and device ---
    batch_size = 8
    steps = "-1"
    gpu = '/gpu:0'

    # --- logging and checkpoints ---
    logdir = "./log"
    params_dir = "./params/"
    # Weights are loaded from and written back to the same .npy file.
    load_filename = save_filename = params_dir + "vgg16_weather.npy"

    # --- input data ---
    imgs_path = "./train-jpg/"
    # NOTE(review): this file is not written anywhere in the visible notebook;
    # presumably the one-hot table for all training images -- confirm its origin.
    labels_file = "./train_v2_bin.csv"

    # --- iteration schedule ---
    max_iteration = 501
    summary_iters = 50

    # Label columns to use; refer to synset.txt for the order of labels.
    # 6: clear, 7: cloudy, 11: haze, 12: partly_cloudy
    usecols = [6, 7, 11, 12]
# Training data source, configured by the defaults declared on Config.
config = Config()
reader = read_data.Reader(config)

# Evaluation data source: identical settings, pointed at the validation label CSV.
# NOTE(review): if the training label file also contains these images, the score
# computed on this reader is not a held-out metric -- confirm.
validation_config = Config()
validation_config.labels_file = "./validation_train_v2_bin.csv"
validation_reader = read_data.Reader(validation_config)

# Graph inputs: a batch of 224x224 RGB images, the matching target rows (one column
# per entry in config.usecols), and a boolean train/eval switch handed to the network.
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
true_out = tf.placeholder(tf.float32, [None, len(config.usecols)])
train_mode = tf.placeholder(tf.bool)

vgg = vgg16.Vgg16(config.load_filename, output_size=len(config.usecols))
vgg.build(images, train_mode)
print("%s variables" % (vgg.get_var_count(),))
with tf.name_scope('loss'):
    # Cross-entropy between the targets and vgg.prob (presumably the softmax output
    # of the network -- confirm in vgg16_trainable). The probabilities are clipped
    # away from 0 first: tf.log(0) is -inf and 0 * -inf is NaN, which would poison
    # the mean and every gradient derived from it.
    safe_prob = tf.clip_by_value(vgg.prob, 1e-10, 1.0)
    cost = tf.reduce_mean(-tf.reduce_sum(true_out * tf.log(safe_prob), [1]))
    tf.summary.scalar('loss', cost)
    # Placeholder value until the first validation pass overwrites it. No graph-time
    # scalar summary is registered for it: tf.summary.scalar on a Python number
    # freezes that number into the graph, so it would only ever record 0.
    valid_f2_score = 0
with tf.name_scope('train'):
    rate = 1e-3
    train = tf.train.GradientDescentOptimizer(rate).minimize(cost)
    tf.summary.scalar('learning_rate', rate)
    tf.summary.scalar('batch_size', config.batch_size)

    # Single op that evaluates every summary registered above.
    merged = tf.summary.merge_all()

# NOTE(review): the model ops were built before entering this device scope, so the
# scope only applies to ops created inside it (e.g. the initializer) -- confirm
# whether explicit placement of the network was intended.
with tf.device(config.gpu):
    sess = tf.Session()
    writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(tf.global_variables_initializer())

    # Row count of validation_train_v2_bin.csv (the 10% split written earlier).
    num_valid_samples = 4048
    # True ceiling division. The previous np.ceil(4048/batch_size) floored first
    # under Python 2 integer division, silently dropping a partial final batch.
    num_valid_batches = (num_valid_samples + config.batch_size - 1) // config.batch_size
    # Evaluate and checkpoint every this many iterations.
    # NOTE(review): weights are only saved at these iterations, so training done
    # after the last multiple of 500 is not persisted.
    validation_iters = 500
    num_outputs = len(config.usecols)

    print("start training")
    # start training
    for idx in range(config.max_iteration):
        imgs, labels = reader.random_batch()
        # feed data into the model
        feed_dict = {
            images: imgs,
            true_out: labels,
            train_mode: True
        }
        sess.run(train, feed_dict=feed_dict)
        if idx % config.summary_iters == 0:
            # Fetch summaries and loss in ONE pass so the printed cost is exactly the
            # value recorded in the event file (two separate train-mode passes can
            # disagree and cost twice the compute).
            result, loss = sess.run([merged, cost], feed_dict=feed_dict)

            print("%s cost: %s" % (idx, loss))
            writer.add_summary(result, idx)
            if idx % validation_iters == 0:
                pred_chunks = []
                true_chunks = []
                for _ in range(num_valid_batches):
                    valid_img, valid_label = validation_reader.batch()
                    valid_feed_dict = {
                        images: valid_img,
                        true_out: valid_label,
                        train_mode: False
                    }
                    valid_prob = sess.run(vgg.prob, feed_dict=valid_feed_dict)
                    # Collect flat chunks and join once after the loop instead of
                    # re-copying the whole buffer with np.append on every batch.
                    pred_chunks.append(np.ravel(get_one_hot_pred(valid_prob)))
                    true_chunks.append(np.ravel(valid_label))
                valid_pred = np.reshape(
                    np.concatenate(pred_chunks).astype(np.float64), [-1, num_outputs])
                valid_true_out = np.reshape(
                    np.concatenate(true_chunks).astype(np.float64), [-1, num_outputs])
                valid_f2_score = f2_score(valid_true_out, valid_pred)
                print("validation_f2_score: %s" % (valid_f2_score,))
                # Record the real score in the event file as a hand-built summary.
                f2_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='validation/f2_score',
                                     simple_value=float(valid_f2_score))])
                writer.add_summary(f2_summary, idx)
                vgg.save_npy(sess, config.save_filename)

    # Push any buffered events to disk so the log is complete once the cell ends.
    writer.flush()

conv5_1 (3, 3, 512, 512) (512,)
fc6 (25088, 4096) (4096,)
conv5_3 (3, 3, 512, 512) (512,)
conv5_2 (3, 3, 512, 512) (512,)
fc8 (4096, 1000) (1000,)
fc9 (1000, 4) (4,)
fc7 (4096, 4096) (4096,)
conv4_1 (3, 3, 256, 512) (512,)
conv4_2 (3, 3, 512, 512) (512,)
conv4_3 (3, 3, 512, 512) (512,)
conv3_3 (3, 3, 256, 256) (256,)
conv3_2 (3, 3, 256, 256) (256,)
conv3_1 (3, 3, 128, 256) (256,)
conv1_1 (3, 3, 3, 64) (64,)
conv1_2 (3, 3, 64, 64) (64,)
conv2_2 (3, 3, 128, 128) (128,)
conv2_1 (3, 3, 64, 128) (128,)
138361548 variables
start training
0 cost: 0.0174124
validation_f2_score: 0.9375
('file saved', './params/vgg16_weather.npy')
50 cost: 0.0341956
100 cost: 0.399679
150 cost: 0.0184284
200 cost: 0.0593757
250 cost: 0.0116394
300 cost: 0.00857606
350 cost: 0.0663076
400 cost: 0.158414
450 cost: 0.0911786
500 cost: 0.158284
validation_f2_score: 0.936017786561
('file saved', './params/vgg16_weather.npy')
