### Imports

In [3]:
import numpy as np
import pandas as pd
import pickle
import heapq
import xgboost as xgb
import h5py
import time
from tqdm import tqdm_notebook as tqdm

from keras import backend as K
from keras.models import Model, load_model
from keras.models import model_from_json
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler

import tensorflow as tf

# Notebook-wide seed: applied to NumPy's global RNG here and reused further down
# when seeding TensorFlow.
random_seed = 54321
np.random.seed(random_seed)

Using TensorFlow backend.


In [2]:
# Seed TensorFlow with the same value used for NumPy.
tf.set_random_seed(random_seed)
# Build a session whose GPU options have allow_growth enabled.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# Make Keras use this session and the 'tf' image dimension ordering.
K.set_session(session)
K.set_image_dim_ordering('tf')

### Reload models and data

In [3]:
## load json and create model
#with open("models/model_denoise.json", "r") as json_file:
#    loaded_model_json = json_file.read()
#model_denoise = model_from_json(loaded_model_json)
## load weights into new model
#with open('models/model_denoise_weights_tt.pickle', 'rb') as handle:
#    model_weights = pickle.load(handle)
#model_denoise.set_weights(model_weights)
#print("Loaded model from disk")

Loaded model from disk


In [4]:
# Rebuild the network architecture from its JSON description. The weights are
# not needed right now: they are stored separately and set per ensemble member
# at prediction time.
with open("models/model.json", "r") as json_file:
    loaded_model_json = json_file.read()
model_f = model_from_json(loaded_model_json)

In [5]:
class ModelHistory(Callback):
    """Keras callback that retains the weights of the best epochs seen so far.

    Snapshots live in ``self.models`` as a ``heapq`` min-heap of
    ``(rank, val_loss, weights)`` tuples with ``rank = 1 - val_loss``, so
    ``self.models[0]`` is always the worst snapshot currently retained.

    Parameters
    ----------
    listSize : int
        Maximum number of snapshots to keep.

    Notes
    -----
    Tuples are ordered by ``rank`` first. An exact tie on both ``rank`` and
    ``val_loss`` would fall through to comparing the weight lists themselves.
    """

    def __init__(self, listSize=10):
        super(ModelHistory, self).__init__()
        self.listSize = listSize
        self.models = []

    def on_epoch_end(self, epoch, logs=None):
        """Store this epoch's weights if it belongs to the best ``listSize`` epochs.

        Parameters
        ----------
        epoch : int
            Epoch index supplied by Keras (unused).
        logs : dict or None
            Metrics for the epoch; only ``'val_loss'`` is read.
        """
        lastLoss = (logs or {}).get('val_loss')
        if lastLoss is None:
            # Without a validation loss there is nothing to rank this epoch by.
            return
        rank = 1 - lastLoss

        if len(self.models) < self.listSize:
            # Heap not full yet: keep every snapshot, even one worse than the
            # current worst, so the heap really ends up holding the top-N.
            heapq.heappush(self.models, (rank, lastLoss, self.model.get_weights()))
        elif self.models and rank > self.models[0][0]:
            # Heap full and the new snapshot beats the worst retained one:
            # push the new snapshot and evict the worst in a single step.
            heapq.heappushpop(self.models, (rank, lastLoss, self.model.get_weights()))

In [6]:
# Restore the snapshot heap saved by a previous training run into a fresh
# ModelHistory container.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
modelEnsemble = ModelHistory(listSize=26)
with open('models/modelEnsemble.pickle', 'rb') as ensemble_file:
    saved_snapshots = pickle.load(ensemble_file)
    modelEnsemble.models = saved_snapshots

EOFError: Ran out of input

In [7]:
# Restore the pickled model that is later applied to the ensemble predictions.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open('models/modelXgb4.pickle', 'rb') as xgb_file:
    xgb4 = pickle.load(xgb_file)

### Load Test Data

In [129]:
# Read the test set; each row carries the two flattened bands, an id and inc_angle
# (see the head() output below).
test_df = pd.read_json("Data/test/test.json")

In [9]:
test_df.head()

Unnamed: 0,band_1,band_2,id,inc_angle
0,"[-15.863251, -15.201077, -17.887735, -19.17248...","[-21.629612, -21.142353, -23.908337, -28.34524...",5941774d,34.9664
1,"[-26.058969497680664, -26.058969497680664, -26...","[-25.754207611083984, -25.754207611083984, -25...",4023181e,32.615072
2,"[-14.14109992980957, -15.064241409301758, -17....","[-14.74563980102539, -14.590410232543945, -14....",b20200e4,37.505433
3,"[-12.167478, -13.706167, -16.54837, -13.572674...","[-24.32222, -26.375538, -24.096739, -23.8769, ...",e7f018bb,34.4739
4,"[-23.37459373474121, -26.02718162536621, -28.1...","[-25.72234344482422, -27.011577606201172, -23....",4371c8c3,43.918874


In [10]:
def get_bands(train_df, img_size=75):
    """Convert the flattened ``band_1`` / ``band_2`` columns into image arrays.

    Each image is shifted by a per-image reference level, defined as the
    largest value found in either band of that image minus 10. The shifted
    bands are additionally mapped through ``10 ** (x / 10)``, and values of
    that transform that are not above 0.01 are set to 0.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with ``band_1`` and ``band_2`` columns, each cell holding a flat
        sequence of ``img_size * img_size`` numbers. The index is not used,
        so duplicate or non-default index labels are fine.
    img_size : int, optional
        Side length of the square images (default 75).

    Returns
    -------
    tuple
        ``(band_1, band_2, band_1_t, band_2_t, band_3, X)`` where the first
        five arrays have shape ``(n, img_size, img_size)``, ``band_3`` is
        ``band_1_t - band_2_t`` and ``X`` stacks
        ``(band_1, band_2, band_1_t, band_2_t)`` on a trailing axis.
    """
    image_shape = (-1, img_size, img_size)
    raw_1 = np.array(train_df['band_1'].tolist(), dtype=float).reshape(image_shape)
    raw_2 = np.array(train_df['band_2'].tolist(), dtype=float).reshape(image_shape)

    # Per-image reference level, computed in one vectorized pass instead of a
    # row-wise apply that re-looked each row up by index label.
    reference = np.maximum(raw_1.max(axis=(1, 2)), raw_2.max(axis=(1, 2))) - 10
    reference = reference.reshape(-1, 1, 1)  # broadcast over both image axes

    band_1 = raw_1 - reference
    band_2 = raw_2 - reference

    band_1_t = 10**(band_1/10)
    band_2_t = 10**(band_2/10)
    # Zero out transformed values that do not exceed the 0.01 floor.
    band_1_t = np.where(band_1_t > 0.01, band_1_t, 0)
    band_2_t = np.where(band_2_t > 0.01, band_2_t, 0)
    band_3 = band_1_t - band_2_t

    X = np.stack((band_1, band_2, band_1_t, band_2_t), axis=3)

    return band_1, band_2, band_1_t, band_2_t, band_3, X

In [11]:
# Coerce inc_angle to numeric; values that cannot be parsed become NaN.
test_df['inc_angle_f'] = pd.to_numeric(test_df['inc_angle'], errors='coerce')
print("missing values in inc_angle: ", test_df['inc_angle_f'].isnull().sum())
# Fill missing angles with 0. Assign the result back explicitly: an inplace
# replace on a column selection is chained assignment and is not guaranteed
# to write through to test_df.
test_df['inc_angle_f'] = test_df['inc_angle_f'].fillna(0)
test_df.tail()

missing values in inc_angle:  0


Unnamed: 0,band_1,band_2,id,inc_angle,inc_angle_f
8419,"[-25.082357, -26.71583, -24.599827, -25.082571...","[-25.860718, -23.29442, -25.860861, -25.334354...",16ee9b50,34.7955,34.7955
8420,"[-21.031391143798828, -21.031391143798828, -21...","[-23.755836486816406, -23.755836486816406, -23...",5a599eb7,32.246683,32.246683
8421,"[-28.609278, -26.514626, -26.514679, -26.83061...","[-28.609278, -29.437183, -30.35239, -31.375494...",df30d6dd,39.5032,39.5032
8422,"[-27.068821, -27.068892, -23.970854, -22.38730...","[-29.991381, -29.163599, -24.886002, -27.71266...",18af95b1,33.638,33.638
8423,"[-25.438865661621094, -25.438865661621094, -25...","[-23.85527801513672, -23.85527801513672, -23.8...",27d788c8,36.758181,36.758181


In [12]:
# Only the stacked 4-channel tensor is needed for prediction.
_, _, _, _, _, X_test = get_bands(test_df)
# The batch generator reads column 0 as the label and column 1 as the angle.
# The test frame shown above has no 'is_iceberg' column, so reindex() is used:
# it creates the missing column filled with NaN, whereas .loc with an absent
# label raises KeyError on current pandas.
y_angle_test = test_df.reindex(columns=['is_iceberg', 'inc_angle_f'])
y_angle_test['index'] = y_angle_test.index

In [13]:
# Release the bulky per-pixel columns (already converted into X_test) but keep
# the frame itself: the submission cell at the end still reads test_df['id'].
test_df = test_df.drop(['band_1', 'band_2'], axis=1)

In [46]:
def get_prediction(model, weights, X, y, batch_size=32):
    """Load ``weights`` into ``model`` and predict on ``X`` in fixed order.

    Batches come from the notebook-level ``datagen_angle_val`` generator with
    ``shuffle=False``.

    Parameters
    ----------
    model : object
        Model exposing ``set_weights`` and ``predict_generator``.
    weights : list
        Weights passed to ``model.set_weights``.
    X : array-like
        Image tensor; only ``len(X)`` is used here to size the run.
    y : array-like
        Per-sample targets forwarded to the generator's ``flow``.
    batch_size : int, optional
        Samples per batch (default 32).

    Returns
    -------
    Whatever ``model.predict_generator`` returns.
    """
    model.set_weights(weights)
    # Integer ceil(len(X) / batch_size): enough batches for one pass over X.
    # The previous float `len(X)/31` did not match the batch size of 32 and
    # requested extra batches.
    # NOTE(review): assumes the underlying iterator yields a final partial batch - confirm.
    steps = (len(X) + batch_size - 1) // batch_size
    batches = datagen_angle_val.flow(X, y, batch_size=batch_size, shuffle=False)
    return model.predict_generator(batches, steps=steps, verbose=0)

In [15]:
# Validation/test-time image generator: every normalisation and augmentation
# switch listed here is turned off.
passthrough_options = dict(
    samplewise_center=False,
    samplewise_std_normalization=False,
    rotation_range=0,
    horizontal_flip=False,
    vertical_flip=False,
    fill_mode='nearest',
)
datagen_val = ImageDataGenerator(**passthrough_options)

# Custom generator wrapper for Keras' *_generator APIs.
# The ABC lives in collections.abc; the plain `collections` alias was removed in Python 3.10.
from collections.abc import Generator


class Datagen_angle(Generator):
    """Wrap an image generator and reshape its batches into the model's inputs.

    Each batch from the wrapped iterator is expected to be
    ``(images, targets)`` where ``images`` has four channels on its last axis
    (``band_1, band_2, band_1_t, band_2_t``) and ``targets`` has the label in
    column 0 and the incidence angle in column 1.

    Parameters
    ----------
    imagegen : object or None
        Object exposing ``flow(x, y, batch_size=..., shuffle=...)``. When
        omitted, a default ``ImageDataGenerator()`` instance is created.
    """

    def __init__(self, imagegen=None):
        # Build the default lazily: an *instance* is needed to call flow().
        self.imagegen = ImageDataGenerator() if imagegen is None else imagegen
        self.generator = None

    def flow(self, x, y, batch_size=8, shuffle=True):
        """Start iterating over ``(x, y)`` and return ``self`` as the generator."""
        self.generator = self.imagegen.flow(x, y, batch_size=batch_size, shuffle=shuffle)
        return self

    def send(self, ignored):
        """Return the next ``([raw_bands, linear_bands, angle], label)`` batch."""
        if self.generator is None:
            raise RuntimeError("call flow() before iterating over Datagen_angle")
        batch = next(self.generator)
        images, targets = batch[0], batch[1]
        band_diff = images[:, :, :, 2] - images[:, :, :, 3]  # band_1_t - band_2_t
        raw_bands = np.stack((images[:, :, :, 0], images[:, :, :, 1]), axis=3)
        linear_bands = np.stack((images[:, :, :, 2], images[:, :, :, 3], band_diff), axis=3)
        return [raw_bands, linear_bands, targets[:, 1]], targets[:, 0]

    def throw(self, type=None, value=None, traceback=None):
        """End iteration whenever an exception is thrown into the generator."""
        raise StopIteration
    

# Fit the image generator on the test tensor.
# NOTE(review): no featurewise option is enabled on datagen_val above, so this
# fit presumably has no effect on the generated batches - confirm.
datagen_val.fit(X_test)

# Wrap the image generator so batches come out in the multi-input layout built by Datagen_angle.send.
datagen_angle_val = Datagen_angle(imagegen=datagen_val)

In [31]:
len(modelEnsemble.models)

26

In [50]:
# Predict the test set with ensemble members 0-9 and persist each result as its
# own dataset. Member 0 opens the file in 'w' mode so any previous file is
# replaced; later members append. The file is reopened per member so finished
# results are on disk even if a later member fails.
# `idx` is left at 9 afterwards, as before.
for idx in tqdm(range(0, 10), ascii=True):
    member_weights = modelEnsemble.models[idx][2]
    pred = get_prediction(model_f, member_weights, X_test, y_angle_test)[:X_test.shape[0]]
    pred = np.asarray(pred)
    dataset_name = 'ensemble_data_%02d' % idx
    file_mode = 'w' if idx == 0 else 'a'
    with h5py.File('tmp_data/ensemble_test_data.hd5', file_mode) as hf:
        hf.create_dataset(dataset_name,  data=pred)

100%|##################################################################################| 9/9 [02:49<00:00, 18.79s/it]


In [92]:
# Predict with ensemble member 10 and append the result to the HDF5 store.
idx = 10
pred = get_prediction(model_f, modelEnsemble.models[idx][2], X_test, y_angle_test)[:X_test.shape[0]]
pred = np.array(pred)
dataset_name = 'ensemble_data_%02d' % idx
with h5py.File('tmp_data/ensemble_test_data.hd5', 'a') as hf:
    # create_dataset refuses to overwrite an existing name, so drop any result
    # left by an earlier run of this cell to keep it re-runnable.
    if dataset_name in hf:
        del hf[dataset_name]
    hf.create_dataset(dataset_name,  data=pred)

In [51]:
#idx2=2
#with h5py.File('tmp_data/ensemble_test_data.hd5', 'r') as hf:
#    ensemble_test = [hf['ensemble_data_%02d' % idx2][:]]

In [53]:
idx

9

In [55]:
# Continue with the nine members following the last processed index.
# NOTE: this cell advances the notebook-level `idx`, so its range depends on
# which cell ran before it.
idx += 1
for idx in tqdm(range(idx, idx+9), ascii=True):
    model = modelEnsemble.models[idx]
    pred = get_prediction(model_f, model[2], X_test, y_angle_test)[:X_test.shape[0]]
    pred = np.array(pred)
    dataset_name = 'ensemble_data_%02d' % idx
    with h5py.File('tmp_data/ensemble_test_data.hd5', 'a') as hf:
        # create_dataset refuses to overwrite an existing name, so drop any
        # result left by an earlier run before writing the new one.
        if dataset_name in hf:
            del hf[dataset_name]
        hf.create_dataset(dataset_name,  data=pred)


  0%|                                                                                          | 0/9 [00:00<?, ?it/s]
Exception in thread Thread-64:
Traceback (most recent call last):
  File "D:\Anaconda3\envs\tf-gpu\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "D:\Anaconda3\envs\tf-gpu\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "D:\Anaconda3\envs\tf-gpu\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|##################################################################################| 9/9 [02:49<00:00, 18.83s/it]


In [106]:
# Finish with every remaining member after the last processed index.
# NOTE: this cell advances the notebook-level `idx`, so its range depends on
# which cell ran before it.
idx += 1
for idx in tqdm(range(idx, len(modelEnsemble.models)), ascii=True):
    model = modelEnsemble.models[idx]
    pred = get_prediction(model_f, model[2], X_test, y_angle_test)[:X_test.shape[0]]
    pred = np.array(pred)
    dataset_name = 'ensemble_data_%02d' % idx
    with h5py.File('tmp_data/ensemble_test_data.hd5', 'a') as hf:
        # create_dataset refuses to overwrite an existing name, so drop any
        # result left by an earlier run before writing the new one.
        if dataset_name in hf:
            del hf[dataset_name]
        hf.create_dataset(dataset_name,  data=pred)


  0%|                                                                                          | 0/4 [00:00<?, ?it/s]
Exception in thread Thread-75:
Traceback (most recent call last):
  File "D:\Anaconda3\envs\tf-gpu\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "D:\Anaconda3\envs\tf-gpu\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "D:\Anaconda3\envs\tf-gpu\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|##################################################################################| 4/4 [01:14<00:00, 18.53s/it]


In [None]:
#pred = get_prediction(model_f, modelEnsemble.models[11][2], X_test, y_angle_test)[:X_test.shape[0]]
#pred = np.array(pred)
#dataset_name = 'ensemble_data_%02d' % 11
#with h5py.File('tmp_data/ensemble_test_data.hd5', 'a') as hf:
#    hf.create_dataset(dataset_name,  data=pred)

In [103]:
# Manual override of the running member index.
# NOTE(review): looks like a leftover from resuming an interrupted run out of
# order - confirm it is still needed.
idx=21

In [114]:
# Spot check: read back the stored predictions of a single member (index 25).
# `ensemble_test` is reassigned further down from the full list of members.
idx2=25
with h5py.File('tmp_data/ensemble_test_data.hd5', 'r') as hf:
    ensemble_test = [hf['ensemble_data_%02d' % idx2][:]]

In [115]:
# Read every member's stored predictions back, in member order.
n_members = len(modelEnsemble.models)
with h5py.File('tmp_data/ensemble_test_data.hd5', 'r') as hf:
    ensemble_test_list = [
        hf['ensemble_data_%02d' % member][:]
        for member in tqdm(range(0, n_members), ascii=True)
    ]

100%|##############################################################################| 26/26 [00:00<00:00, 2165.23it/s]


In [125]:
# Stack the per-member predictions (members on axis 0), move samples to the
# front and flatten to a 2-D (samples, members) feature matrix.
stacked = np.array(ensemble_test_list)
n_models, n_samples = stacked.shape[0], stacked.shape[1]
ensemble_test = np.swapaxes(stacked, 0, 1).reshape(n_samples, n_models)
ensemble_test.shape

(8424, 26)

In [127]:
# Per-class probabilities from the second-stage model; column 1 is what the
# submission cell writes out as `is_iceberg`.
test_probs = xgb4.predict_proba(ensemble_test)
predictions = test_probs
# Hard class predictions from the same model.
pseudo_labels = xgb4.predict(ensemble_test)

In [136]:
predictions[:,1]

array([ 0.02090923,  0.12176685,  0.02090923, ...,  0.02090923,
        0.97567028,  0.97567028], dtype=float32)

In [135]:
# The test frame may have been deleted earlier to free memory; in that case
# re-read just the ids so this cell also works on a clean top-to-bottom run.
try:
    test_ids = test_df['id']
except NameError:
    test_ids = pd.read_json("Data/test/test.json")['id']

# One row per test image: its id and the predicted probability from column 1.
submission = pd.DataFrame({'id': test_ids, 'is_iceberg': predictions[:,1]})
submission.to_csv("submission.ensemble.xgboost.v24.csv", index=False)
# Keep the preview as the last expression so the notebook actually displays it.
submission.head(10)