In [1]:
# General data manipulation
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import os
from os import listdir
from os.path import isfile, join 
import sys
import time
from PIL import Image # pip install pillow
from keras.utils import np_utils
os.environ['KMP_DUPLICATE_LIB_OK']='True'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import keras.backend as K
import tensorflow as tf

def categorical_focal_loss(gamma=2.0, alpha=0.25):
    """
    Build a focal-loss function for multiclass classification.

    Formula (applied element-wise, then summed over axis 1):
        loss = -alpha * ((1 - p) ^ gamma) * log(p)

    Parameters:
        gamma -- focusing parameter, the exponent of the modulating factor (1 - p)
        alpha -- weighting factor, the same as in balanced cross entropy

    Default values:
        gamma -- 2.0 as mentioned in the paper
        alpha -- 0.25 as mentioned in the paper

    Returns:
        A closure ``focal_loss(y_true, y_pred)`` built on the Keras backend.
    """
    def focal_loss(y_true, y_pred):
        # Keep predictions inside [eps, 1 - eps] so that log() stays finite
        # and backpropagation does not produce NaN.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1.0 - eps)
        # Modulating factor (1 - p)^gamma scaled by alpha; multiplying by
        # y_true zeroes every entry where y_true is 0.
        focal_weight = alpha * y_true * K.pow((1 - probs), gamma)
        # Ordinary cross-entropy term.
        log_term = -y_true * K.log(probs)
        # Weighted cross entropy, reduced over axis 1.
        return K.sum(focal_weight * log_term, axis=1)

    return focal_loss

In [3]:
# convert PIL.Image object to numpy.Array, for training
def img2arr(img):
    """Return the pixel data of ``img`` as a uint8 array shaped (height, width, -1).

    ``img`` must provide ``getdata()``, ``height`` and ``width``. The last
    axis holds whatever values ``getdata()`` yields for each pixel.
    """
    flat_pixels = np.asarray(img.getdata(), dtype=np.uint8)
    n_rows, n_cols = img.height, img.width
    return flat_pixels.reshape(n_rows, n_cols, -1)

def read_train_pickle(Type, Energy, DIR = os.getcwd() + '/idao_dataset/'):
    """Unpickle the training file stored for one (Type, Energy) combination.

    The path is ``DIR + 'train_pickle/' + Type + str(Energy) + '.pkl'``, so
    DIR has to end with a path separator. The default DIR is evaluated once,
    when the function is defined, from the working directory at that moment.
    """
    pickle_path = ''.join([DIR, 'train_pickle/', Type, str(Energy), '.pkl'])
    return pd.read_pickle(pickle_path)

In [4]:
from keras.models import load_model
def predict(img_array, ids, isTrain):
    """Classify every image and return the predictions as one DataFrame.

    Relies on the module-level models ``is_high_model``, ``is_low_model``,
    ``energy_1vs3_model``, ``energy_6vs10_model`` and ``energy_20vs30_model``
    being loaded before the call.

    Parameters:
        img_array -- batch of images passed to every model's ``predict``
        ids       -- sequence of identifiers, indexed in step with img_array
        isTrain   -- flag fed to the energy models as a second input; when
                     truthy class indices 1 and 2 are masked out, otherwise
                     indices 0 and 3

    Returns:
        DataFrame with one row per image and the columns ``id``,
        ``classification_predictions``, ``regression_predictions``,
        ``isHigh``, ``isLow``, ``energy_1vs3``, ``energy_6vs10`` and
        ``energy_20vs30`` (the last three hold the masked score vectors).
    """
    n_samples = len(img_array)
    isTrain_array = np.array([isTrain]*n_samples).reshape((n_samples, 1))
    isHigh = is_high_model.predict(img_array)
    isLow = is_low_model.predict(img_array)
    energy_1vs3 = energy_1vs3_model.predict([img_array, isTrain_array])
    energy_6vs10 = energy_6vs10_model.predict([img_array, isTrain_array])
    energy_20vs30 = energy_20vs30_model.predict([img_array, isTrain_array])

    # Zero the scores of the two class indices that must not be chosen, so
    # the argmax below can only land on the remaining ones.
    ignore_i = [1, 2] if isTrain else [0, 3]
    for scores in (energy_1vs3, energy_6vs10, energy_20vs30):
        scores[:, ignore_i] = 0.0

    energy_1vs3_i = np.argmax(energy_1vs3, axis=-1)
    energy_6vs10_i = np.argmax(energy_6vs10, axis=-1)
    energy_20vs30_i = np.argmax(energy_20vs30, axis=-1)

    # Collect plain row dicts and build the frame once: DataFrame.append in a
    # loop copies the whole frame per row and no longer exists in pandas 2.0.
    rows = []
    for i in range(n_samples):
        # Pick the energy band from the two gates, then read the class index
        # of the matching model. (low_energy, step) are the lower energy of
        # the band and the distance to its higher energy.
        if isHigh[i].round():
            cur_i, low_energy, step = energy_20vs30_i[i], 20, 10
        elif isLow[i].round():
            cur_i, low_energy, step = energy_1vs3_i[i], 1, 2
        else:
            cur_i, low_energy, step = energy_6vs10_i[i], 6, 4
        rows.append({
            'id': ids[i],
            # Class indices 2 and 3 map to classification 1.
            'classification_predictions': int(cur_i >= 2),
            # Odd class indices map to the higher energy of the band.
            'regression_predictions': int(cur_i % 2 == 1)*step + low_energy,
            # assumes each gate model emits one value per sample — TODO confirm
            'isHigh': np.ravel(isHigh[i])[0],
            'isLow': np.ravel(isLow[i])[0],
            'energy_1vs3': energy_1vs3[i],
            'energy_6vs10': energy_6vs10[i],
            'energy_20vs30': energy_20vs30[i],
        })

    return pd.DataFrame(rows, columns=[
        'id', 'classification_predictions', 'regression_predictions',
        'isHigh', 'isLow', 'energy_1vs3', 'energy_6vs10', 'energy_20vs30',
    ])

In [5]:
from keras.models import load_model
# Gate models: loaded without any custom objects.
is_high_model, is_low_model = (
    load_model(weights_file)
    for weights_file in ('Joe_model_high_01.h5', 'Joe_model_low_01.h5')
)

# Energy models: each file is loaded with a 'focal_loss' custom object built
# with its own gamma (alpha is 1.0 for all three).
energy_1vs3_model, energy_6vs10_model, energy_20vs30_model = (
    load_model(weights_file,
               custom_objects={'focal_loss': categorical_focal_loss(gamma=focal_gamma, alpha=1.0)})
    for weights_file, focal_gamma in (
        ('Joe_model_Energy_1vs3_01.h5', 5.0),
        ('Joe_model_Energy_6vs10_01.h5', 4.0),
        ('Joe_model_Energy_20vs30_01.h5', 3.0),
    )
)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [7]:
# Peek at the first five ids.
# NOTE(review): no visible cell defines `pub_ids` (execution count 6 is
# missing), so this fails on a fresh kernel — restore the defining cell.
pub_ids[:5]

['0c2b855c9bbdd1513b826ebdbd157ee8afa3faa9',
 '8f862af208004bc8f20501a122305ae9fff7d984',
 '60f8f137f6c65b98e5f0908844c6d60522597cca',
 '7c60c5e2b3be0ed88074bdfa611d66351183facd',
 'ac46be871300a72f0cdeca4c886bad9d0e1f5770']

In [25]:
from scipy.ndimage import gaussian_gradient_magnitude, grey_closing
# N = 1500
# Half-width of the square crop; every cropped image is (2*k, 2*k).
k = 64
# Row/column index the crop is centred on.
# assumes every image extends at least to index center + k on both axes — TODO confirm
center = 288
img_arr_list = []
for f in pub_file_list:
    # Open inside a context manager so the file handle is closed right after
    # the grey+alpha copy has been made, instead of lingering until GC.
    with Image.open(f) as raw_img:
        # Channel 0 of the 'LA' conversion is the luminance band.
        img = img2arr(raw_img.convert('LA'))[:,:,0]
    img = img[(center-k):(center+k), (center-k):(center+k)].astype(float)
    # Subtract the per-column median.
    img -= np.median(img, axis=0)
    # Gaussian gradient magnitude (sigma 5) followed by a grey closing (size 9).
    img = grey_closing(gaussian_gradient_magnitude(img,5), 9)
    img_arr_list.append(img)
# Add a trailing channel axis; the spatial size follows k instead of a
# hard-coded 128 so changing k cannot break the reshape.
pub_img_array = np.array(img_arr_list).reshape(len(img_arr_list), 2*k, 2*k, 1)

In [10]:
# Number of image files to process.
# NOTE(review): `pub_file_list` is not defined in any visible cell.
len(pub_file_list)

1502

In [26]:
# The third argument is predict()'s isTrain flag; a truthy value makes it
# zero class indices 1 and 2 of every energy model before taking the argmax.
pub_result = predict(pub_img_array, pub_ids, 1)

In [195]:
# Sanity check: one prediction row per input image is expected.
len(pub_result)

1502

In [None]:
# Cache the predictions in the working directory so the models do not have
# to be re-run.
pub_result.to_pickle(os.getcwd()+'/pub_result_01.pkl')

In [8]:
# Reload the cached predictions from the working directory.
# Unpickling can execute arbitrary code — only load files you produced yourself.
pub_result = pd.read_pickle(os.getcwd()+'/pub_result_01.pkl')

In [9]:
# Candidate (Type, Energy) pairs to choose between; 'ER' corresponds to
# classification 1 and 'NR' to classification 0.
in_sample = {'ER': [3,10,30], 'NR': [1,6,20]}


def _joint_prob(r, is_er, energy):
    """Return gate probability times class score of row ``r`` for one (type, energy) pair."""
    # Class index layout of the energy vectors: +2 for type 1 (ER), +1 for
    # the higher energy of a band (3, 10 or 30).
    j = int(is_er == 1)*2 + int(energy in [3,10,30])
    if energy >= 20:
        return r.energy_20vs30[j]*r.isHigh
    if energy <= 3:
        return r.energy_1vs3[j]*r.isLow
    # Middle band: neither the high gate nor the low gate fired.
    return r.energy_6vs10[j]*(1-r.isHigh)*(1-r.isLow)


for i in range(len(pub_result)):
    r = pub_result.iloc[i]
    # iloc above is positional while loc below is label-based; resolve the
    # label explicitly so the update hits this row for any index.
    row_label = pub_result.index[i]
    # Score of the prediction currently stored in the row. It is computed
    # directly for every row, so it can never be a stale value carried over
    # from the previous row (or undefined on the first one).
    pred_prob = _joint_prob(r, r.classification_predictions, r.regression_predictions)
    pred_type, pred_energy = [None, None]
    max_prob = -1
    for Type in in_sample:
        for Energy in in_sample[Type]:
            cur_prob = _joint_prob(r, Type=='ER', Energy)
            # Strict '>' keeps the first candidate on ties.
            if cur_prob > max_prob:
                max_prob = cur_prob
                pred_type, pred_energy = [int(Type=='ER'), Energy]
    # Replace the stored prediction only when a candidate scores strictly higher.
    if max_prob > pred_prob:
        pub_result.loc[row_label,'classification_predictions'] = pred_type
        pub_result.loc[row_label,'regression_predictions'] = pred_energy
        print(i,pred_prob,max_prob)
        print(pred_type,pred_energy)

384 0.22841042764130434 0.42335057
1 3
921 0.24723772474743233 0.4501696
1 3
1327 0.350288 0.46252377501900516
0 6


In [10]:
from scipy.special import softmax

In [11]:
# Rows whose predicted energy is 20 or above (the high band).
pub_high = pub_result[pub_result.regression_predictions>=20].copy()
# Softmax over entries 0 and 3 of the four-way score vector; [1] keeps the
# share assigned to entry 3.
pub_high['prob'] = [softmax(x[[0,3]])[1] for x in pub_high.energy_20vs30]
# Sort ascending and renumber 0..n-1 so the label slices below act on rank.
pub_high = pub_high.sort_values('prob').reset_index(drop=True)
# i = number of rows whose prob rounds to 1, j = the remaining rows; the two
# are then swapped if necessary so that i <= j.
i = sum(pub_high.prob.round())
j = len(pub_high) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
# NOTE(review): .loc slices by label and includes both endpoints, so this
# zeroes ranks 0 through i+1 (i+2 rows, not i) — confirm the +1 is intended.
pub_high.loc[:i+1,'prob'] = 0
# Ranks j and above are forced to 1. If this overlaps the slice above, the
# later assignment (1) wins for the shared rows.
pub_high.loc[j:,'prob'] = 1
# Rows that kept a fractional prob.
print([x for x in pub_high.prob if x>0 and x<1])
# Both output columns are derived from prob: 0 -> (0, 20), 1 -> (1, 30),
# fractional values land in between.
pub_high['classification_predictions'] = pub_high.prob
pub_high['regression_predictions'] = pub_high.prob*10+20

247 252
[0.3451462388038635, 0.3968973755836487, 0.4234057664871216]


In [12]:
from scipy.special import softmax
# Rows whose predicted energy is 3 or below (the low band).
pub_low = pub_result[pub_result.regression_predictions<=3].copy()
# Softmax over entries 0 and 3 of the four-way score vector; [1] keeps the
# share assigned to entry 3.
pub_low['prob'] = [softmax(x[[0,3]])[1] for x in pub_low.energy_1vs3]
# Sort ascending and renumber 0..n-1 so the label slices below act on rank.
pub_low = pub_low.sort_values('prob').reset_index(drop=True)
# i = number of rows whose prob rounds to 1, j = the remaining rows; the two
# are then swapped if necessary so that i <= j.
i = sum(pub_low.prob.round())
j = len(pub_low) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
# NOTE(review): .loc slices by label and includes both endpoints, so this
# zeroes ranks 0 through i+1 (i+2 rows, not i) — confirm the +1 is intended.
pub_low.loc[:i+1,'prob'] = 0
# Ranks j and above are forced to 1. If this overlaps the slice above, the
# later assignment (1) wins for the shared rows.
pub_low.loc[j:,'prob'] = 1
# Rows that kept a fractional prob.
print([x for x in pub_low.prob if x>0 and x<1])
# Both output columns are derived from prob: 0 -> (0, 1), 1 -> (1, 3),
# fractional values land in between.
pub_low['classification_predictions'] = pub_low.prob
pub_low['regression_predictions'] = pub_low.prob*2+1

250 252
[]


In [14]:
# Rows whose predicted energy lies between 6 and 10 inclusive (the middle band).
pub_mid = pub_result[(pub_result.regression_predictions >= 6)&(pub_result.regression_predictions <= 10)].copy()
# Softmax over entries 0 and 3 of the four-way score vector; [1] keeps the
# share assigned to entry 3.
pub_mid['prob'] = [softmax(x[[0,3]])[1] for x in pub_mid.energy_6vs10]
# Sort ascending and renumber 0..n-1 so the label slices below act on rank.
pub_mid = pub_mid.sort_values('prob').reset_index(drop=True)
# i = number of rows whose prob rounds to 1, j = the remaining rows; the two
# are then swapped if necessary so that i <= j.
i = sum(pub_mid.prob.round())
j = len(pub_mid) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
# NOTE(review): .loc slices by label and includes both endpoints, so this
# zeroes ranks 0 through i+1 (i+2 rows, not i) — confirm the +1 is intended.
pub_mid.loc[:i+1,'prob'] = 0
# Ranks j and above are forced to 1. If this overlaps the slice above, the
# later assignment (1) wins for the shared rows.
pub_mid.loc[j:,'prob'] = 1
# Rows that kept a fractional prob.
print([x for x in pub_mid.prob if x>0 and x<1])
# Both output columns are derived from prob: 0 -> (0, 6), 1 -> (1, 10),
# fractional values land in between.
pub_mid['classification_predictions'] = pub_mid.prob
pub_mid['regression_predictions'] = pub_mid.prob*4+6

250 251
[]


In [19]:
# Stack the three energy bands into one frame (low, then middle, then high).
# NOTE(review): each part was re-indexed from 0 and no ignore_index is passed,
# so index labels repeat in the result — avoid label-based lookups on it.
pub_final_result = pd.concat([pub_low,pub_mid,pub_high])

In [93]:
# Load submit_01.csv from the working directory.
submit = pd.read_csv(os.getcwd()+'/submit_01.csv')

In [98]:
# New predictions first, followed by every row of the loaded submission from
# position 1502 onward (1502 equals the len(pub_result) shown earlier).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
submit = pd.concat([pub_final_result[['id', 'classification_predictions', 'regression_predictions']], submit[1502:]])

In [209]:
# Keep only the three submission columns of the new predictions, then add the
# rows stored in pri_result_01.csv below them.
submit = pub_final_result[['id', 'classification_predictions', 'regression_predictions']]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
submit = pd.concat([submit, pd.read_csv(os.getcwd()+'/pri_result_01.csv')])
submit

Unnamed: 0,id,classification_predictions,regression_predictions
0,3ea64e9c143efc1b3752cb10db509674fe594759,0.0,1.0
1,1e08f1e9a8c611a99034900d7e71254f3e0bb0e9,0.0,1.0
2,5c1a295acf4240f8a73fffd548928104b5a85eba,0.0,1.0
3,35e47bc9b33ddf2efe63d450a14bbf70601293dd,0.0,1.0
4,1464c83d09e74799f639cd1b100cc048941b018d,0.0,1.0
...,...,...,...
15053,c047067242e05bb8ab79ae2d96960a18c46cf320,0.0,30.0
15054,64224acd4f8ee117da53c98273a9209ab90a96f4,0.0,10.0
15055,70acc768f4d51344ab7b2835d2e2fed8d78d869b,0.0,3.0
15056,c96e163debab5406b9c1b72dd98724004aca5fb8,1.0,1.0


In [210]:
# Write the submission to the working directory; index=False leaves the row
# index out of the file.
submit.to_csv(os.getcwd()+'/submit_03.csv', index=False)

In [21]:
# Total of the prob column over all three bands.
sum(pub_final_result.prob)

748.1654493808746

In [32]:
# Load pri_result_01.pkl from the working directory for inspection.
# Unpickling can execute arbitrary code — only load files you produced yourself.
test = pd.read_pickle(os.getcwd()+'/pri_result_01.pkl')

In [33]:
# Show the row(s) stored for one specific id.
test[test.id=='f2177b2e30ca1139633c528b5b019cdc0e75876f']

Unnamed: 0,id,classification_predictions,regression_predictions,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30
9976,f2177b2e30ca1139633c528b5b019cdc0e75876f,1,1,0.000499,0.96393,"[0.0, 0.24020775, 0.28247565, 0.0]","[0.0, 0.00046554767, 9.797556e-05, 0.0]","[0.0, 0.14355287, 0.14812815, 0.0]"


In [27]:
# Load pri_result_04.csv from the working directory.
pri_final_result = pd.read_csv(os.getcwd()+'/pri_result_04.csv')

In [35]:
# Show the rows that have no classification prediction. isna() states the
# intent directly instead of relying on NaN comparing unequal to itself.
pri_final_result[pri_final_result.classification_predictions.isna()]

Unnamed: 0,level_0,index,id,classification_predictions,regression_predictions,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30,prob
2794,11976,976,f2177b2e30ca1139633c528b5b019cdc0e75876f,,,0.000499,0.96393,[0. 0.24020775 0.28247565 0. ],[0.0000000e+00 4.6554767e-04 9.7975557e-05 0.0...,[0. 0.14355287 0.14812815 0. ],
8403,1121,121,05e894a3bab0d9af731a41f63bfbef3383fb6e99,,,0.0,0.0,[0.0000000e+00 2.5836995e-38 0.0000000e+00 0.0...,[0.0000000e+00 1.0961283e-09 3.5154574e-07 0.0...,[0. 0.13178162 0.00021578 0. ],
13012,15034,34,466153d245485913af9445471bb0f41475381a29,,,0.999924,0.0,[0.0000000e+00 1.7359059e-14 1.8519312e-17 0.0...,[0. 0.00304862 0.0012697 0. ],[0.0000000e+00 9.0636146e-05 9.1687747e-05 0.0...,


In [24]:
# Display pub_result_04.csv from the working directory (the frame is not
# bound to a name).
pd.read_csv(os.getcwd()+'/pub_result_04.csv')

Unnamed: 0,id,classification_predictions,regression_predictions,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30,prob
0,3ea64e9c143efc1b3752cb10db509674fe594759,0.0,1.0,0.006383,0.905503,[9.9090487e-01 0.0000000e+00 0.0000000e+00 5.1...,[9.999449e-01 0.000000e+00 0.000000e+00 7.5118...,[0.5652979 0. 0. 0.18367599],0.0
1,1e08f1e9a8c611a99034900d7e71254f3e0bb0e9,0.0,1.0,0.006621,0.832935,[9.8904479e-01 0.0000000e+00 0.0000000e+00 1.0...,[9.9991977e-01 0.0000000e+00 0.0000000e+00 1.5...,[0.563523 0. 0. 0.1799919],0.0
2,5c1a295acf4240f8a73fffd548928104b5a85eba,0.0,1.0,0.003213,0.903663,[9.8856539e-01 0.0000000e+00 0.0000000e+00 1.3...,[9.9991012e-01 0.0000000e+00 0.0000000e+00 1.8...,[0.5740864 0. 0. 0.17449906],0.0
3,35e47bc9b33ddf2efe63d450a14bbf70601293dd,0.0,1.0,0.003466,0.890815,[9.8788536e-01 0.0000000e+00 0.0000000e+00 9.7...,[9.9991429e-01 0.0000000e+00 0.0000000e+00 1.5...,[0.57575536 0. 0. 0.17426303],0.0
4,1464c83d09e74799f639cd1b100cc048941b018d,0.0,1.0,0.004539,0.903993,[9.867944e-01 0.000000e+00 0.000000e+00 3.5434...,[9.999455e-01 0.000000e+00 0.000000e+00 7.2493...,[0.55414695 0. 0. 0.19072166],0.0
...,...,...,...,...,...,...,...,...,...
1497,f0659296f230c9423909a7c1972f91bb098978be,1.0,30.0,1.000000,0.000000,[1. 0. 0. 0.],[1.000000e+00 0.000000e+00 0.000000e+00 4.4732...,[6.4160101e-12 0.0000000e+00 0.0000000e+00 9.9...,1.0
1498,4d6b503630ccde9c93e951cd76b28ee4b185b6b9,1.0,30.0,1.000000,0.000000,[1.00000e+00 0.00000e+00 0.00000e+00 8.31186e-30],[1.0000000e+00 0.0000000e+00 0.0000000e+00 3.8...,[2.9164843e-12 0.0000000e+00 0.0000000e+00 1.0...,1.0
1499,603e53ad0bd1f9a664de64a032cff45c68efd16a,1.0,30.0,1.000000,0.000000,[1.0000000e+00 0.0000000e+00 0.0000000e+00 2.2...,[1.0000000e+00 0.0000000e+00 0.0000000e+00 2.5...,[4.193762e-10 0.000000e+00 0.000000e+00 1.0000...,1.0
1500,9325382a5241de960ed1872f1b6d9ab60f6f9236,1.0,30.0,1.000000,0.000000,[1.000000e+00 0.000000e+00 0.000000e+00 9.5774...,[1.000000e+00 0.000000e+00 0.000000e+00 8.0739...,[6.678762e-09 0.000000e+00 0.000000e+00 1.0000...,1.0
