In [1]:
# General data manipulation
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import os
from os import listdir
from os.path import isfile, join 
import sys
import time
from PIL import Image # pip install pillow
from keras.utils import np_utils
os.environ['KMP_DUPLICATE_LIB_OK']='True'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import keras.backend as K
import tensorflow as tf

def categorical_focal_loss(gamma=2.0, alpha=0.25):
    """
    Implementation of Focal Loss from the paper in multiclass classification
    Formula:
        loss = -alpha*((1-p)^gamma)*log(p)
    Parameters:
        alpha -- the same as wighting factor in balanced cross entropy
        gamma -- focusing parameter for modulating factor (1-p)
    Default value:
        gamma -- 2.0 as mentioned in the paper
        alpha -- 0.25 as mentioned in the paper
    """
    def focal_loss(y_true, y_pred):
        # Define epsilon so that the backpropagation will not result in NaN
        # for 0 divisor case
        epsilon = K.epsilon()
        # Add the epsilon to prediction value
        #y_pred = y_pred + epsilon
        # Clip the prediction value
        y_pred = K.clip(y_pred, epsilon, 1.0-epsilon)
        # Calculate cross entropy
        cross_entropy = -y_true*K.log(y_pred)
        # Calculate weight that consists of  modulating factor and weighting factor
        weight = alpha * y_true * K.pow((1-y_pred), gamma)
        # Calculate focal loss
        loss = weight * cross_entropy
        # Sum the losses in mini_batch
        loss = K.sum(loss, axis=1)
        return loss
    
    return focal_loss

In [3]:
# convert PIL.Image object to numpy.Array, for training
def img2arr(img):
    return np.asarray(img.getdata(), dtype=np.uint8).reshape(img.height, img.width, -1)

def read_train_pickle(Type, Energy, DIR = os.getcwd() + '/idao_dataset/'):
    return pd.read_pickle(DIR + 'train_pickle/' + Type + str(Energy) + '.pkl')

In [4]:
from keras.models import load_model
def predict(img_array, ids, isTrain):
    isTrain_array = np.array([isTrain]*len(img_array)).reshape((len(img_array), 1))
    isHigh = is_high_model.predict(img_array)
    isLow = is_low_model.predict(img_array)
    energy_1vs3 = energy_1vs3_model.predict([img_array, isTrain_array])
    energy_6vs10 = energy_6vs10_model.predict([img_array, isTrain_array])
    energy_20vs30 = energy_20vs30_model.predict([img_array, isTrain_array])
    if isTrain:
        ignore_i = [1,2]
    else:
        ignore_i = [0,3]
        
    energy_1vs3[:,ignore_i[0]] = 0.0
    energy_1vs3[:,ignore_i[1]] = 0.0
    energy_6vs10[:,ignore_i[0]] = 0.0
    energy_6vs10[:,ignore_i[1]] = 0.0
    energy_20vs30[:,ignore_i[0]] = 0.0
    energy_20vs30[:,ignore_i[1]] = 0.0
    
    energy_1vs3_i = np.argmax(energy_1vs3, axis=-1)
    energy_6vs10_i = np.argmax(energy_6vs10, axis=-1)
    energy_20vs30_i = np.argmax(energy_20vs30, axis=-1)
    
    pred = pd.DataFrame()
    for i in range(len(img_array)):
        if isHigh[i].round():
            cur_i = energy_20vs30_i[i]
            Type,Energy = [int(cur_i >=2), int(cur_i%2 == 1)*10+20]
        elif isLow[i].round():
            cur_i = energy_1vs3_i[i]
            Type,Energy = [int(cur_i >=2), int(cur_i%2 == 1)*2+1]
        else:
            cur_i = energy_6vs10_i[i]
            Type,Energy = [int(cur_i >=2), int(cur_i%2 == 1)*4+6]
        pred = pred.append(pd.DataFrame({'id': ids[i], 'classification_predictions': Type, 'regression_predictions': Energy,
                                         'isHigh': isHigh[i],
                                         'isLow': isLow[i],
                                         'energy_1vs3': [energy_1vs3[i]],
                                         'energy_6vs10': [energy_6vs10[i]],
                                         'energy_20vs30': [energy_20vs30[i]]}, index=[i]))
            
    return pred

In [5]:
from keras.models import load_model
is_high_model = load_model('Joe_model_high_01.h5')
is_low_model = load_model('Joe_model_low_01.h5')
energy_1vs3_model = load_model('Joe_model_Energy_1vs3_01.h5', custom_objects={'focal_loss': categorical_focal_loss(gamma=5.0,alpha=1.0)})
energy_6vs10_model =  load_model('Joe_model_Energy_6vs10_01.h5', custom_objects={'focal_loss': categorical_focal_loss(gamma=4.0,alpha=1.0)})
energy_20vs30_model = load_model('Joe_model_Energy_20vs30_01.h5', custom_objects={'focal_loss': categorical_focal_loss(gamma=3.0,alpha=1.0)})


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [6]:
import os
pri_DIR = os.getcwd() + '/idao_dataset/private_test/'

pri_file_list = os.listdir(pri_DIR)
pri_file_list = [x for x in pri_file_list if x[-3:]=='png']
pri_ids = [x.replace('.png', '') for x in pri_file_list]
pri_file_list = [pri_DIR+x for x in pri_file_list]
pri_file_list[:5]

['/Users/joefu/prog/projects/IDAO_2021/idao_dataset/private_test/f2d1ff8cb985245a7bf00ee938aa5429ba064c60.png',
 '/Users/joefu/prog/projects/IDAO_2021/idao_dataset/private_test/10873b3b1ea75a70aef36bf6f9160b711a9cd804.png',
 '/Users/joefu/prog/projects/IDAO_2021/idao_dataset/private_test/9964b0c21fa922dea5dc6a9779e3002613b2ef6b.png',
 '/Users/joefu/prog/projects/IDAO_2021/idao_dataset/private_test/cf4e41a1db52317979265764b9f384461785fb8a.png',
 '/Users/joefu/prog/projects/IDAO_2021/idao_dataset/private_test/0f4499dd0bee627230bdea6e340a0634f46c03f9.png']

In [7]:
from scipy.ndimage import *
# N = 1500
k = 64
pri_result = pd.DataFrame()

In [8]:
len(pri_file_list)

15058

In [9]:
# for batch in range(len(pri_file_list)//1000+1):
#     i,j = (batch*1000,min((batch+1)*1000, len(pri_file_list)))
for batch in range(len(pri_file_list)//1000+1):
    i,j = (batch*1000,min((batch+1)*1000, len(pri_file_list)))
    print(i,j)
    img_arr_list = []
    for f in pri_file_list[i:j]:
        img = Image.open(f).convert('LA')
        img = img2arr(img)[:,:,0]
        img = img[(288-k):(288+k), (288-k):(288+k)].astype(float)
        img -= np.median(img, axis=0)
        img = grey_closing(gaussian_gradient_magnitude(img,5), 9)
        img_arr_list.append(img)
    pri_img_array = np.array(img_arr_list).reshape(j-i,128,128,1)
    pri_result = pri_result.append(predict(pri_img_array, pri_ids[i:j], 0))

0 1000
1000 2000


KeyboardInterrupt: 

In [82]:
pri_result.to_pickle(os.getcwd()+'/pri_result_01.pkl')

In [10]:
pri_result = pd.read_pickle(os.getcwd()+'/pri_result_01.pkl').reset_index()

In [11]:
from collections import defaultdict
out_sample = {'NR': [3,10,30], 'ER': [1,6,20]}
cnt = defaultdict(int)
for i in range(len(pri_result)):
    r = pri_result.iloc[i]
    pred_type, pred_energy = [None, None]
    max_prob = -1
    for Type in out_sample:
        for Energy in out_sample[Type]:
            j = (Type=='ER')*2 + int(Energy in [3,10,30])
            if Energy >= 20:
                cur_prob = r.energy_20vs30[j]*r.isHigh
            elif Energy <= 3:
                cur_prob = r.energy_1vs3[j]*r.isLow
            else:
                cur_prob = r.energy_6vs10[j]*(1-r.isHigh)*(1-r.isLow)
            if int(Type=='ER') == r.classification_predictions and Energy == r.regression_predictions:
                pred_prob = cur_prob
            if cur_prob > max_prob:
                max_prob = cur_prob
                pred_type, pred_energy = [int(Type=='ER'), Energy]
    if max_prob > pred_prob*10:
        pri_result.loc[i,'classification_predictions'] = pred_type
        pri_result.loc[i,'regression_predictions'] = pred_energy
#         print(i,pred_prob,max_prob)
        cnt[(r.classification_predictions,r.regression_predictions,'to',pred_type,pred_energy)] += 1

In [12]:
cnt

defaultdict(int,
            {(0, 10, 'to', 0, 3): 347,
             (0, 10, 'to', 1, 1): 30,
             (0, 3, 'to', 1, 6): 291,
             (0, 3, 'to', 0, 10): 168,
             (1, 6, 'to', 0, 30): 109,
             (1, 6, 'to', 1, 20): 23,
             (0, 10, 'to', 1, 20): 12})

In [13]:
pri_result.groupby(['classification_predictions','regression_predictions']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,id,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30
classification_predictions,regression_predictions,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3,2229,2229,2229,2229,2229,2229,2229
0,10,3374,3374,3374,3374,3374,3374,3374
0,30,2987,2987,2987,2987,2987,2987,2987
1,1,2794,2794,2794,2794,2794,2794,2794
1,6,1628,1628,1628,1628,1628,1628,1628
1,20,2046,2046,2046,2046,2046,2046,2046


In [14]:
from scipy.special import softmax
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def avg(x):
    return sum(x)/len(x)
def norm_sig(x):
    return sigmoid((x - avg(x))/np.std(x))

In [54]:
pri_low[pri_low.id=='f2177b2e30ca1139633c528b5b019cdc0e75876f']

Unnamed: 0,index,id,classification_predictions,regression_predictions,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30,prob
2794,9976,f2177b2e30ca1139633c528b5b019cdc0e75876f,0.5,2.0,0.000499,0.96393,"[0.0, 0.24020775, 0.28247565, 0.0]","[0.0, 0.00046554767, 9.797556e-05, 0.0]","[0.0, 0.14355287, 0.14812815, 0.0]",0.5


In [56]:
pri_high = pri_result[pri_result.regression_predictions>=20].copy()
pri_high['prob'] = [softmax(x[[1,2]])[1] for x in pri_high.energy_20vs30]
pri_high = pri_high.sort_values('prob').reset_index(drop=True)
i = sum(pri_high.prob.round())
j = len(pri_high) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
pri_high.loc[:i+1,'prob'] = 0
pri_high.loc[j:,'prob'] = 1
pri_high.loc[i+2:j,'prob'] = norm_sig(pri_high.prob[i+2:j])
print(len([x for x in pri_high.prob if x>0 and x<1]))
pri_high.loc[pri_high.prob!=pri_high.prob, 'prob'] = 0.5
pri_high['classification_predictions'] = pri_high.prob
pri_high['regression_predictions'] = 30-pri_high.prob*10

2046 2987
939


In [57]:
pri_low = pri_result[pri_result.regression_predictions<=3].copy()
pri_low['prob'] = [softmax(x[[1,2]])[1] for x in pri_low.energy_1vs3]
pri_low = pri_low.sort_values('prob').reset_index(drop=True)
i = sum(pri_low.prob.round())
j = len(pri_low) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
pri_low.loc[:i+1,'prob'] = 0
pri_low.loc[j:,'prob'] = 1

2229 2794


In [58]:
pri_low.loc[i+2:j,'prob'] = norm_sig(pri_low.prob[i+2:j])
print(len([x for x in pri_low.prob if x>0 and x<1]))

563


In [59]:

pri_low.loc[pri_low.prob!=pri_low.prob, 'prob'] = 0.5
pri_low['classification_predictions'] = pri_low.prob
pri_low['regression_predictions'] = 3-pri_low.prob*2

In [60]:
pri_mid = pri_result[(pri_result.regression_predictions >= 6)&(pri_result.regression_predictions <= 10)].copy()
pri_mid['prob'] = [softmax(x[[1,2]])[1] for x in pri_mid.energy_6vs10]
pri_mid = pri_mid.sort_values('prob').reset_index(drop=True)
i = sum(pri_mid.prob.round())
j = len(pri_mid) - i
i,j = [int(min(i,j)),int(max(i,j))]
print(i,j)
pri_mid.loc[:i+1,'prob'] = 0
pri_mid.loc[j:,'prob'] = 1
pri_mid.loc[i+1:j,'prob'] = norm_sig(pri_mid.prob[i+2:j])
print(len([x for x in pri_mid.prob if x>0 and x<1]))

pri_mid.loc[pri_mid.prob!=pri_mid.prob, 'prob'] = 0.5
pri_mid['classification_predictions'] = pri_mid.prob
pri_mid['regression_predictions'] = 10-pri_mid.prob*4

1622 3380
1756


In [61]:
pri_final_result = pd.concat([pri_low,pri_mid,pri_high])

In [63]:
pri_final_result[pri_final_result.classification_predictions!=pri_final_result.classification_predictions]

Unnamed: 0,index,id,classification_predictions,regression_predictions,isHigh,isLow,energy_1vs3,energy_6vs10,energy_20vs30,prob


In [35]:
print(len(pri_final_result),len(pri_result))

15058 15058


In [267]:
pri_final_result.to_csv(os.getcwd()+'/pri_result_04.csv', index=False)

In [64]:
pub_final_result = pd.read_csv(os.getcwd()+'/pub_result_04.csv')

In [280]:
pri_final_result = pd.read_csv(os.getcwd()+'/pri_result_04.csv')

In [65]:
submit_cols = ['id', 'classification_predictions', 'regression_predictions']
submit = pub_final_result[submit_cols].append(pri_final_result[submit_cols])
submit

Unnamed: 0,id,classification_predictions,regression_predictions
0,3ea64e9c143efc1b3752cb10db509674fe594759,0.0,1.0
1,1e08f1e9a8c611a99034900d7e71254f3e0bb0e9,0.0,1.0
2,5c1a295acf4240f8a73fffd548928104b5a85eba,0.0,1.0
3,35e47bc9b33ddf2efe63d450a14bbf70601293dd,0.0,1.0
4,1464c83d09e74799f639cd1b100cc048941b018d,0.0,1.0
...,...,...,...
5028,6e776ad836b7a7ce9d98e877391ad369a62142af,1.0,20.0
5029,50eebf17e8e2e1c42d0ee5cae04c741f975a8ebb,1.0,20.0
5030,3967ed4b7f04923d445c451078344759c9a76368,1.0,20.0
5031,ba73236fa844c3848fb79f45568751cd4ebd7ca8,1.0,20.0


In [66]:
submit.to_csv(os.getcwd()+'/submit_04.csv', index=False)

In [40]:
pri_final_result.prob.sum()

7512.459705381897

In [41]:
pd.read_csv(os.getcwd()+'/pri_result_04.csv').prob.sum()

7512.459705381897

In [72]:
sorted(submit.classification_predictions.unique())[::]

[0.0,
 0.13281118582834356,
 0.133838883887135,
 0.1359332305520803,
 0.13616345323967202,
 0.1368985951945187,
 0.13703089669467527,
 0.13704980568660083,
 0.1377011377034613,
 0.13825740861128505,
 0.1386722679923827,
 0.13907859856267443,
 0.13979808740788036,
 0.14013732265401577,
 0.1407356632869001,
 0.1408275330258151,
 0.141656618521731,
 0.1418730419048927,
 0.14386036659897938,
 0.14422507381659738,
 0.145373465123695,
 0.1458259795753531,
 0.14622974920479104,
 0.14737377276640717,
 0.1494704633526979,
 0.1505714490230152,
 0.15174854795759096,
 0.1532259966073246,
 0.15344135772975276,
 0.1568971237198713,
 0.15992665930559083,
 0.16324894706320167,
 0.16371633974370742,
 0.16527933162176628,
 0.1654172475465026,
 0.16598643447554617,
 0.1669459760994378,
 0.16773688695216066,
 0.16806917809079328,
 0.16971035123382844,
 0.17052581255068106,
 0.17274510189216408,
 0.17278509197745598,
 0.17393937653015998,
 0.17785627875421237,
 0.1781399395398963,
 0.17814020515956597,
 0.