# Dependencies

In [1]:
import os
import numpy as np
import pandas as pd

import lightgbm as lgb

import keras
from keras import backend as K
from keras.layers import *
from keras.optimizers import *
from keras.models import Model

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

Using TensorFlow backend.


In [2]:
# Input directory and the random seed shared by the K-fold split and LightGBM.
DIR  = "data/"
SEED = 1881

# Seed NumPy's global RNG so steps that draw from it repeat across runs.
# NOTE(review): the Keras/TensorFlow backend may need its own seed for fully
# reproducible embedding training — confirm for the installed versions.
np.random.seed(SEED)

# Output directory for the per-fold LightGBM model files; exist_ok avoids the
# separate "does it exist?" check.
os.makedirs("models", exist_ok=True)

print(os.listdir(DIR))

['submission_format.csv', 'test_values.csv', 'train_labels.csv', 'train_values.csv']


In [3]:
# Read the four competition files from DIR: training features and labels,
# test features, and the submission template.
train_x, train_y, test_x, sub_csv = (
    pd.read_csv(DIR + csv_name)
    for csv_name in (
        "train_values.csv",
        "train_labels.csv",
        "test_values.csv",
        "submission_format.csv",
    )
)

# Geographic Location ID Embedding w/ Autoencoder

In [4]:
# One-hot encode each geographic id level over train rows followed by test
# rows, so both splits share the same set of indicator columns.
geo1, geo2, geo3 = (
    np.array(pd.get_dummies(pd.concat([train_x[geo_col], test_x[geo_col]])))
    for geo_col in ("geo_level_1_id", "geo_level_2_id", "geo_level_3_id")
)

In [5]:
# Shape of the geo_level_3_id one-hot matrix built from the concatenated
# train and test rows: (number of rows, number of indicator columns).
geo3.shape

(347469, 11861)

In [6]:
def NET(embed_dim=16):
    """Build the network used to learn a dense embedding of geo_level_3 ids.

    The one-hot geo_level_3 vector is fed to a Dense layer named
    "intermediate" (no activation is passed to it). Two sigmoid Dense heads on
    top of that layer predict the one-hot geo_level_2 and geo_level_1 vectors.
    Layer sizes are read from the module-level arrays ``geo1``, ``geo2`` and
    ``geo3``.

    Args:
        embed_dim: Number of units in the "intermediate" layer, i.e. the size
            of the embedding. Defaults to 16, the value previously hard-coded.
            Weights saved with one width can only be loaded into a network
            built with the same width.

    Returns:
        A Keras ``Model`` with one input and two outputs, ordered
        [geo_level_2 head, geo_level_1 head], compiled with binary
        cross-entropy loss and the Adam optimizer.
    """
    inp = Input((geo3.shape[1],))
    i1 = Dense(embed_dim, name="intermediate")(inp)
    x2 = Dense(geo2.shape[1], activation='sigmoid')(i1)
    x1 = Dense(geo1.shape[1], activation='sigmoid')(i1)

    model = Model(inp, [x2, x1])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

In [7]:
# Fit the geo network: the level-3 one-hot matrix is the input, and the
# level-2 and level-1 one-hot matrices are the two targets. The fitted model
# is then written to disk.
model = NET()
model.fit(x=geo3, y=[geo2, geo1], batch_size=128, epochs=10, verbose=2)
model.save(filepath="geo_embed.h5")

Epoch 1/10
 - 86s - loss: 0.2054 - dense_1_loss: 0.0531 - dense_2_loss: 0.1524
Epoch 2/10
 - 76s - loss: 0.0542 - dense_1_loss: 0.0052 - dense_2_loss: 0.0491
Epoch 3/10
 - 76s - loss: 0.0149 - dense_1_loss: 0.0042 - dense_2_loss: 0.0107
Epoch 4/10
 - 77s - loss: 0.0067 - dense_1_loss: 0.0035 - dense_2_loss: 0.0032
Epoch 5/10
 - 75s - loss: 0.0043 - dense_1_loss: 0.0030 - dense_2_loss: 0.0013
Epoch 6/10
 - 75s - loss: 0.0030 - dense_1_loss: 0.0024 - dense_2_loss: 6.2550e-04
Epoch 7/10
 - 75s - loss: 0.0020 - dense_1_loss: 0.0016 - dense_2_loss: 3.2531e-04
Epoch 8/10
 - 76s - loss: 0.0013 - dense_1_loss: 0.0011 - dense_2_loss: 1.7523e-04
Epoch 9/10
 - 75s - loss: 8.3187e-04 - dense_1_loss: 7.3584e-04 - dense_2_loss: 9.6034e-05
Epoch 10/10
 - 77s - loss: 5.6757e-04 - dense_1_loss: 5.1422e-04 - dense_2_loss: 5.3342e-05


In [7]:
# Load GEO-Embed Model: rebuild the architecture with NET(), then restore the
# weights stored in the file written by the training cell.
model = NET()
model.load_weights(filepath="geo_embed.h5")

Instructions for updating:
Colocations handled automatically by placer.


In [8]:
# "Extract Intermediate Layer" Function
# Backend function mapping a batch of geo_level_3 one-hot rows to the output
# of the layer named "intermediate". The layer is looked up by name rather
# than by position in model.layers, so this keeps working if NET() gains or
# reorders layers. `K` is imported with the other dependencies in the first cell.
get_int_layer_output = K.function([model.input],
                                  [model.get_layer("intermediate").output])

In [9]:
# Extract GEO-Embeds for all train data points.
# Then assign with train_data

# geo3 stacks the train rows first and the test rows after them, so the first
# len(train_x) rows belong to the training set.
geo3_train = geo3[:len(train_x)]
embed_batch = 1024  # rows sent through the embedding layer per call

# Run the intermediate layer chunk by chunk instead of once per row.
out = np.concatenate([
    get_int_layer_output([geo3_train[start:start + embed_batch]])[0]
    for start in range(0, len(geo3_train), embed_batch)
])

# One-hot encode the categorical columns, drop the raw geo ids, and attach
# one geo_feat<k> column per embedding dimension (k starts at 1).
train_data = (
    pd.get_dummies(train_x)
    .drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
    .assign(**{f"geo_feat{k + 1}": out[:, k] for k in range(out.shape[1])})
)

In [10]:
# Preview the first rows of the training frame, including the geo_feat columns.
train_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,802906,2,30,6,5,1,1,0,0,0,...,-1.498751,-0.427753,1.313747,1.043078,2.243259,-0.485953,2.259708,1.000786,1.877474,0.547761
1,28830,2,10,8,7,0,1,0,0,0,...,0.87637,0.798046,1.537147,1.234688,0.813855,-1.614154,-0.076917,0.12531,1.477581,-1.249146
2,94947,2,10,5,5,0,1,0,0,0,...,-0.147367,1.463318,1.047525,0.996409,-0.10748,-1.332191,2.034211,2.69603,-0.663659,-1.964653
3,590882,2,10,6,5,0,1,0,0,0,...,-0.893961,0.917251,1.217506,0.874881,-0.263543,-1.850625,1.617405,-0.379536,-0.80882,-1.665377
4,201944,3,30,8,9,1,0,0,0,0,...,-0.548861,2.254541,0.001186,1.148989,1.442763,1.077754,0.544689,2.298833,2.032666,-2.213592


In [11]:
# List the training feature columns after one-hot encoding and embedding.
train_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [12]:
# Extract GEO-Embeds for all test data points.
# Then assign with test_data

# geo3 stacks the train rows first and the test rows after them, so everything
# past the first len(train_x) rows belongs to the test set.
geo3_test = geo3[len(train_x):]
embed_batch = 1024  # rows sent through the embedding layer per call

# Run the intermediate layer chunk by chunk instead of once per row.
out = np.concatenate([
    get_int_layer_output([geo3_test[start:start + embed_batch]])[0]
    for start in range(0, len(geo3_test), embed_batch)
])

# One-hot encode the categorical columns, drop the raw geo ids, and attach
# one geo_feat<k> column per embedding dimension (k starts at 1).
# NOTE(review): train and test are one-hot encoded separately, so their column
# sets only match if both files contain the same category values — verify.
test_data = (
    pd.get_dummies(test_x)
    .drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
    .assign(**{f"geo_feat{k + 1}": out[:, k] for k in range(out.shape[1])})
)

In [13]:
# Preview the first rows of the test frame, including the geo_feat columns.
test_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,300051,3,20,7,6,0,1,0,0,0,...,0.674943,1.233606,1.074826,0.948696,1.789101,-0.690246,1.215929,0.641817,1.489536,-1.748435
1,99355,2,25,13,5,0,1,0,0,0,...,-0.633836,0.383985,0.636826,0.662928,1.637369,-0.900778,1.354285,0.221195,0.662908,0.386138
2,890251,2,5,4,5,0,1,0,0,0,...,-0.570019,0.412793,1.512381,-0.30235,0.350699,-1.007432,1.000211,0.475425,-0.184,-0.991295
3,745817,1,0,19,3,0,0,0,0,0,...,-1.826422,2.925866,2.126571,1.424021,2.839799,-1.503001,1.168797,2.822999,0.984964,0.84019
4,421793,3,15,8,7,0,1,0,0,0,...,-0.622701,1.714811,1.518751,0.280875,1.983836,-0.291633,1.130178,0.390867,1.355002,-1.796739


In [14]:
# List the test feature columns after one-hot encoding and embedding.
test_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [15]:
def threshold_arr(array):
    """Turn per-class scores into one-hot rows marking each row's top class.

    Args:
        array: 2-D array-like of shape (n_rows, n_classes) with one score per
            class in every row.

    Returns:
        Float numpy array of the same shape holding 1 at the position of each
        row's maximum (the first position on ties) and 0 elsewhere. An empty
        input yields an empty array.
    """
    scores = np.asarray(array)
    if scores.size == 0:
        # argmax is undefined on empty input; there is nothing to mark.
        return np.zeros(scores.shape)

    one_hot = np.zeros(scores.shape)
    # Set one cell per row: (row index, index of that row's largest score).
    one_hot[np.arange(scores.shape[0]), scores.argmax(axis=1)] = 1
    return one_hot

# LightGBM Training

In [16]:
# Shift the damage grades down by one so class labels start at 0; the
# submission cell adds the 1 back after prediction.
y = np.array(train_y["damage_grade"])-1

df = train_data.drop(["building_id"], axis=1)
x = np.array(df)

# The hyper-parameters are the same for every fold, so build them once.
lgb_params = {
    "objective" : "multiclass",
    "num_class":3,
    "metric" : "multi_error",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 30,
    "learning_rate" : 0.1,
    "feature_fraction" : 0.5,
    "min_sum_hessian_in_leaf" : 0.1,
    "max_bin":8192,
    "verbosity" : 1,
    "num_threads":6,
    "seed": SEED
}

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, val_index) in enumerate(kf.split(x)):
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Deliberately not named `train_data`: that name holds the feature
    # DataFrame read at the top of this cell, and overwriting it would make
    # the cell fail when it is run a second time.
    lgb_train = lgb.Dataset(x_train, label=y_train)
    lgb_val   = lgb.Dataset(x_val, label=y_val)

    lgb_clf = lgb.train(lgb_params,
                        lgb_train,
                        20000,
                        valid_sets = [lgb_val],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000)

    # predict() returns one score per class; the highest-scoring class is the
    # predicted label. Scoring labels directly gives the same micro-F1 as
    # scoring one-hot rows, without depending on every class being present in
    # the fold.
    y_pred = lgb_clf.predict(x_val)
    print("F1-MICRO SCORE: ", f1_score(y_val, y_pred.argmax(axis=1), average='micro'))
    lgb_clf.save_model(f'models/model{ix}.txt')

Training until validation scores don't improve for 3000 rounds.
[1000]	valid_0's multi_error: 0.248326
[2000]	valid_0's multi_error: 0.246714
[3000]	valid_0's multi_error: 0.247712
[4000]	valid_0's multi_error: 0.248077
[5000]	valid_0's multi_error: 0.249535
Early stopping, best iteration is:
[2156]	valid_0's multi_error: 0.246292
F1-MICRO SCORE:  0.7537077185779244
Training until validation scores don't improve for 3000 rounds.
[1000]	valid_0's multi_error: 0.252475
[2000]	valid_0's multi_error: 0.250787
[3000]	valid_0's multi_error: 0.25023
[4000]	valid_0's multi_error: 0.25117
[5000]	valid_0's multi_error: 0.251516
Early stopping, best iteration is:
[2823]	valid_0's multi_error: 0.249501
F1-MICRO SCORE:  0.7504988488104375
Training until validation scores don't improve for 3000 rounds.
[1000]	valid_0's multi_error: 0.250979
[2000]	valid_0's multi_error: 0.249213
[3000]	valid_0's multi_error: 0.249367
[4000]	valid_0's multi_error: 0.250902
[5000]	valid_0's multi_error: 0.252168
Early

# Create Submission File

In [17]:
# Load all LightGBM fold models and collect them for ensembling.
# NOTE: `x` and `y` are the full training set here, and each fold model was
# trained on most of those rows, so these scores are optimistic compared with
# the per-fold validation scores printed during training.
models = []
for i in range(kf.get_n_splits()):
    # Named `booster` so the Keras `model` defined earlier is not overwritten.
    booster = lgb.Booster(model_file=f'models/model{i}.txt')

    y_pred = booster.predict(x)
    score  = f1_score(y, y_pred.argmax(axis=1), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(booster)

F1-MICRO SCORE:  0.8095824651478697
F1-MICRO SCORE:  0.821328390911777
F1-MICRO SCORE:  0.8112324971891894
F1-MICRO SCORE:  0.8103422473436402
F1-MICRO SCORE:  0.8036270006638501


In [18]:
def ensemble(models, x):
    """Combine the K-Fold models by summing their per-class scores.

    Every model in ``models`` predicts on ``x``; the score arrays are added
    element-wise and the summed scores are passed to ``threshold_arr``, whose
    result is returned.
    """
    fold_scores = [fold_model.predict(x) for fold_model in models]

    # Accumulate into the first model's scores, adding the others in order.
    summed = fold_scores[0]
    for scores in fold_scores[1:]:
        summed += scores

    return threshold_arr(summed)

In [19]:
# Build the test feature matrix: every test_data column except building_id.
df = test_data.drop(columns=["building_id"])
x = np.array(df)

In [21]:
# Ensemble the fold models on the test matrix, then turn the winning column
# index (0-based) into a damage grade by adding 1.
y_pred = ensemble(models, x).argmax(axis=1) + 1

In [22]:
# Put the predicted grades into the submission template and write it out
# without the index column.
# NOTE(review): assumes the template rows are in the same order as test_x — confirm.
sub_csv = sub_csv.assign(damage_grade=y_pred)
sub_csv.to_csv("submission.csv", index=False)