In [86]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

In [87]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [88]:
%load_ext tensorboard
train_data = pd.read_csv("train.csv",index_col = 'id')
test_data = pd.read_csv("test.csv",index_col = 'id')

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [89]:
train_data = train_data.fillna(train_data.mean())
test_data = test_data.fillna(train_data.mean())

In [90]:
Y_train = train_data['loss'].copy()
X_train = train_data.copy().drop('loss', axis = 1)

In [91]:
X_train[X_train.columns] = X_train[X_train.columns].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
test_data[test_data.columns] = test_data[test_data.columns].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [92]:
model = keras.Sequential([
        layers.Input(shape = (100,)),
        #layers.Embedding(400, 8, embeddings_regularizer='l2', input_length=75),
        #layers.Conv1D(32, 3, activation='relu', padding='same', input_shape=(100,1)),
        #layers.Flatten(),
        #layers.Dropout(0.2),
        #layers.LayerNormalization(),
        layers.Dense(units = 64, activation ='sigmoid'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(1, activation = 'relu'),
    ])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 64)                6464      
_________________________________________________________________
batch_normalization_9 (Batch (None, 64)                256       
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 6,785
Trainable params: 6,657
Non-trainable params: 128
_________________________________________________________________


In [93]:
X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 100, stratify = Y_train)

In [94]:
early_stopping = callbacks.EarlyStopping(
    monitor="root_mean_squared_error",
    patience=20,
    min_delta=0.0001,
    restore_best_weights=True,
)

In [95]:
logdir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

In [96]:
#model.compile(loss='mse', optimizer = keras.optimizers.Adam(learning_rate=0.0005),  metrics=[tf.keras.metrics.RootMeanSquaredError()])
#history = model.fit(X_train_split, Y_train_split,
#          batch_size = 256, epochs = 100,
#          validation_data=(X_val_split, Y_val_split),
#          callbacks=[early_stopping,tensorboard_callback],
#           )
#score = model.evaluate(X_val_split, Y_val_split, verbose = 0)
#print('Test loss: {}'.format(score[0]))

kfold = KFold(n_splits=5, shuffle=True)

for train_index, test_index in kfold.split(X_train, Y_train):
    model.compile(loss='mse', optimizer = keras.optimizers.Adam(learning_rate=0.0005), metrics=[tf.keras.metrics.RootMeanSquaredError()])
    history = model.fit(X_train.iloc[train_index], Y_train.iloc[train_index],
          batch_size = 256, epochs = 40,
          validation_data=(X_train.iloc[test_index], Y_train.iloc[test_index]),
          callbacks=[early_stopping,tensorboard_callback],
           )
    score = model.evaluate(X_train.iloc[test_index], Y_train.iloc[test_index], verbose = 0)
    print('Test loss: {}'.format(score))
    print('Test loss sqrt: {}'.format(np.sqrt(score)))





Train on 200000 samples, validate on 50000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




Test loss: [62.35996030883789, 7.8968353]
Test loss sqrt: [7.8968323  2.81013084]




Train on 200000 samples, validate on 50000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




Test loss: [61.5048365246582, 7.8424993]
Test loss sqrt: [7.84250193 2.80044626]




Train on 200000 samples, validate on 50000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




Test loss: [62.79262077148437, 7.924176]
Test loss sqrt: [7.9241795  2.81499134]




Train on 200000 samples, validate on 50000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




Test loss: [61.937984588623046, 7.8700686]
Test loss sqrt: [7.87006891 2.80536425]




Train on 200000 samples, validate on 50000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




Test loss: [61.69443813720703, 7.854576]
Test loss sqrt: [7.85458071 2.80260167]


In [97]:
predicts = model.predict(test_data)





In [98]:
output = pd.DataFrame(predicts, columns = ['loss'])
output['id'] = test_data.index
output.to_csv('submission.csv', index=False)
