[View in Colaboratory](https://colab.research.google.com/github/Hoiy/kaggle-colab-env/blob/master/template.ipynb)

In [1]:
import pandas as pd
import dotenv
import os

# Read the local `.env` file into the process environment. Later cells read
# os.environ['GCP_BUCKET'] (presumably supplied by this file — confirm).
dotenv.load_dotenv('.env')

True

In [5]:
!gsutil cp gs://{os.environ['GCP_BUCKET']}/train_prep.snappy.parquet ./data
!gsutil cp gs://{os.environ['GCP_BUCKET']}/test_prep.snappy.parquet ./data
!kaggle competitions download -f sample_submission.csv --path ./data

Copying gs://kaggle-195720-avito-demand-prediction/train_prep.snappy.parquet...
\ [1 files][431.3 MiB/431.3 MiB]                                                
Operation completed over 1 objects/431.3 MiB.                                    
Copying gs://kaggle-195720-avito-demand-prediction/test_prep.snappy.parquet...
/ [1 files][152.2 MiB/152.2 MiB]                                                
Operation completed over 1 objects/152.2 MiB.                                    


In [0]:
# Load the pre-processed parquet frames downloaded into ./data, then the
# sample submission that serves as the template for the final predictions.
train, test = (
    pd.read_parquet('./data/train_prep.snappy.parquet'),
    pd.read_parquet('./data/test_prep.snappy.parquet'),
)
submission = pd.read_csv('./data/sample_submission.csv')

In [17]:
# Sanity check: (rows, columns) of the train and test frames.
train.shape, test.shape

((1503424, 18), (508438, 17))

In [18]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm
import keras.backend as K
from keras.metrics import mse
import tensorflow as tf

def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Written directly with TensorFlow ops; the original author noted a bug
    when composing it as ``K.sqrt(mse(y_true, y_pred))``.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values.

    Returns:
        A scalar tensor: sqrt(mean((y_true - y_pred) ** 2)) over all elements.
    """
    residual = tf.subtract(y_true, y_pred)
    mean_squared_error = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared_error)

def build_model(num_categories=None, embedding_dim=8, hidden_units=10):
    """Build and compile the single-feature embedding baseline.

    The network embeds one integer category id (the ``image_top_1`` column),
    flattens the embedding, applies one ReLU hidden layer and ends in a single
    sigmoid unit. It is compiled with the ``rmse`` loss and the Adam optimizer.

    Args:
        num_categories: Number of rows in the embedding table. When None (the
            default) it is the largest ``image_top_1`` id found in either
            ``train`` or ``test``, plus one, so that ids which only occur at
            prediction time still map to a valid embedding row.
        embedding_dim: Width of each embedding vector.
        hidden_units: Number of units in the hidden Dense layer.

    Returns:
        A compiled keras ``Model`` taking a (batch, 1) id input and producing
        a (batch, 1) output.
    """
    if num_categories is None:
        # Cover both frames: sizing from `train` alone would send any larger
        # id present only in `test` past the end of the embedding table.
        largest_id = max(train.image_top_1.max(), test.image_top_1.max())
        # Cast to a plain int so the layer gets an integer size even if the
        # column is stored with a float or numpy dtype.
        num_categories = int(largest_id) + 1

    inp = Input(shape=(1, ))
    emb = Embedding(num_categories, embedding_dim)(inp)
    emb = Flatten()(emb)  # (batch, 1, embedding_dim) -> (batch, embedding_dim)
    hidden = Dense(hidden_units, activation='relu')(emb)
    out = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=inp, outputs=out)
    model.compile(loss=rmse,
                  optimizer='adam')
    return model

# Build the network once just to inspect its layer shapes and parameter
# counts; the training cell below calls build_model() again before fitting.
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 1, 8)              24544     
_________________________________________________________________
flatten_3 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                90        
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 24,645
Trainable params: 24,645
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Training configuration.
BATCH_SIZE = 4096
EPOCHS = 20000  # upper bound only; the early-stopping callback ends the run
FILE_PATH="best.hdf5"

# Stop after 5 epochs without a lower val_loss, and keep only the weights of
# the best epoch on disk so they can be reloaded afterwards.
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
checkpoint = ModelCheckpoint(FILE_PATH, monitor='val_loss', save_best_only=True)
callbacks_list = [checkpoint, early]

# Start from a freshly initialised network.
model = build_model()

# NOTE(review): Keras is documented to take the validation rows from the end
# of the data before shuffling — confirm the row order of `train` carries no
# information that would make the last 20% unrepresentative.
model.fit(
    x=train['image_top_1'],
    y=train['deal_probability'],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    shuffle=True,
    callbacks=callbacks_list,
)

Train on 1202739 samples, validate on 300685 samples
Epoch 1/20000
Epoch 2/20000
Epoch 3/20000
Epoch 4/20000
Epoch 5/20000
Epoch 6/20000
Epoch 7/20000
  69632/1202739 [>.............................] - ETA: 2s - loss: 0.2431

Epoch 8/20000
Epoch 9/20000
Epoch 10/20000
Epoch 11/20000
Epoch 12/20000
Epoch 13/20000

Epoch 14/20000
Epoch 15/20000
Epoch 16/20000
Epoch 17/20000
Epoch 18/20000
Epoch 19/20000

Epoch 20/20000
Epoch 21/20000
Epoch 22/20000
Epoch 23/20000
Epoch 24/20000


<keras.callbacks.History at 0x7fcb51e51630>

In [20]:
from keras.models import load_model
# Restore the best checkpoint written during training. The custom loss has to
# be supplied by name so Keras can rebuild the compiled model.
model = load_model(FILE_PATH, custom_objects=dict(rmse=rmse))

# Score the restored model. With no extra metrics compiled in, `evaluate`
# yields the loss (RMSE) alone.
# NOTE: this runs over the whole `train` frame, including the rows the model
# was fitted on, so it is not a purely held-out score.
metric = model.evaluate(
    x=train['image_top_1'],
    y=train['deal_probability'],
    batch_size=BATCH_SIZE,
)
metric



In [0]:
SUBMISSION_FILE='baseline.csv'
# The embedded double quotes keep the message a single shell argument when it
# is interpolated into the `kaggle competitions submit` command line.
SUBMISSION_MESSAGE='"Baseline %f"'%metric

# Look up the model input for every submission row, in submission order.
# `validate` makes pandas raise if `item_id` is duplicated in `test`; such
# duplicates would add rows and break the positional assignment below.
submission_features = pd.merge(
    submission[['item_id']],
    test[['item_id', 'image_top_1']],
    how='left',
    on='item_id',
    validate='many_to_one',
)['image_top_1']

# A left merge leaves NaN for ids missing from `test` (or with a missing
# feature); fail loudly rather than feed NaN into the embedding lookup.
if submission_features.isnull().any():
    raise ValueError(
        '%d submission rows have no image_top_1 value in test'
        % int(submission_features.isnull().sum())
    )

# The model emits shape (n, 1); flatten to 1-D so the column assignment does
# not depend on pandas accepting a 2-D array.
submission['deal_probability'] = model.predict(
    submission_features,
    batch_size=BATCH_SIZE
).ravel()
submission.to_csv(SUBMISSION_FILE, index=False)

In [22]:
# Upload the CSV written above to the competition. SUBMISSION_MESSAGE already
# carries its own double quotes, so it reaches the CLI as a single -m argument.
!kaggle competitions submit -f {SUBMISSION_FILE} -m {SUBMISSION_MESSAGE}

Using competition: avito-demand-prediction
Successfully submitted to Avito Demand Prediction Challenge