In [1]:
import pandas as pd
import numpy as np
import dotenv
from scipy import sparse
import pickle
import os

dotenv.load_dotenv('.env')

True

In [2]:
!mkdir data
!gsutil rsync gs://{os.environ['GCP_BUCKET']}/data data
!kaggle competitions download -f sample_submission.csv --path ./data

Building synchronization state...
Starting synchronization...
Copying gs://kaggle-195720-avito-demand-prediction/data/sample_submission.csv...
Copying gs://kaggle-195720-avito-demand-prediction/data/test.csv.zip...
Copying gs://kaggle-195720-avito-demand-prediction/data/test_description_lsa.snappy.parquet...
Copying gs://kaggle-195720-avito-demand-prediction/data/test_description_tfidf.npz...
| [4 files][355.8 MiB/355.8 MiB]   57.0 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m -o ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://kaggle-195720-avito-demand-prediction/data/test_prep.snappy.parquet...
Copying gs://kaggle-195720-avito-demand-prediction/data/test_title_lsa.snappy.parquet...
Copying gs://kaggle-195720-avito-demand-prediction/data/test_title_tfidf.npz...
Copying gs://kaggl

Copying gs://kaggle-195720-avito-demand-prediction/data/train_title_lsa.snappy.parquet...
Copying gs://kaggle-195720-avito-demand-prediction/data/train_title_tfidf.npz...
| [14 files][  2.3 GiB/  2.3 GiB]   55.8 MiB/s                                  
Operation completed over 14 objects/2.3 GiB.                                     
Using competition: avito-demand-prediction
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
cat_cols = [
    'image_top_1', 
    'region_code', 
    'city_code', 
    'parent_category_name_code', 
    'category_name_code', 
    'user_type_code', 
    'param_1_code', 
    'param_2_code', 
    'param_3_code',
    'user_id_code',
    'item_seq_number_code',
    'activation_date_weekday_code',
    'activation_date_month_code',
    'activation_date_day_code'
]
cont_cols = ['price_std', 'title_length', 'title_space_count', 'description_length', 'description_space_count']
sparse_cols = 


def load_data(t):
    df = pd.read_parquet('./data/%s_prep.snappy.parquet'%t, columns=cat_cols+cont_cols)

    prep = sparse.csr_matrix(df.values)
    
    title_tfidf = sparse.load_npz('./data/%s_title_tfidf.npz'%t)
    description_tfidf = sparse.load_npz('./data/%s_description_tfidf.npz'%t)
    tfidf_features = pickle.load(open('./data/tfidf_feature_names.pl', 'rb'))

    feature_names = df.columns.tolist() + ['title_%s'%f for f in tfidf_features] + ['description_%s'%f for f in tfidf_features]
    features = sparse.hstack([prep, title_tfidf, description_tfidf])

    return features, feature_names

In [4]:
# train = pd.read_parquet('./data/train_prep.snappy.parquet')
# test = pd.read_parquet('./data/test_prep.snappy.parquet')
# submission = pd.read_csv('./data/sample_submission.csv')
X_train, X_cols = load_data('train')
y_train, y_cols = pd.read_parquet('./data/train_prep.snappy.parquet', columns=['deal_probability'])['deal_probability'], ['deal_probability']

In [6]:
emb_size = [
    8,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    8,
    8,
    2,
    2,
    2
]

In [5]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation, Reshape, Add, Average
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate, Dot
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2, l1
from keras.constraints import non_neg, unit_norm
import keras.backend as K
from keras.metrics import mse
import tensorflow as tf
from keras.wrappers.scikit_learn import KerasRegressor

DROPOUT = 0.2
REGULARIZATION = 1e-5
EMB_SIZE = 1

def rmse(y_true, y_pred):
    # bug when K.sqrt(mse(y_true, y_pred))
    return tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_true, y_pred))))
  
def deal_cat_pred(deal_cat, hidden, zeros, ones):
  if deal_cat == 0:
      return zeros
  elif deal_cat == 1:
    output = Dense(8, activation="relu")(hidden)
    output = Dense(8, activation="relu")(output)
    return Dense(1, activation="sigmoid")(output)
  elif deal_cat == 2:
    output = Dense(8, activation="relu")(hidden)
    output = Dense(8, activation="relu")(output)
    return Dense(1, activation="sigmoid")(output)
  else:
    return ones
  

def build_model():
#     zeros = Input(shape=(1, ))
#     ones = Input(shape=(1, ))
    cat_inputs = []
    cat_embs = []
    for cat, s in zip(cat_cols, emb_size):
      inp = Input(shape=(1,), name=cat)
      emb = Embedding(train[cat].max()+1, s, embeddings_regularizer=l2(REGULARIZATION))(inp)
      emb = Flatten()(emb)
      emb = Activation('tanh')(emb)
      
      cat_inputs.append(inp)
      cat_embs.append(emb)
    
    
    cont_inputs = []
    cont_embs = []
    for col in cont_cols:
      inp = Input(shape=(1,), name=col)
      emb = Dense(4,activation='tanh')(inp)
      
      cont_inputs.append(inp)
      cont_embs.append(emb)
      
    
    emb = Concatenate()(cat_embs + cont_embs)
    
    vec = Reshape((-1, 1))(emb)    
    outter = Flatten()(Dot(-1)([vec, vec]))
    # outter = Flatten()(Dot(-1)([vec, outter]))
    
    emb = Concatenate()([emb, outter])
    
#     preds = []
    
#     for i in range(10):
#       pred = Dropout(DROPOUT)(emb)
#       pred = Dense(1, activation='sigmoid')(pred)
#       preds.append(pred)
      
#     out = Average()(preds)

    emb = Dropout(DROPOUT)(emb)
    emb = Dense(64, activation='selu')(emb)
    emb = Dropout(DROPOUT)(emb)
    out = Dense(1, activation='sigmoid')(emb)
    
    model = Model(inputs=cat_inputs + cont_inputs, outputs=out)
    model.compile(loss=rmse,
                  optimizer='adam',
#                   optimizer='sgd',
#                   optimizer='adadelta',
                  metrics=[rmse])
    
    return model

model = build_model()
model.summary()

Using TensorFlow backend.


NameError: ignored

In [0]:
val_period = ((train['activation_date'] >= '2017-03-15') & (train['activation_date'] <= '2017-03-16')) |\
  ((train['activation_date'] >= '2017-03-22') & (train['activation_date'] <= '2017-03-23'))
train_period = ~val_period

BATCH_SIZE = 4096 * 2
EPOCHS = 20000
FILE_PATH = 'model.p5'

checkpoint = ModelCheckpoint(FILE_PATH, monitor='val_rmse', save_best_only=True)
early = EarlyStopping(monitor="val_loss", mode="min", patience=2)
callbacks_list = [checkpoint, early] #early

In [0]:
# from sklearn.model_selection import KFold, cross_val_score


# k_fold = KFold(n_splits=5, shuffle=True)
# model = KerasRegressor(build_model, batch_size=BATCH_SIZE)
# metrics = cross_val_score(
#     model, 
#     [train[col] for col in cat_cols] + [train['price_std']], 
#     train['deal_probability'], 
#     cv=k_fold,
#     n_jobs=1,
#     fit_params={
#         'validation_split': 0.2,
#         'shuffle': True,
#         'batch_size': BATCH_SIZE,
#         'epochs': EPOCHS,
#         'callbacks': callbacks_list
#     }
# )

In [30]:
# Full training
# train_inp = [train[train_period][col] for col in cat_cols] + [train[train_period][col] for col in cont_cols]
# val_inp = [train[val_period][col] for col in cat_cols] + [train[val_period][col] for col in cont_cols]

BATCH_SIZE = 4096 // 4

# model = KerasRegressor(build_model, batch_size=BATCH_SIZE)
# model.fit(train_inp, train[train_period]['deal_probability'], **{
#         'validation_data': (val_inp, train[val_period]['deal_probability']),
#         'shuffle': True,
#         'batch_size': BATCH_SIZE,
#         'epochs': EPOCHS,
#         'callbacks': callbacks_list
# })
model = KerasRegressor(build_model, batch_size=BATCH_SIZE)
model.fit([train[col] for col in cat_cols] + [train[col] for col in cont_cols], train['deal_probability'], **{
        'validation_split': 0.2,
        'shuffle': True,
        'batch_size': BATCH_SIZE,
        'epochs': EPOCHS,
        'callbacks': callbacks_list
})

Train on 1202739 samples, validate on 300685 samples
Epoch 1/20000
Epoch 2/20000
  57344/1202739 [>.............................] - ETA: 31s - loss: 0.2286 - rmse: 0.2236

Epoch 3/20000

Epoch 4/20000



<keras.callbacks.History at 0x7f10f0cf3f60>

In [31]:
# from keras.models import load_model
# model = load_model(FILE_PATH, custom_objects={'rmse': rmse})

metric = model.score([train[col] for col in cat_cols] + [train[col] for col in cont_cols], train['deal_probability'])



In [32]:
metric

-0.21996117967432383

In [0]:
SUBMISSION_FILE='baseline.csv'
SUBMISSION_MESSAGE='"Baseline %f"'%metric

test['deal_probability'] = model.predict(
    [test[col] for col in cat_cols] + [test[col] for col in cont_cols],
    batch_size=BATCH_SIZE
)
test[['item_id', 'deal_probability']].to_csv(SUBMISSION_FILE, index=False)

In [34]:
len(test['item_id']) == len(submission['item_id'])

True

In [35]:
!kaggle competitions submit -f '{SUBMISSION_FILE}' -m '{SUBMISSION_MESSAGE}'

Using competition: avito-demand-prediction
Successfully submitted to Avito Demand Prediction Challenge

In [50]:
!kaggle competitions submit

usage: kaggle competitions submit [-h] [-c COMPETITION] -f FILE_NAME -m
                                  MESSAGE [-q]
kaggle competitions submit: error: the following arguments are required: -f/--file, -m/--message


In [52]:
!echo "{SUBMISSION_FILE}"

baseline.csv


In [62]:
!kaggle competitions submit -m 'Baseline -0.22988' -f baseline.csv

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 11, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.6/dist-packages/kaggle/cli.py", line 48, in main
    out = args.func(**command_args)
TypeError: competition_submit_cli() got an unexpected keyword argument 'file_name'


In [63]:
!pip freeze | grep kaggle

kaggle==1.3.9


In [65]:
!kaggle --version

Kaggle API 1.3.8
