In [1]:
from __future__ import absolute_import

import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/train.csv', dtype={'shop_id': np.int32, 'item_id': np.int32, 'item_cnt_day':np.int32})

In [3]:
# vocabularies
shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
item_cats = pd.read_csv('data/item_categories.csv')
SHOPS_COUNT = len(shops)
ITEMS_COUNT = len(items)
CATS_COUNT = len(item_cats)

In [4]:
import polyglot
from polyglot.detect import Detector
from polyglot.mapping import Embedding
import string

VOCAB_SIZE = 64
embeddings_ru = Embedding.load("data/ru_embeddings_pkl.tar.bz2")
embeddings_en = Embedding.load("data/en_embeddings_pkl.tar.bz2")
punctuation_table = str.maketrans({key: None for key in string.punctuation+string.digits})

def encoder(entries):
    encoded = []
    for i,entry in enumerate(entries.tolist()):
        entry = entry.translate(punctuation_table)

        temp = []
        for word in entry.split(" "):
            if word.replace(" ", "") in embeddings_en:
                temp.append(embeddings_en[word])
            elif word.replace(" ", "") in embeddings_ru:
                temp.append(embeddings_ru[word]) 
            else:
                temp.append(np.array([0]*64)) 
        temp = np.array(temp).mean(axis=0)
        encoded.append(temp)
    return encoded

shop_vec = encoder(shops.shop_name)
item_vec = encoder(items.item_name)
cat_vec = encoder(item_cats.item_category_name)

shops['shop_vec'] = shop_vec
items['item_vec'] = item_vec
item_cats['cat_vec'] = cat_vec

In [5]:
def preprocessing(dt):
    # add feature month to train data
    dt['month'] = dt.date_block_num % 12
    dt['item_category_id'] = dt.join(items, on='item_id', how='left', lsuffix='item_id').item_category_id
    dt['item_vec'] = dt.join(items, on='item_id', how='left', rsuffix='ref').item_vec
    dt['cat_vec'] = dt.join(item_cats, on='item_category_id', how='left', rsuffix='ref').cat_vec
    dt['shop_vec'] = dt.join(shops, on='shop_id', how='left', rsuffix='ref').shop_vec
    return dt

In [6]:
X = pd.DataFrame(data.groupby(['date_block_num','shop_id', 'item_id'])['item_cnt_day'].sum()).reset_index()
X['item_price'] = pd.DataFrame(
    data.groupby(['date_block_num','shop_id', 'item_id'])['item_price'].mean()).reset_index().item_price
X = preprocessing(X)

In [7]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Embedding, Input, Concatenate, Flatten, BatchNormalization, Activation, Dropout, Lambda
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard,TerminateOnNaN
from keras import optimizers, initializers
from keras.backend import sqrt
from keras.losses import mean_squared_error
from datetime import datetime


# create training inputs and target
x = X[['date_block_num','month','item_price',
       'item_id', 'item_category_id', 'shop_id',
       'item_vec','cat_vec','shop_vec','item_cnt_day']].values
inputs = [x[:,i].tolist() for i in range(x.shape[1]-1)]
y = x[:,-1]

# training spec
keras.backend.clear_session()
NUM_EPOCHS = 500
LEARNING_RATE= 0.001
BETA1=0.9
adam = optimizers.Adam(lr=LEARNING_RATE, beta_1=BETA1)

def build_model():
    #  features: 'date_block_num','month','price','item_vec','cat_vec','shop_vec'
    #  input layers
    date = Input(shape=(1,), name='date_input')
    month = Input(shape=(1,), name='month_input', dtype='int32')
    price =  Input(shape=(1,), name='price_input')
    
    item_id = Input(shape=(1,), name='item_id_input', dtype='int32')
    cat_id = Input(shape=(1,), name='cat_id_input', dtype='int32')
    shop_id = Input(shape=(1,), name='shop_id_input', dtype='int32')
    
    item = Input(shape=(64,), name='item_input')
    cat = Input(shape=(64,), name='category_input')
    shop = Input(shape=(64,), name='shop_input')
    
    # embedding layers
    month_emb = Embedding(input_dim=12, output_dim=2, input_length=1, name='month_emb')(month)
    month_flat = Flatten(name='month_flat')(month_emb)
    
    item_emb = Embedding(input_dim=ITEMS_COUNT, output_dim=16, input_length=1, name='item_emb')(item_id)
    item_flat = Flatten(name='item_flat')(item_emb)
    
    cat_emb = Embedding(input_dim=CATS_COUNT, output_dim=4, input_length=1, name='cat_emb')(cat_id)
    cat_flat = Flatten(name='cat_flat')(cat_emb)
    
    shop_emb = Embedding(input_dim=SHOPS_COUNT, output_dim=4, input_length=1, name='shop_emb')(shop_id)
    shop_flat = Flatten(name='shop_flat')(shop_emb)
    
    
    # all inputs concatenation
    inputs = Concatenate(axis=-1, name='inputs_concat')([date, month_flat, price, 
                                                         item_flat, cat_flat, shop_flat, 
                                                         item, cat, shop])
    inputs_batch = BatchNormalization(name='inputs_batchnorm')(inputs)
    
    # dnn layers
    preds = Dense(64, activation='relu', name='dense1')(inputs_batch)
    preds = Dense(32, activation='relu',name='dense2')(preds)
    preds = BatchNormalization(name='batchnorm1')(preds)
    preds = Dense(16, activation='relu', name='dense3')(preds)

    # output layer
    preds = Dense(1, activation='relu', name='final_out')(preds)

    return Model(inputs=[date, month, price, 
                         item_id, cat_id, shop_id, 
                         item, cat, shop], outputs=preds)
    
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred)+0.00001)

model = build_model()
model.summary()
model.compile(optimizer = adam,loss=rmse, metrics=[rmse])

OUTPUT_DIR = './trained_model/'+ 'lr' + str(LEARNING_RATE) + '_' + datetime.now().strftime("%dd%H-%M")
filepath = OUTPUT_DIR +'/' + "weights-improvement-{epoch:02d}-{val_rmse:.6f}.hdf5"

# model = load_model('keras/weights-improvement-02-14.970410.hdf5')
# model.load_weights('trained_model/lr0.001_09d11-32/weights-improvement-22-0.827185.hdf5')

callbacks = [
             TerminateOnNaN(),
             ModelCheckpoint(filepath=filepath, monitor='val_rmse', verbose=1, period=1, save_best_only=True),
             EarlyStopping(patience=2, monitor='loss'),
             TensorBoard(log_dir=OUTPUT_DIR, write_images=False, histogram_freq=1, write_grads=True),
#              keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0),
             keras.callbacks.CSVLogger('log.csv', separator=',', append=False)
]

model.fit(inputs, y, batch_size = 2048, epochs=NUM_EPOCHS, callbacks=callbacks, shuffle=True,
          validation_split=0.01)


Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
month_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
item_id_input (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
cat_id_input (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
shop_id_input (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
month_emb 


Epoch 00009: val_rmse did not improve from 0.81523
Epoch 10/500

Epoch 00010: val_rmse improved from 0.81523 to 0.79707, saving model to ./trained_model/lr0.001_12d13-39/weights-improvement-10-0.797068.hdf5
Epoch 11/500

Epoch 00011: val_rmse improved from 0.79707 to 0.79321, saving model to ./trained_model/lr0.001_12d13-39/weights-improvement-11-0.793214.hdf5
Epoch 12/500

Epoch 00012: val_rmse improved from 0.79321 to 0.78092, saving model to ./trained_model/lr0.001_12d13-39/weights-improvement-12-0.780919.hdf5
Epoch 13/500

Epoch 00013: val_rmse did not improve from 0.78092
Epoch 14/500

Epoch 00014: val_rmse improved from 0.78092 to 0.77538, saving model to ./trained_model/lr0.001_12d13-39/weights-improvement-14-0.775377.hdf5
Epoch 15/500

Epoch 00015: val_rmse did not improve from 0.77538
Epoch 16/500

Epoch 00016: val_rmse did not improve from 0.77538
Epoch 17/500

Epoch 00017: val_rmse improved from 0.77538 to 0.75372, saving model to ./trained_model/lr0.001_12d13-39/weights-im


Epoch 00045: val_rmse did not improve from 0.70598
Epoch 46/500

Epoch 00046: val_rmse did not improve from 0.70598
Epoch 47/500

Epoch 00047: val_rmse did not improve from 0.70598
Epoch 48/500

Epoch 00048: val_rmse did not improve from 0.70598
Epoch 49/500

Epoch 00049: val_rmse did not improve from 0.70598
Epoch 50/500

Epoch 00050: val_rmse did not improve from 0.70598
Epoch 51/500

Epoch 00051: val_rmse did not improve from 0.70598
Epoch 52/500

Epoch 00052: val_rmse did not improve from 0.70598
Epoch 53/500

Epoch 00053: val_rmse did not improve from 0.70598
Epoch 54/500

Epoch 00054: val_rmse did not improve from 0.70598
Epoch 55/500

Epoch 00055: val_rmse did not improve from 0.70598
Epoch 56/500

Epoch 00056: val_rmse did not improve from 0.70598
Epoch 57/500

Epoch 00057: val_rmse did not improve from 0.70598
Epoch 58/500

Epoch 00058: val_rmse improved from 0.70598 to 0.70065, saving model to ./trained_model/lr0.001_12d13-39/weights-improvement-58-0.700651.hdf5
Epoch 59/500

<keras.callbacks.History at 0x1fcc5555390>

In [8]:
X_test = pd.read_csv('data/test.csv', dtype={'shop_id': np.int32, 'item_id': np.int32})
X_test['date_block_num'] = 34
X_test['month'] = 11
X_test = preprocessing(X_test)

# find the nearest month
ref_month = pd.merge(X_test,X,
                     how='left',
                     on=['shop_id', 'item_id'], 
                     suffixes=['_test','_train']).groupby(['shop_id', 'item_id'])['date_block_num_train'].max().reset_index().rename(columns={'date_block_num_train':'date_block_num'})

# query price of nearest month
refs = pd.merge(ref_month, X,
                how='left',
                on=['shop_id', 'item_id', 'date_block_num'],)[['shop_id', 'item_id', 'item_price']]

# fill NAs using predictions from price_table
missing = refs.loc[refs.item_price.isnull(), ['shop_id','item_id']]
price_table = pd.read_csv('price_table.csv')
missing_price = pd.merge(missing, price_table, on=('shop_id','item_id')).item_price
refs.loc[refs.item_price.isnull(), 'item_price'] = missing_price
                         
X_test['item_price'] = pd.merge(X_test, refs, how='left', on=['shop_id', 'item_id']).item_price

In [9]:
from keras.models import load_model

x_test = X_test[['date_block_num','month','item_price',
       'item_id', 'item_category_id', 'shop_id',
       'item_vec','cat_vec','shop_vec']].values
inputs_test = [x_test[:,i].tolist() for i in range(x_test.shape[1])]

from keras.losses import mean_squared_error

def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

# model = load_model('trained_model/\lr0.001_11d17-21\weights-improvement-11-0.860732.hdf5', {'rmse':rmse, 'sqrt':sqrt})

y_out = model.predict(inputs_test, verbose=1).flatten().tolist()
y_out = [20 if i>20 else i for i in y_out]

import csv
with open('predictions.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(('ID','item_cnt_month'))
    for i in range(len(y_out)):
        writer.writerow((i, y_out[i]))

