In [2]:
from __future__ import absolute_import

import pandas as pd
import numpy as np

data = pd.read_csv('data/train.csv', dtype={'shop_id': np.int32, 'item_id': np.int32, 'item_cnt_day':np.int32})

# vocabularies
shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
item_cats = pd.read_csv('data/item_categories.csv')
SHOPS_COUNT = len(shops)
ITEMS_COUNT = len(items)
CATS_COUNT = len(item_cats)

import polyglot
from polyglot.detect import Detector
from polyglot.mapping import Embedding
import string

VOCAB_SIZE = 64
embeddings_ru = Embedding.load("data/ru_embeddings_pkl.tar.bz2")
embeddings_en = Embedding.load("data/en_embeddings_pkl.tar.bz2")
punctuation_table = str.maketrans({key: None for key in string.punctuation+string.digits})

def encoder(entries):
    encoded = []
    for i,entry in enumerate(entries.tolist()):
        entry = entry.translate(punctuation_table)

        temp = []
        for word in entry.split(" "):
            if word.replace(" ", "") in embeddings_en:
                temp.append(embeddings_en[word])
            elif word.replace(" ", "") in embeddings_ru:
                temp.append(embeddings_ru[word]) 
            else:
                temp.append(np.array([0]*64)) 
        temp = np.array(temp).mean(axis=0)
        encoded.append(temp)
    return encoded

shop_vec = encoder(shops.shop_name)
item_vec = encoder(items.item_name)
cat_vec = encoder(item_cats.item_category_name)

shops['shop_vec'] = shop_vec
items['item_vec'] = item_vec
item_cats['cat_vec'] = cat_vec

def preprocessing(dt):
    # add feature month to train data
    dt['month'] = dt.date_block_num % 12
    dt['item_category_id'] = dt.join(items, on='item_id', how='left', lsuffix='item_id').item_category_id
    dt['item_vec'] = dt.join(items, on='item_id', how='left', rsuffix='ref').item_vec
    dt['cat_vec'] = dt.join(item_cats, on='item_category_id', how='left', rsuffix='ref').cat_vec
    dt['shop_vec'] = dt.join(shops, on='shop_id', how='left', rsuffix='ref').shop_vec
    return dt

X = pd.DataFrame(data.groupby(['date_block_num','shop_id', 'item_id'])['item_cnt_day'].sum()).reset_index()
X['item_price'] = pd.DataFrame(
    data.groupby(['date_block_num','shop_id', 'item_id'])['item_price'].mean()).reset_index().item_price
X = preprocessing(X)

In [8]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Embedding, Input, Concatenate, Flatten, BatchNormalization, Activation, Dropout, Lambda
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard,TerminateOnNaN
from keras import optimizers, initializers
from keras.backend import sqrt
from keras.losses import mean_squared_error
from datetime import datetime


# create training inputs and target
x = X[['date_block_num','month',
       'item_vec','cat_vec','shop_vec', 'item_price']].values
inputs = [x[:,i].tolist() for i in range(x.shape[1]-1)]
y = x[:,-1]

# training spec
keras.backend.clear_session()
NUM_EPOCHS = 500
LEARNING_RATE= 0.01
BETA1=0.9
adam = optimizers.Adam(lr=LEARNING_RATE, beta_1=BETA1)

def build_model():
    #  features: 'date_block_num','month','price','item_vec','cat_vec','shop_vec'
    #  input layers
    date = Input(shape=(1,), name='date_input')
    month = Input(shape=(1,), name='month_input', dtype='int32')
    
    item = Input(shape=(64,), name='item_input')
    cat = Input(shape=(64,), name='category_input')
    shop = Input(shape=(64,), name='shop_input')
    
    # embedding layers
    month_emb = Embedding(input_dim=12, output_dim=2, input_length=1, name='month_emb')(month)
    month_flat = Flatten(name='month_flat')(month_emb)
    
    # all inputs concatenation
    inputs = Concatenate(axis=-1, name='inputs_concat')([date, month_flat,
                                                         item, cat, shop])
    
    # dnn layers
    preds = Dense(16, activation='relu', name='dense1')(inputs)
    preds = Dense(4, activation='relu',name='dense2')(preds)

    # output layer
    preds = Dense(1, activation='relu', name='final_out')(preds)

    return Model(inputs=[date, month, item, cat, shop], outputs=preds)
    
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred)+0.00001)

model = build_model()
model.summary()
model.compile(optimizer = adam,loss=rmse, metrics=[rmse])

OUTPUT_DIR = './price_trained_model/'+ 'lr' + str(LEARNING_RATE) + '_' + datetime.now().strftime("%dd%H-%M")
filepath = OUTPUT_DIR +'/' + "weights-improvement-{epoch:02d}-{val_rmse:.6f}.hdf5"

# model = load_model('keras/weights-improvement-02-14.970410.hdf5')
# model.load_weights('trained_model/lr0.001_09d11-32/weights-improvement-22-0.827185.hdf5')

callbacks = [
             TerminateOnNaN(),
             ModelCheckpoint(filepath=filepath, monitor='val_rmse', verbose=1, period=1, save_best_only=True),
             EarlyStopping(patience=2, monitor='val_loss'),
             TensorBoard(log_dir=OUTPUT_DIR, write_images=False, histogram_freq=1, write_grads=True),
#              keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0),
             keras.callbacks.CSVLogger('log.csv', separator=',', append=False)
]

model.fit(inputs, y, batch_size = 2048, epochs=NUM_EPOCHS, callbacks=callbacks, shuffle=True,
          validation_split=0.01)


X_test = pd.read_csv('data/test.csv', dtype={'shop_id': np.int32, 'item_id': np.int32})
X_test['date_block_num'] = 34
X_test['month'] = 11
X_test = preprocessing(X_test)

x_test = X_test[['date_block_num','month',
       'item_vec','cat_vec','shop_vec']].values

inputs_test = [x_test[:,i].tolist() for i in range(x_test.shape[1])]
X_test['item_price'] = model.predict(inputs_test, verbose=1).flatten().tolist()

X_test[['shop_id', 'item_id','item_price']].to_csv('price_table.csv')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
month_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
month_emb (Embedding)           (None, 1, 2)         24          month_input[0][0]                
__________________________________________________________________________________________________
date_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
month_flat (Flatten)            (None, 2)            0           month_emb[0][0]                  
__________________________________________________________________________________________________
item_input

Epoch 18/500

Epoch 00018: val_rmse did not improve from 435.75150




KeyError: "['item_price'] not in index"