In [1]:
import pandas as pd
import numpy as np
np.random.seed(123)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model as KerasModel
from tensorflow.keras.layers import Input, Dense, Activation, Reshape
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import mean_squared_error
from math import sqrt

import pickle

In [2]:
# Load the (categorical, numeric) column-name lists produced by an earlier step.
# The context manager closes the file handle as soon as the load finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with open('col_types.pickle', 'rb') as f:
    (cat_cols, num_cols) = pickle.load(f)

In [3]:
# Total number of feature columns across both column-type lists.
sum(len(cols) for cols in (cat_cols, num_cols))

40

In [4]:
# Load the feature frame and the two target vectors written by an earlier step.
# The context manager closes the file handle as soon as the load finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with open('feature_train_data_v2.pickle', 'rb') as f:
    (X, y1, y2) = pickle.load(f)

In [5]:
# The trailing rows are held out for validation (use tr4 as validation set);
# everything before them is training data.
val_rows = 366202
train_size = len(X) - val_rows
train_size

1051373

In [6]:
# Positional split: the first `train_size` rows train, the remainder validate.
# Only y1 is used as the target here.
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y1[:train_size], y1[train_size:]

In [7]:
def sample(X, y, n):
    """Draw ``n`` random rows (with replacement) from ``X`` and ``y``.

    Row positions are drawn uniformly from ``[0, X.shape[0])`` using NumPy's
    global random state, so the same position may be picked more than once.

    Args:
        X: object exposing ``.shape`` and ``.iloc`` (e.g. a DataFrame).
        y: targets, indexed directly with the drawn integer array.
        n: number of rows to draw.

    Returns:
        Tuple ``(X_rows, y_rows)`` selected at the same drawn positions.
    """
    row_count = X.shape[0]
    picked = np.random.randint(low=0, high=row_count, size=n)
    X_rows = X.iloc[picked, :]
    y_rows = y[picked]
    return X_rows, y_rows

In [8]:
#X_train, y_train = sample(X_train, y_train, 200000)  # Simulate data sparsity
#print("Number of samples used for training: " + str(y_train.shape[0]))

In [9]:
class Model(object):
    """Base class for predictors scored by RMSE on a validation set.

    Subclasses must implement ``guess(features)`` and return one prediction
    per row of ``features``.
    """

    def guess(self, features):
        """Return predictions for ``features``; must be overridden by subclasses."""
        raise NotImplementedError("subclasses must implement guess()")

    def evaluate(self, X_val, y_val):
        """Return the root-mean-squared error of ``guess(X_val)`` against ``y_val``.

        Negative targets are treated as 0 for scoring. The clipping is done on
        a copy, so the caller's ``y_val`` (and any array it is a view of) is
        left unmodified.

        Args:
            X_val: features passed unchanged to ``self.guess``.
            y_val: array-like of true target values.

        Returns:
            float: square root of the mean squared error.
        """
        # np.maximum allocates a new array instead of writing into y_val.
        y_true = np.maximum(np.asarray(y_val), 0)
        guessed_sales = self.guess(X_val)
        # Documented argument order is (y_true, y_pred); MSE is symmetric,
        # so the value is the same either way.
        result = sqrt(mean_squared_error(y_true, guessed_sales))
        return result

In [10]:
def split_features(X):
    """Split a frame into a list of per-column value arrays.

    Args:
        X: object exposing ``.shape`` and ``.iloc`` (e.g. a DataFrame).

    Returns:
        list: ``X.iloc[:, i].values`` for each column position ``i``, in
        column order.
    """
    return [X.iloc[:, col_pos].values for col_pos in range(X.shape[1])]

In [11]:
# embedding size (cat data)
def emb_size(n_cat):
    """Return the embedding width for a column with ``n_cat`` categories.

    The width is ``round(1.6 * n_cat ** 0.56)``, capped at 60.
    """
    # alternative rule kept for reference: min(50, (n_cat//2)+1)
    suggested = round(1.6 * n_cat ** 0.56)
    return min(suggested, 60)


def embed_shape(ps):
    """Return ``[n_cat, size]`` for the category codes in ``ps``.

    ``n_cat`` is the largest value in ``ps`` plus one, and ``size`` is
    ``emb_size(n_cat)``.
    """
    cardinality = max(ps) + 1
    return [cardinality, emb_size(cardinality)]

In [12]:
#drop_cols = ['fullVisitorId', 'networkDomain','gclId']
#cat_cols = [x for x in cat_cols if x not in drop_cols]
#X_train.drop(drop_cols, axis=1,inplace=True)
#X_val.drop(drop_cols, axis=1,inplace=True)

In [17]:
class NN_with_EntityEmbeddings(Model):
    """Keras regressor with one embedding per categorical column.

    Each numeric column gets its own ``Dense(1)`` branch and each categorical
    column its own ``Embedding`` branch. The branches are concatenated and
    passed through two ReLU layers (1000 and 500 units) to a single sigmoid
    output. ``guess`` multiplies the network output by ``max_log_y``.

    Constructing an instance builds the network, loads weights from
    ``best_model_weights.hdf5`` and prints the validation score.
    """

    def __init__(self, X_train, y_train, X_val, y_val, cat_cols, num_cols):
        """Build the network for the given data, then call ``fit``.

        Args:
            X_train: training features; its ``.columns`` define the model inputs.
            y_train: training targets.
            X_val: validation features with the same columns as ``X_train``.
            y_val: validation targets.
            cat_cols: names of columns fed through an embedding.
            num_cols: names of columns fed through a ``Dense(1)`` branch.
        """
        super().__init__()
        self.epochs = 3
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        #self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        # Largest target across train and validation; used to rescale between
        # target units and the network's output.
        self.max_log_y = max(np.max(y_train), np.max(y_val))
        self.__build_keras_model(X_train, X_val)
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Convert a feature frame into the list of per-column arrays the model takes."""
        X_list = split_features(X)
        return X_list

    def __build_keras_model(self, X_train, X_val):
        """Assemble and compile the network, one input per recognised column.

        The frames and column lists come from the constructor arguments rather
        than from notebook globals, so the architecture always matches the data
        the instance was created with. Columns in neither ``num_cols`` nor
        ``cat_cols`` get no input.

        Args:
            X_train: training features; iterated in column order.
            X_val: validation features; used with ``X_train`` to size embeddings.
        """
        input_model = []
        output_embeddings = []
        for col in X_train.columns:
            if col in self.num_cols:
                input_num = Input(shape=(1,), dtype='float32')
                output_num = Dense(1)(input_num)

                input_model.append(input_num)
                output_embeddings.append(output_num)

            elif col in self.cat_cols:
                input_cat = Input(shape=(1,), dtype='float32')
                # Size the embedding from train and validation together so
                # codes that appear in either set fall inside the table.
                x_all = pd.concat([X_train[col], X_val[col]])
                ncat, entities = embed_shape(x_all)
                print(col, ncat, entities)
                output_cat = Embedding(ncat, entities, name=col)(input_cat)
                # Reshape to a flat vector of width `entities` per row so it
                # can be concatenated with the other branches.
                output_cat = Reshape(target_shape=(entities,))(output_cat)

                input_model.append(input_cat)
                output_embeddings.append(output_cat)

        output_model = Concatenate()(output_embeddings)
        # Layer 1
        output_model = Dense(1000, kernel_initializer='uniform')(output_model)
        output_model = Activation('relu')(output_model)
        # Layer 2
        output_model = Dense(500, kernel_initializer='uniform')(output_model)
        output_model = Activation('relu')(output_model)
        # Output layer: sigmoid, rescaled to target units in guess().
        output_model = Dense(1)(output_model)
        output_model = Activation('sigmoid')(output_model)

        self.model = KerasModel(inputs=input_model, outputs=output_model)

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        """Scale target values for training by dividing by ``max_log_y``."""
        val = val / self.max_log_y
        return val

    def _val_for_pred(self, val):
        """Scale network outputs back to target units by multiplying by ``max_log_y``."""
        return val * self.max_log_y

    def fit(self, X_train, y_train, X_val, y_val):
        """Load saved weights and print the validation score.

        Training is currently disabled: the original ``model.fit`` call is kept
        below as a comment, and weights are read from
        ``best_model_weights.hdf5`` instead. ``X_train`` and ``y_train`` are
        therefore unused here.
        """
        #self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train),
        #               validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)),
        #               epochs=self.epochs, batch_size=64,
        #               callbacks=[self.checkpointer],
        #               )
        self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        """Predict targets for ``features``, returned as a flat array in target units."""
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [18]:
# Bind the fitted model to a name so later cells can call guess()/evaluate()
# on it without relying on the notebook's Out[...] history.
nn_model = NN_with_EntityEmbeddings(X_train, y_train, X_val, y_val, cat_cols, num_cols)
nn_model

fullVisitorId 1396370 60
networkDomain 42952 60
city 980 60
operatingSystem 26 10
metro 124 24
region 489 51
channelGrouping 8 5
referralPath 3099 60
country 228 33
source 348 42
medium 7 5
keyword 4512 60
browser 135 25
gclId 54282 60
deviceCategory 3 3
continent 6 4
sessionQualityDim 100 21
Tensor("activation_6/Relu:0", shape=(None, 1000), dtype=float32)
Result on validation data:  0.31876424873585285


<__main__.NN_with_EntityEmbeddings at 0x7f32bb488f60>