In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape, Lambda, Flatten
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD
from keras.constraints import Constraint, unit_norm, non_neg
from keras import backend as K
from numpy import newaxis

Using TensorFlow backend.


In [2]:
# Read the train and test feature tables (in that order) from the local
# data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_features.csv', 'data/test_features.csv')
)

In [3]:
# Categorical columns, grouped by the entity they describe. get_model()
# builds one embedding per column listed here.
SONG_FEAT = ['song_id', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
USER_FEAT = [ 'msno', 'city', 'gender', 'registered_via', 'bd_invalid']
SOURCE_FEAT = ['source_system_tab', 'source_screen_name', 'source_type']

# Order matters: the model's input list follows label_columns.
label_columns = SOURCE_FEAT + SONG_FEAT + USER_FEAT

# Vocabulary size per column: the largest code seen in either split, plus one.
# NOTE(review): assumes every column already holds non-negative integer codes
# -- confirm against the feature-engineering step that wrote the CSVs.
# Series.max() is vectorised (and skips NaN); the builtin max() walks every
# row in Python, which is needlessly slow on multi-million-row frames.
# int() keeps the counts as plain Python ints rather than numpy scalars.
label_count = {
    col: int(max(train[col].max(), test[col].max())) + 1
    for col in label_columns
}

In [4]:
def get_model():
    """Build and compile the user/song interaction network.

    Every column in ``label_columns`` gets its own integer input and a small
    L2-regularised embedding. The song-side embeddings (plus
    ``song_length_norm``) and the user-side embeddings (plus ``bd_norm``) are
    each passed through two SELU dense layers down to ``EMBEDDING_SIZE``
    units. Their dot product, both vectors and the source-context embeddings
    are concatenated and fed to a two-layer MLP with a sigmoid output.

    Reads the module-level ``label_columns``, ``label_count``, ``SONG_FEAT``,
    ``USER_FEAT`` and ``SOURCE_FEAT``.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy, RMSprop, accuracy
        metric). Its inputs are ordered as ``label_columns`` followed by
        ``bd_norm`` and ``song_length_norm``.
    """
    EMBEDDING_SIZE = 4

    def embedding(cnt, size):
        """Return ``(input, flattened_embedding)`` for one categorical column.

        cnt:  vocabulary size passed to the Embedding layer.
        size: width of the embedding vector.
        """
        inp = Input(shape=(1,), dtype='int32')
        emb = Embedding(cnt,
            size,
            input_length=1,
            embeddings_regularizer=l2(1e-3),
            trainable=True)(inp)
        # (batch, 1, size) -> (batch, size) so embeddings can be concatenated
        # with the plain numeric inputs.
        emb = Flatten()(emb)
        return inp, emb

    label_inp = {}
    label_emb = {}

    for label in label_columns:
        # Width scales with cardinality (count // 16), clamped to
        # [1, EMBEDDING_SIZE].
        emb_width = max(1, min(EMBEDDING_SIZE, label_count[label] // 16))
        label_inp[label], label_emb[label] = embedding(label_count[label], emb_width)

    norm_inp = {}
    norm_inp['bd_norm'] = Input(shape=(1,), dtype='float32')
    norm_inp['song_length_norm'] = Input(shape=(1,), dtype='float32')

    # Song tower.
    song_emb = concatenate([label_emb[label] for label in SONG_FEAT] + [norm_inp['song_length_norm']])
    song_emb = Dense(EMBEDDING_SIZE, activation='selu', use_bias=False)(song_emb)
    song_emb = Dense(EMBEDDING_SIZE, activation='selu', use_bias=False)(song_emb)

    # User tower.
    user_emb = concatenate([label_emb[label] for label in USER_FEAT] + [norm_inp['bd_norm']])
    user_emb = Dense(EMBEDDING_SIZE, activation='selu', use_bias=False)(user_emb)
    user_emb = Dense(EMBEDDING_SIZE, activation='selu', use_bias=False)(user_emb)

    # User/song dot product, later concatenated with both vectors and the
    # source-context embeddings.
    preds = dot([user_emb, song_emb], axes=1)

    preds = concatenate([preds, user_emb, song_emb] + [label_emb[label] for label in SOURCE_FEAT])
    preds = Dense(128, activation='selu', use_bias=False)(preds)
    preds = Dense(128, activation='selu', use_bias=False)(preds)
    preds = Dense(1, activation='sigmoid')(preds)

    model = Model(inputs=[label_inp[label] for label in label_columns] + [norm_inp['bd_norm'], norm_inp['song_length_norm']], outputs=preds)

    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-3), metrics=['acc'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()

    return model

In [5]:
import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

class RocMetrics(Callback):
    """Keras callback that prints the validation ROC AUC after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation data and print its ROC AUC.

        epoch: index of the epoch that just finished (unused).
        logs:  metrics dict supplied by Keras (unused). Defaults to None
               rather than a shared mutable ``{}``.
        """
        # NOTE(review): the slicing assumes validation_data is laid out as
        # [*model_inputs, targets, sample_weights] -- confirm for the
        # installed Keras version.
        val_inputs = self.validation_data[:-2]
        val_targets = self.validation_data[-2]

        preds = self.model.predict(val_inputs, batch_size=32768)
        val_auc = roc_auc_score(val_targets, preds)
        print(" - val_roc: %f" %(val_auc))
    
# Build the network and the training callbacks.
model = get_model()
model_path = 'bst_model.h5'
early_stopping = EarlyStopping(patience=5, monitor='val_acc')
model_checkpoint = ModelCheckpoint(
    model_path,
    save_weights_only=True,
    save_best_only=True,
)

# Inputs are passed in the order get_model() declares them: one series per
# categorical column, followed by the two normalised numeric columns.
categorical_inputs = [train[label] for label in label_columns]
numeric_inputs = [train.bd_norm, train.song_length_norm]

# The epoch count is effectively "until early stopping fires".
hist = model.fit(
    categorical_inputs + numeric_inputs,
    train.target,
    batch_size=32768,
    epochs=20000,
    validation_split=0.2,
    shuffle=True,
    callbacks=[early_stopping, model_checkpoint],
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_14 (

Train on 5901934 samples, validate on 1475484 samples
Epoch 1/20000
Epoch 2/20000
Epoch 3/20000
Epoch 4/20000
Epoch 5/20000
Epoch 6/20000
Epoch 7/20000
Epoch 8/20000
Epoch 9/20000
Epoch 10/20000
Epoch 11/20000
Epoch 12/20000
Epoch 13/20000
Epoch 14/20000
Epoch 15/20000
Epoch 16/20000
Epoch 17/20000
Epoch 18/20000
Epoch 19/20000
Epoch 20/20000


In [6]:
########################################
## make the submission
########################################
def _model_inputs(frame):
    """Return the columns of `frame` as a list, in the model's input order."""
    return [frame[label] for label in label_columns] + [frame.bd_norm, frame.song_length_norm]


# Restore the weights written by the checkpoint callback during training.
model.load_weights(model_path)

# Rebuild the hold-out slice the same way Keras' validation_split does: it
# keeps the rows from int(n * (1 - split)) onwards. The previous
# train.tail(int(n * 0.2)) rounds the other way and dropped the first
# validation row (1475483 rows vs. the 1475484 reported by fit), so the AUC
# was scored on a slightly different set than the one used for training.
VALIDATION_SPLIT = 0.2  # must equal the validation_split passed to model.fit
split_at = int(len(train) * (1. - VALIDATION_SPLIT))
val = train.iloc[split_at:]

preds_val = model.predict(_model_inputs(val), batch_size=32768)
val_auc = roc_auc_score(val.target, preds_val)
print(val_auc)

# Score the test set and write the submission; the validation AUC is embedded
# in the file name so runs can be told apart.
preds_test = model.predict(_model_inputs(test), batch_size=32768, verbose=1)
sub = pd.DataFrame({'id': test.id, 'target': preds_test.ravel()})
sub.to_csv('./sub_%.5f.csv'%(val_auc), index=False)

0.662573227892
