In [1]:
########################################
## import packages
########################################

import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape, Lambda, Flatten
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD
from keras.constraints import Constraint, unit_norm, non_neg
from keras import backend as K
from numpy import newaxis

Using TensorFlow backend.


In [2]:
########################################
## load the data
########################################

# Training interactions, plus references to the id and label columns.
train = pd.read_csv('./data/train.csv')
uid, sid, target = train['msno'], train['song_id'], train['target']

# Test interactions; id_test is reused later when the submission is written.
test = pd.read_csv('./data/test.csv')
id_test, uid_test, sid_test = test['id'], test['msno'], test['song_id']

In [3]:
def encode_column(col_name):
    """Label-encode one column of the global ``train`` and ``test`` frames.

    Both columns are cast to ``str``, one ``LabelEncoder`` is fitted on the
    combined train + test values so the two frames share a single label
    space, and both columns are then overwritten in place with the
    integer codes.

    Parameters
    ----------
    col_name : str
        Name of a column present in both ``train`` and ``test``.

    Returns
    -------
    int
        Largest code assigned in either frame plus one, i.e. the number
        of rows an embedding table indexed by these codes needs.
    """
    train[col_name] = train[col_name].astype(str)
    test[col_name] = test[col_name].astype(str)
    encoder = LabelEncoder()
    # Series.append was removed in pandas 2.0; pd.concat builds the same
    # combined Series on every pandas version.
    encoder.fit(pd.concat([train[col_name], test[col_name]]))
    train[col_name] = encoder.transform(train[col_name])
    test[col_name] = encoder.transform(test[col_name])
    return int(max(train[col_name].max(), test[col_name].max()) + 1)

In [4]:
# Encode user ids in place; u_cnt sizes the user Embedding in the model builders.
u_cnt = encode_column('msno')

In [5]:
# Encode song ids in place; s_cnt sizes the song Embedding in the model builders.
s_cnt = encode_column('song_id')

In [6]:
# Encode source_system_tab in place; sst_cnt sizes its Embedding in get_model.
sst_cnt = encode_column('source_system_tab')

In [7]:
# Encode source_screen_name in place; ssn_cnt sizes its Embedding in get_model.
ssn_cnt = encode_column('source_screen_name')

In [8]:
# Encode source_type in place; st_cnt sizes its Embedding in get_model.
st_cnt = encode_column('source_type')

In [58]:
def get_model(embedding_size=64, context_embedding_size=2, dropout=0):
    """Build and compile the five-input user/song classifier.

    Each of the five encoded columns (user, song, source_system_tab,
    source_screen_name, source_type) gets its own input, an L2-regularised
    Embedding and two SELU Dense layers. The user and song vectors are
    combined through a dot product, concatenated with all five vectors,
    and passed through two ReLU Dense layers, a Dropout layer and a
    sigmoid output unit.

    Vocabulary sizes come from the module-level ``u_cnt``, ``s_cnt``,
    ``sst_cnt``, ``ssn_cnt`` and ``st_cnt``.

    Parameters
    ----------
    embedding_size : int, optional
        Width of the user and song embeddings and of the two hidden Dense
        layers (default 64).
    context_embedding_size : int, optional
        Width of the three source-context embeddings (default 2).
    dropout : float, optional
        Rate of the Dropout layer before the output unit (default 0).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, RMSprop(lr=1e-3) and the
        ``acc`` metric. Inputs are ordered
        ``[user, song, source_system_tab, source_screen_name, source_type]``.
    """
    def embedding(cnt, size):
        """Return ``(input, vector)`` for one encoded column with ``cnt`` labels."""
        inp = Input(shape=(1,), dtype='int32')
        emb = Embedding(cnt,
            size,
            input_length=1,
            embeddings_regularizer=l2(1e-4),
            trainable=True)(inp)
        # Drop the length-1 sequence axis: (batch, 1, size) -> (batch, size).
        emb = Reshape((size,))(emb)
        emb = Dense(size, activation='selu')(emb)
        emb = Dense(size, activation='selu')(emb)
        return inp, emb

    user_inp, user_emb = embedding(u_cnt, embedding_size)
    song_inp, song_emb = embedding(s_cnt, embedding_size)
    sst_inp, sst_emb = embedding(sst_cnt, context_embedding_size)
    ssn_inp, ssn_emb = embedding(ssn_cnt, context_embedding_size)
    st_inp, st_emb = embedding(st_cnt, context_embedding_size)

    # User/song dot product, fed to the dense head next to the raw vectors.
    preds = dot([user_emb, song_emb], axes=1)
    preds = concatenate([user_emb, song_emb, preds, sst_emb, ssn_emb, st_emb])
    preds = Dense(embedding_size, activation='relu')(preds)
    preds = Dense(embedding_size, activation='relu')(preds)
    preds = Dropout(dropout)(preds)
    preds = Dense(1, activation='sigmoid')(preds)

    model = Model(inputs=[user_inp, song_inp, sst_inp, ssn_inp, st_inp], outputs=preds)

    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-3), metrics=['acc'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    return model

def get_model_2():
    """Build and compile the two-input (user, song) baseline classifier.

    The user and song ids are looked up in 64-wide, L2-regularised
    embedding tables initialised uniformly in [-0.1, 0.1]. Their dot
    product is concatenated with both vectors and passed through a
    128-unit ReLU layer, Dropout(0.5) and a sigmoid output unit.

    Vocabulary sizes come from the module-level ``u_cnt`` and ``s_cnt``.

    Returns
    -------
    keras.models.Model
        Model taking ``[user_id, song_id]``, compiled with binary
        cross-entropy, RMSprop(lr=1e-3) and the ``acc`` metric.
    """
    latent_dim = 64

    def make_table(vocab_size):
        # Each table gets its own initializer and regularizer instances.
        return Embedding(
            vocab_size,
            latent_dim,
            embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
            embeddings_regularizer=l2(1e-4),
            input_length=1,
            trainable=True,
        )

    user_table = make_table(u_cnt)
    song_table = make_table(s_cnt)

    # User branch: id -> embedding -> flat vector.
    uid_input = Input(shape=(1,), dtype='int32')
    user_vec = user_table(uid_input)
    user_vec = Reshape((latent_dim,))(user_vec)

    # Song branch: id -> embedding -> flat vector.
    sid_input = Input(shape=(1,), dtype='int32')
    song_vec = song_table(sid_input)
    song_vec = Reshape((latent_dim,))(song_vec)

    similarity = dot([user_vec, song_vec], axes=1)
    features = concatenate([user_vec, song_vec, similarity])

    hidden = Dense(128, activation='relu')(features)
    hidden = Dropout(0.5)(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[uid_input, sid_input], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(lr=1e-3),
        metrics=['acc'],
    )
    return model

In [59]:
# Fixed seed so the shuffle below, and therefore which rows end up in the
# validation tail, is the same on every run.
SHUFFLE_SEED = 42

model = get_model()
# Stop once validation accuracy has gone 5 epochs without improving.
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
model_path = 'bst_model.h5'
# Only the best weights are kept on disk. No `monitor` is passed, so the
# callback's own default metric decides which epoch counts as "best".
model_checkpoint = ModelCheckpoint(model_path, save_best_only=True,
        save_weights_only=True)

# validation_split holds out the trailing rows of the arrays passed to fit(),
# so the frame is shuffled first; the later evaluation cell relies on this
# row order to recover the same hold-out rows.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
hist = model.fit(
    [train.msno, train.song_id, train.source_system_tab, train.source_screen_name, train.source_type],
    train.target,
    validation_split=0.2,
    epochs=20000,  # effectively unbounded; early_stopping ends the run
    batch_size=32768,
    shuffle=True,
    callbacks=[early_stopping, model_checkpoint]
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_102 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_103 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_102 (Embedding)       (None, 1, 64)        2201792     input_102[0][0]                  
__________________________________________________________________________________________________
embedding_103 (Embedding)       (None, 1, 64)        26869696    input_103[0][0]                  
__________________________________________________________________________________________________
input_104 

Epoch 8/20000
Epoch 9/20000
Epoch 10/20000
Epoch 11/20000
Epoch 12/20000
Epoch 13/20000
Epoch 14/20000
Epoch 15/20000
Epoch 16/20000
Epoch 17/20000
Epoch 18/20000
Epoch 19/20000
Epoch 20/20000
Epoch 21/20000
Epoch 22/20000
Epoch 23/20000
Epoch 24/20000
Epoch 25/20000


In [60]:
########################################
## evaluate on the hold-out rows and make the submission
########################################
# Restore the weights written by the ModelCheckpoint callback.
model.load_weights(model_path)

# Rebuild the hold-out set with the same arithmetic Keras uses for
# validation_split=0.2: rows [int(n * (1 - split)):]. The previous
# tail(int(n * 0.2)) rounds the other way and can come out one row short.
val_start = int(len(train) * (1. - 0.2))
val = train.iloc[val_start:]
preds_val = model.predict([val.msno, val.song_id, val.source_system_tab, val.source_screen_name, val.source_type], batch_size=32768)
val_auc = roc_auc_score(val.target, preds_val)
print(val_auc)

# Score the test set and write the submission; the validation AUC goes into
# the file name.
preds_test = model.predict([test.msno, test.song_id, test.source_system_tab, test.source_screen_name, test.source_type], batch_size=32768, verbose=1)
sub = pd.DataFrame({'id': id_test, 'target': preds_test.ravel()})
sub.to_csv('./sub_%.5f.csv'%(val_auc), index=False)

0.810343843117
