# 1. Import packages

In [31]:
import time
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape, Lambda
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD

In [2]:
# Log the wall-clock time at which this cell is executed.
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-12-12 21:45:06


In [3]:
# Define categorical columns in the data frame
def _CAT_NUM():
    global CATEGORICAL, NUMERICAL
    CATEGORICAL = [
        'msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type', 
        'genre_ids', 'artist_name', 'composer', 'lyricist', 'language', 
        'city', 'gender', 'registered_via',
    ]
    UNDECIDED = [
        'song_year',
        'registration_year', 'registration_month', 'registration_day', 'expiration_year', 'expiration_month', 'expiration_day'
    ]
    NUMERICAL = [
        'song_length', 'age', 'weird_age', 'validate_days',
        'count_song_played', 'count_artist_played', 
        'genre_count', 'lyricist_count', 'composer_count', 'artist_count', 
        'is_featured', 'artist_composer', 'artist_composer_lyricist', 'song_lang_boolean', 'smaller_song'
    ]
    NUMERICAL += UNDECIDED

def _check_CAT_NUM():
    """Print three sanity counts for the feature lists.

    Printed values, in order: number of columns in ``df_train``, number of
    distinct names across ``CATEGORICAL`` and ``NUMERICAL``, and number of
    names that appear in both lists.
    """
    n_columns = len(df_train.columns)
    categorical_names = set(CATEGORICAL)
    numerical_names = set(NUMERICAL)
    n_declared = len(categorical_names | numerical_names)
    n_overlap = len(categorical_names & numerical_names)
    print(n_columns, n_declared, n_overlap)


# Define the CATEGORICAL / NUMERICAL globals that later cells read.
_CAT_NUM()

# 2. Load data and feature engineering

In [4]:
# Read the five raw CSV tables from `input_dir` (path is relative to the notebook).
input_dir = '../input/'
df_train = pd.read_csv(input_dir + "train.csv")
df_test  = pd.read_csv(input_dir + 'test.csv')
df_songs = pd.read_csv(input_dir + 'songs.csv')
df_song_extra = pd.read_csv(input_dir + "song_extra_info.csv")
# The two member date columns are parsed to datetimes so that `.dt` accessors can be used on them later.
df_members = pd.read_csv(input_dir + "members.csv", parse_dates=["registration_init_time","expiration_date"])

In [5]:
# Display the columns of the raw training table (before any joins).
df_train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target'],
      dtype='object')

## 2.0 Simple member related and song related

In [6]:
# Member-level features (mutates df_members in place)
df_members.rename(columns={'bd': 'age'}, inplace=True)

# Ages below 5 or from 80 upwards are reset to 0 ...
_age_out_of_range = (df_members['age'] < 5) | (df_members['age'] >= 80)
df_members.loc[_age_out_of_range, 'age'] = 0
# ... and every zero age is flagged in a separate indicator column.
df_members['weird_age'] = (df_members['age'] == 0).astype(np.int64)

# Membership span in days between registration and expiration.
_membership_span = df_members['expiration_date'] - df_members['registration_init_time']
df_members['validate_days'] = _membership_span.dt.days.astype(int)

# Split both dates into year / month / day integer columns.
for _prefix, _date_col in (('registration', 'registration_init_time'),
                           ('expiration', 'expiration_date')):
    for _part in ('year', 'month', 'day'):
        df_members[_prefix + '_' + _part] = getattr(df_members[_date_col].dt, _part).astype(int)

# The raw datetime columns are no longer needed once decomposed.
df_members.drop(['registration_init_time', 'expiration_date'], axis=1, inplace=True)

In [7]:
# song related
def isrc_to_year(isrc, pivot=17):
    """Extract a four-digit year from characters 5-6 of an ISRC string.

    Parameters
    ----------
    isrc : object
        ISRC code. Anything that is not a ``str`` (e.g. a NaN read from a
        missing CSV cell) is treated as missing.
    pivot : int, default 17
        Two-digit years greater than ``pivot`` are mapped to 19xx, all
        others to 20xx.

    Returns
    -------
    int or float
        The year as an ``int``, or ``np.nan`` when ``isrc`` is not a string
        or its year field cannot be parsed as an integer (too short,
        non-numeric).
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # Malformed code: treat as missing instead of aborting the whole .apply().
        return np.nan
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year

# Derive the release year from the ISRC code, then discard the raw text columns.
df_song_extra['song_year'] = df_song_extra['isrc'].apply(isrc_to_year)
df_song_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# Divide by 1000 (1000 units <=> 1 s) and cap the result at 1800.
df_songs['song_length'] = (df_songs['song_length'] / 1000.0).clip(upper=1800)

## 2.1 Join

In [8]:
# Enrich train and test with song, member and extra-song attributes.
# Every join is a left join, so no interaction row is dropped:
#   songs       on song_id
#   members     on msno (user id)
#   song_extra  on song_id
df_train = (
    df_train
    .merge(df_songs, how="left", on="song_id")
    .merge(df_members, how="left", on="msno")
    .merge(df_song_extra, how='left', on='song_id')
)

df_test = (
    df_test
    .merge(df_songs, how="left", on="song_id")
    .merge(df_members, how="left", on="msno")
    .merge(df_song_extra, how='left', on='song_id')
)

In [9]:
# for col in ['registered_via', 'city', 'language']:
#     df_train[col] = df_train[col].astype(str)
#     df_test[col] = df_test[col].astype(str)

## 2.2 Missing value

In [10]:
# Placeholder written into text-like columns whose value is missing.
UNKNOWN = 'Unknown'
col_fill_with_unknown = [
    'source_system_tab', 'source_screen_name', 'source_type',
    'gender',
    'genre_ids', 'artist_name', 'composer', 'lyricist', 'language'
]
# Assign the filled column back instead of calling fillna(inplace=True) on
# `df[col]`: the in-place form mutates a temporary selection and does not
# update the frame when pandas Copy-on-Write is active.
for col in col_fill_with_unknown:
    df_train[col] = df_train[col].fillna(UNKNOWN)
    df_test[col] = df_test[col].fillna(UNKNOWN)

# Fallback length for songs that have no entry in the songs table.
DEFAULT_SONG_LENGTH = 230
df_train['song_length'] = df_train['song_length'].fillna(DEFAULT_SONG_LENGTH)
df_test['song_length'] = df_test['song_length'].fillna(DEFAULT_SONG_LENGTH)

# The median year is taken from the training rows only and reused for test.
fill_in_value = df_train['song_year'].median()
df_train['song_year'] = df_train['song_year'].fillna(fill_in_value)
df_test['song_year'] = df_test['song_year'].fillna(fill_in_value)

## 2.3 Count and binary features

In [11]:
def _count(x):
    """Estimate how many entries a delimited text field contains.

    Returns 0 for the ``UNKNOWN`` placeholder; otherwise one more than the
    total number of separator characters found in ``x``.
    """
    if x == UNKNOWN:
        return 0
    separators = ('|', '/', '\\', ';', '、', ',')
    return 1 + sum(x.count(sep) for sep in separators)

# One count column per multi-valued text column, on both frames.
for _source_col, _count_col in (('genre_ids', 'genre_count'),
                                ('lyricist', 'lyricist_count'),
                                ('composer', 'composer_count')):
    df_train[_count_col] = df_train[_source_col].apply(_count).astype(int)
    df_test[_count_col] = df_test[_source_col].apply(_count).astype(int)

def is_featured(x):
    """Return 1 if the text form of ``x`` contains the substring 'feat', else 0."""
    return 1 if 'feat' in str(x) else 0
def artist_count(x):
    """Estimate the number of artists credited in an artist-name string.

    Returns 0 for the ``UNKNOWN`` placeholder; otherwise one more than the
    number of occurrences of the substrings 'and', ',', 'feat' and '&'.
    Matching is plain substring counting, so 'and' inside a longer word is
    counted as well.
    """
    if x == UNKNOWN:
        return 0
    joiners = ('and', ',', 'feat', '&')
    return sum(x.count(token) for token in joiners) + 1

for _frame in (df_train, df_test):
    _artist = _frame['artist_name']
    _frame['is_featured'] = _artist.apply(is_featured).astype(np.int8)
    _frame['artist_count'] = _artist.apply(artist_count).astype(np.int8)

    # 1 when the artist string is identical to the composer string
    _frame['artist_composer'] = (_artist == _frame['composer']).astype(np.int8)

    # 1 when artist, composer and lyricist strings are all identical
    _all_three_equal = (
        (_artist == _frame['composer'])
        & (_artist == _frame['lyricist'])
        & (_frame['composer'] == _frame['lyricist'])
    )
    _frame['artist_composer_lyricist'] = _all_three_equal.astype(np.int8)

# Flag language codes 17 and 45.
def song_lang_boolean(x):
    """Return 1 if the text form of ``x`` contains '17.0' or '45.0', else 0."""
    text = str(x)
    return int(any(code in text for code in ('17.0', '45.0')))
# Threshold for the "smaller song" flag: mean song length over the training rows.
_mean_song_length = np.mean(df_train['song_length'])

def smaller_song(x):
    """Return 1 if ``x`` is strictly below the training mean song length, else 0."""
    return 1 if x < _mean_song_length else 0

for _frame in (df_train, df_test):
    _frame['song_lang_boolean'] = _frame['language'].apply(song_lang_boolean).astype(np.int8)
    _frame['smaller_song'] = _frame['song_length'].apply(smaller_song).astype(np.int8)

## 2.4 Historical statistical features

In [12]:
def _occurrence_counts(column):
    """Map each value of ``column`` to its row count.

    The count comes from ``df_train`` when the value occurs there, and from
    ``df_test`` otherwise. ``Series.to_dict()`` is used instead of iterating
    ``Series.iteritems()``, which was removed in pandas 2.0.
    """
    counts = df_test[column].value_counts().to_dict()
    # Train counts overwrite test counts for values present in both frames.
    counts.update(df_train[column].value_counts().to_dict())
    return counts

# Number of rows in which each song appears (train count first, test count as fallback).
_dict_count_song_played = _occurrence_counts('song_id')
def count_song_played(x):
    """Return the occurrence count of song id ``x``, or 0 if it was never seen."""
    return _dict_count_song_played.get(x, 0)
df_train['count_song_played'] = df_train['song_id'].apply(count_song_played).astype(np.int64)
df_test['count_song_played'] = df_test['song_id'].apply(count_song_played).astype(np.int64)
# Release the lookup table; count_song_played() is not usable after this point.
del _dict_count_song_played

# Number of rows in which each artist appears (same precedence as above).
_dict_count_artist_played = _occurrence_counts('artist_name')
def count_artist_played(x):
    """Return the occurrence count of artist ``x``, or 0 if it was never seen."""
    return _dict_count_artist_played.get(x, 0)
df_train['count_artist_played'] = df_train['artist_name'].apply(count_artist_played).astype(np.int64)
df_test['count_artist_played'] = df_test['artist_name'].apply(count_artist_played).astype(np.int64)
# Release the lookup table; count_artist_played() is not usable after this point.
del _dict_count_artist_played

## 2.5 Standardize numerical features

In [13]:
# Standardize every numerical column to zero mean / unit variance.
# The statistics come from the training frame only and are reused for test.
# Re-running this cell rescales columns that are already scaled.
for col in NUMERICAL:
    mean, stdd = np.mean(df_train[col]), np.std(df_train[col])
    for _frame in (df_train, df_test):
        _frame[col] = (_frame[col] - mean) / stdd
    print(col, mean, stdd, sep='\t')

song_length	244.9424990054791	59.95896238525197
age	17.2371374375154	15.560832070689322
weird_age	0.3997230467353212	0.4898413341675067
validate_days	1627.9613096343464	1128.6731207827263
count_song_played	1640.9239731298944	2576.840783303741
count_artist_played	39826.780881332736	69878.12930279176
genre_count	1.0373533125003898	0.29489515136292566
lyricist_count	0.869041716221041	1.2176895831574315
composer_count	1.3887604037076386	1.6258642833509023
artist_count	1.03012286955	0.190183271661
is_featured	0.00206589893646	0.0454051869068
artist_composer	0.0302623492393	0.171308316954
artist_composer_lyricist	0.0170903966672	0.129608313811
song_lang_boolean	0.0335527958427	0.180075000302
smaller_song	0.525266970097	0.499361172121
song_year	2011.447105748922	6.458415090813096
registration_year	2012.7405063668616	3.01886048129776
registration_month	6.8323058284077165	3.7007223813226187
registration_day	15.815322786373226	8.7685480878986
expiration_year	2017.071606895529	0.3982535484067784


# 3. Preparing Data for keras
Decide each feature's data type and label-encode the categorical features.

In [14]:
# Label-encode every categorical column (expected to take under 10 minutes).
print(time.strftime("%Y-%m-%d %H:%M:%S"))
# Keep each fitted encoder by column name for later reference,
# e.g. the number of classes when sizing embeddings.
encoder_dict = dict()
for col in CATEGORICAL:
    print(col, end='...')
    # Cast to str first so mixed-type columns (numbers plus 'Unknown') encode uniformly.
    df_train[col] = df_train[col].astype(str)
    df_test[col]  = df_test[col].astype(str)
    encoder = LabelEncoder()
    # Fit on train and test together so every test value receives a code.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    encoder.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
    encoder_dict[col] = encoder
    df_train[col] = encoder.transform(df_train[col])
    df_test[col]  = encoder.transform(df_test[col])
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-12-08 11:46:26
msno...song_id...source_system_tab...source_screen_name...source_type...genre_ids...artist_name...composer...lyricist...language...city...gender...registered_via...2017-12-08 11:52:42


In [15]:
# Save the prepared frames so the feature engineering above can be skipped next time.
# index=False keeps the row index out of the file; otherwise reading the CSV
# back yields an extra 'Unnamed: 0' column.
df_train.to_csv('../output/df_train.csv', index=False)
df_test.to_csv('../output/df_test.csv', index=False)

## Start from here if the previous steps have already been executed

In [4]:
# Reload the prepared frames written by the save cell.
# Only the two data frames are restored here: the CATEGORICAL / NUMERICAL
# globals still have to be defined by running _CAT_NUM() in this kernel.
df_train = pd.read_csv('../output/df_train.csv')
df_test  = pd.read_csv('../output/df_test.csv')

## 3.1 Split the data
Randomly split the data into two parts (90%/10%) for train/validation.

In [13]:
# Reproducible shuffled split: 90% of the rows for training, 10% for validation.
np.random.seed(42)
perm = np.random.permutation(len(df_train))
# perm = list(range(len(df_train)))  # alternative: keep the stored row order

trn_cnt = int(len(df_train) * 0.9)
# `perm` holds row positions, so select with .iloc; this stays correct even
# if df_train's index is not the default 0..n-1 range.
X_trn = df_train.iloc[perm[:trn_cnt]]
# X_trn = df_train  # alternative: train on every row
X_val = df_train.iloc[perm[trn_cnt:]]
# Last 10% of the rows in stored order. This slice is taken independently of
# `perm`, so it shares rows with X_trn and is not a clean hold-out set.
X_last = df_train.iloc[trn_cnt:]

# 4 Train the memorization DNN: Pseudo MF

In [86]:
# define the model
def mem_model():
    """Build and compile the memorization network over user and song ids.

    Architecture: a 64-dim embedding for ``msno`` and one for ``song_id``;
    their dot product is concatenated with both vectors (64 + 64 + 1 = 129
    features), followed by Dense(128, relu), Dropout(0.5) and a single
    sigmoid output unit.

    Embedding vocabulary sizes are read from the module-level ``df_train``
    and ``df_test`` frames (largest encoded id + 1), so both frames must
    already be label-encoded.

    Returns
    -------
    keras.models.Model
        Model taking ``[user_ids, song_ids]``, compiled with binary
        cross-entropy, RMSprop(lr=1e-3) and the 'acc' metric.
    """
    # Largest encoded user id over train and test, plus one.
    vocab_size = int(max(df_train['msno'].max(), df_test['msno'].max()) + 1)
    user_embeddings = Embedding(
        input_dim = vocab_size,
        output_dim = 64,
        embeddings_initializer = RandomUniform(minval=-0.1, maxval=0.1),
        embeddings_regularizer = l2(1e-4),
        input_length = 1,
        name = 'user_embed',
        trainable=True)
    
    # Largest encoded song id over train and test, plus one.
    vocab_size = int(max(df_train['song_id'].max(), df_test['song_id'].max()) + 1)
    song_embeddings = Embedding(
        input_dim = vocab_size,
        output_dim = 64,
        embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
        embeddings_regularizer=l2(1e-4),
        input_length=1,
        name = 'song_embed',
        trainable=True)

    # embedding of user id; Reshape turns (1, 64) into (64,)
    uid_input = Input(shape=(1,), dtype='int32')
    embedded_usr = user_embeddings(uid_input)
    embedded_usr = Reshape((64,))(embedded_usr)

    # embedding of song id; Reshape turns (1, 64) into (64,)
    sid_input = Input(shape=(1,), dtype='int32')
    embedded_song = song_embeddings(sid_input)
    embedded_song = Reshape((64,))(embedded_song)

    # dot product of the two embedded vectors
    preds = dot([embedded_usr, embedded_song], axes=1)
    # concatenate the two embedded vectors (64 + 64) with the dot product (1) into a 129-dim vector
    preds = concatenate([embedded_usr, embedded_song, preds], name='mem_latent')
    
    # 128-dim hidden layer
    preds = Dense(128, activation='relu', name='mem_dense1')(preds)
    # dropout layer
    preds = Dropout(0.5, name='mem_dropout1')(preds)
    
    # output layer
    preds = Dense(1, activation='sigmoid', name='mem_output')(preds)

    model = Model(inputs=[uid_input, sid_input], outputs=preds)
    opt = RMSprop(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])

    return model

In [71]:
########################################
## train the model
########################################
mem = mem_model()
early_stopping = EarlyStopping(monitor='val_acc', patience=5) # early stop if no val acc improvement for 5 epochs
mem_path = '../model/keras/2_mem_model.h5'
# Checkpoint: only the best weights seen so far are written to mem_path.
# NOTE(review): no `monitor` is passed here while EarlyStopping watches
# 'val_acc' -- confirm that both callbacks track the intended metric.
model_checkpoint = ModelCheckpoint(mem_path, save_best_only=True, \
        save_weights_only=True)

mem.summary()
# Inputs are the encoded user and song ids; X_val is used for validation.
hist = mem.fit(
    [X_trn['msno'], X_trn['song_id']], X_trn['target'],
    validation_data=([X_val['msno'], X_val['song_id']], X_val['target']), 
    epochs=30, batch_size=32768, shuffle=True,
    callbacks=[early_stopping, model_checkpoint]
)
mem.load_weights(mem_path) # restore the checkpointed weights in place of the last-epoch weights

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
user_embed (Embedding)          (None, 1, 64)        2201792     input_13[0][0]                   
__________________________________________________________________________________________________
song_embed (Embedding)          (None, 1, 64)        26869696    input_14[0][0]                   
__________________________________________________________________________________________________
reshape_12

Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


## Produce memorization prediction

In [73]:
def mem_validate(model, X):
    """Score ``model`` on frame ``X`` and return the ROC AUC.

    ``X`` must provide the 'msno', 'song_id' and 'target' columns. The AUC
    is printed as well as returned.
    """
    id_inputs = [X['msno'], X['song_id']]
    scores = model.predict(id_inputs, batch_size=32768)
    auc = roc_auc_score(X['target'], scores)
    print(auc)
    return auc

def mem_produce(model, val_auc):
    """Write a gzip-compressed Kaggle submission for ``df_test``.

    Parameters
    ----------
    model : keras model
        Memorization model taking ``[msno, song_id]`` inputs. Predictions
        come from this argument; the function previously ignored it and
        always used the global ``mem``.
    val_auc : float
        Validation AUC, embedded in the output file name with 5 decimals.

    The file is written to ``../result/`` and named with the current
    timestamp. Returns ``None``.
    """
    preds_test = model.predict([df_test['msno'], df_test['song_id']], batch_size=32768, verbose=1)
    sub = pd.DataFrame({'id': df_test['id'], 'target': preds_test.ravel()})
    # '%' binds tighter than '+', so only the last fragment is formatted with val_auc.
    sub.to_csv('../result/sub_' + time.strftime("%Y-%m-%d_%H_%M_%S") + '_%.5f.csv.gz' %(val_auc), 
               compression = 'gzip', index=False)

# Print the AUC on three slices: the validation split, the training split,
# and X_last. Only the validation AUC is kept, for the submission file name.
val_auc = mem_validate(mem, X_val)
mem_validate(mem, X_trn)
mem_validate(mem, X_last)

# produce output for Kaggle submission
mem_produce(mem, val_auc)

0.772581301868
0.802930766793
0.771886427379


# 5. Train generalization DNN
to reuse the model layers, use code like `model.layers[0].get_weights()`

In [59]:
# RAW_CATEGORICAL = the CATEGORICAL features minus the two id columns
# (user id `msno` and `song_id`).
RAW_CATEGORICAL = list(CATEGORICAL)
for _id_col in ('msno', 'song_id'):
    RAW_CATEGORICAL.remove(_id_col)
RAW_CATEGORICAL

['source_system_tab',
 'source_screen_name',
 'source_type',
 'genre_ids',
 'artist_name',
 'composer',
 'lyricist',
 'language',
 'city',
 'gender',
 'registered_via']

In [80]:
def gen_model(mem_model):
    """Build and compile the generalization network on top of a memorization model.

    Each column in ``RAW_CATEGORICAL`` gets its own embedding; the embeddings
    are concatenated with the ``NUMERICAL`` features and mapped through
    Dense(128, relu) -> Dropout(0.5) -> Dense(128, linear). The 128 outputs
    are split into a 64-dim "user" half and a 64-dim "song" half, combined
    with their dot product into a 129-dim vector, and passed through the
    'mem_dense1', 'mem_dropout1' and 'mem_output' layers taken from
    ``mem_model``.

    Side effects
    ------------
    * Sets the module-level ``cat`` to ``RAW_CATEGORICAL`` (read by
      ``gen_input``).
    * Sets ``trainable`` on the three layers fetched from ``mem_model``.
      Those layer objects are reused here rather than copied.

    Parameters
    ----------
    mem_model : keras model
        Model that contains layers named 'mem_dense1', 'mem_dropout1' and
        'mem_output'. Note the parameter shadows the module-level
        ``mem_model`` function inside this body.

    Returns
    -------
    keras.models.Model
        Model whose inputs are one tensor per categorical column (in ``cat``
        order) followed by the numerical matrix; compiled with binary
        cross-entropy, RMSprop(lr=1e-3) and the 'acc' metric.
    """
    global cat
    cat = RAW_CATEGORICAL
    input_layers = dict()
    embed_layers = dict()
    for col in cat:
        # embed the categorical features
        vocab_size = int(max(df_train[col].max(), df_test[col].max()) + 1)
        # Embedding width: log2(vocab_size) rounded up to the next power of two.
        embed_size = np.power(2, int(np.ceil(np.log2(np.log2(vocab_size)))))
        print('%20s\tvocab: %8d, embed: %4d' % (col, vocab_size, embed_size))
        embed_layers[col] = Embedding(
            input_dim = vocab_size,
            output_dim = embed_size,
            embeddings_initializer = RandomUniform(minval=-0.1, maxval=0.1),
            embeddings_regularizer = l2(1e-4),
            input_length = 1,
            name = col+'_embed',
            trainable=True)
        
        input_layers[col] = Input(shape=(1,), name=col+'_input')
        # From here on embed_layers[col] holds the embedded tensor, not the layer.
        embed_layers[col] = embed_layers[col](input_layers[col])
        embed_layers[col] = Reshape((embed_size,))(embed_layers[col])
    n_num = len(NUMERICAL)
    numerical_input = Input(shape=(n_num,), name='numerical_input')
    # input features: the embedding outputs concatenated with the numerical features
    preds = concatenate([embed_layers[col] for col in cat] + [numerical_input])
    
    # generalization layers, from features to latent vectors
    preds = Dense(128, activation='relu', name='gen_dense1')(preds)
    # dropout
    preds = Dropout(0.5, name='gen_dropout')(preds)
    # linear projection to 128 values: first 64 = latent user, last 64 = latent song
    preds = Dense(128, name='gen_dense3')(preds) 
    latent_user = Lambda(lambda x: x[:, :64], name='latent_user')(preds)
    latent_song = Lambda(lambda x: x[:, 64:], name='latent_song')(preds)
    preds = dot([latent_song, latent_user], axes=1)
    # same [user, song, dot] layout as the 'mem_latent' vector of the memorization model
    preds = concatenate([latent_user, latent_song, preds], name='gen_latent')
    
    ############################################
    ## inherit layers from the memorization model
    ############################################
    mem_layer_trainable = False # fixed
    # mem_layer_trainable = True # not fixed
    
    layer = mem_model.get_layer('mem_dense1')
    layer.trainable = mem_layer_trainable
    preds = layer(preds)
    layer = mem_model.get_layer('mem_dropout1')
    layer.trainable = mem_layer_trainable
    preds = layer(preds)
    layer = mem_model.get_layer('mem_output')
    layer.trainable = mem_layer_trainable
    preds = layer(preds)
    
    # Input order must match the list built by gen_input().
    input_list = [input_layers[col] for col in cat] + [numerical_input]

    model = Model(inputs=input_list, outputs=preds)
    opt = RMSprop(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])
    return model

def gen_input(X):
    """Build the model input list from frame ``X``.

    Returns one column per name in the module-level ``cat`` list (in that
    order), followed by the sub-frame of all ``NUMERICAL`` columns.
    """
    features = [X[col] for col in cat]
    features.append(X.loc[:, NUMERICAL])
    return features

In [81]:
# Build the generalization model on top of the trained memorization model `mem`
# and print its layer summary.
gen = gen_model(mem)
gen.summary()

   source_system_tab	vocab:       10, embed:    4
  source_screen_name	vocab:       22, embed:    8
         source_type	vocab:       13, embed:    4
           genre_ids	vocab:      609, embed:   16
         artist_name	vocab:    46373, embed:   16
            composer	vocab:    86438, embed:   32
            lyricist	vocab:    37876, embed:   16
            language	vocab:       11, embed:    4
                city	vocab:       21, embed:    8
              gender	vocab:        3, embed:    2
      registered_via	vocab:        6, embed:    4
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
source_system_tab_input (InputL (None, 1)            0                                            
__________________________________________________________________________________________________
source_screen_name_input (Input (None, 1)            0

In [82]:
early_stopping = EarlyStopping(monitor='val_acc', patience=5) # stop after 5 epochs without val_acc improvement
gen_path = '../model/keras/gen_model.h5'
# NOTE(review): no `monitor` is passed to ModelCheckpoint while EarlyStopping
# watches 'val_acc' -- confirm that both callbacks track the intended metric.
model_checkpoint = ModelCheckpoint(gen_path, save_best_only=True, save_weights_only=True) # save best models

# training
hist = gen.fit(
    gen_input(X_trn), X_trn['target'],
    validation_data=(gen_input(X_val), X_val['target']), 
    epochs=100, batch_size=32768, shuffle=True,
    callbacks=[early_stopping, model_checkpoint]
)
# restore the checkpointed weights in place of the last-epoch weights
gen.load_weights(gen_path)

Train on 6639676 samples, validate on 737742 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


In [83]:
def gen_validate(model, X):
    """Score ``model`` on frame ``X`` (inputs built by ``gen_input``) and return the ROC AUC.

    The AUC is printed as well as returned.
    """
    scores = model.predict(gen_input(X), batch_size=32768)
    auc = roc_auc_score(X['target'], scores)
    print(auc)
    return auc

In [84]:
def gen_produce(model, val_auc):
    """Predict on ``df_test`` with ``model`` and write a gzip-compressed submission CSV.

    The file goes to ``../result/``; its name carries the current timestamp
    and ``val_auc`` formatted with 5 decimals. Returns ``None``.
    """
    test_scores = model.predict(gen_input(df_test), batch_size=32768, verbose=1)
    submission = pd.DataFrame({'id': df_test['id'], 'target': test_scores.ravel()})
    out_path = '../result/sub_' + time.strftime("%Y-%m-%d_%H_%M_%S") + '_%.5f.csv.gz' % (val_auc)
    submission.to_csv(out_path, compression='gzip', index=False)

In [85]:
# Print the AUC on the validation split and on X_last;
# only the validation AUC is kept, for the submission file name.
val_auc = gen_validate(gen, X_val)
gen_validate(gen, X_last)

# generate output for Kaggle submission
gen_produce(gen, val_auc)

0.718293225796
0.674239553462


# 6. Other Experiments Related

## 6.1 DNN

In [10]:
# Categorical columns fed to the plain DNN; read by dnn_model() and dnn_input().
cat = RAW_CATEGORICAL # without uid & sid
# cat = CATEGORICAL # with uid & sid

def dnn_model():
    """Build and compile a plain feed-forward baseline network.

    Each column in the module-level ``cat`` list gets its own embedding;
    the embeddings are concatenated with the ``NUMERICAL`` features and
    passed through Dense(128) -> Dense(64) -> Dense(32) (all relu),
    Dropout(0.5) and a single sigmoid output unit.

    Vocabulary sizes are read from the module-level ``df_train`` and
    ``df_test`` frames, and one line per column is printed with the chosen
    vocabulary and embedding sizes.

    Returns
    -------
    keras.models.Model
        Model whose inputs are one tensor per categorical column (in ``cat``
        order) followed by the numerical matrix; compiled with binary
        cross-entropy, RMSprop(lr=1e-3) and the 'acc' metric.
    """
    input_layers = dict()
    embed_layers = dict()
    for col in cat:
        # embedding layers for categorical features
        vocab_size = int(max(df_train[col].max(), df_test[col].max()) + 1)
        # Embedding width: log2(vocab_size) rounded up to the next power of two.
        embed_size = np.power(2, int(np.ceil(np.log2(np.log2(vocab_size)))))
        print('%20s\tvocab: %8d, embed: %4d' % (col, vocab_size, embed_size))
        embed_layers[col] = Embedding(
            input_dim = vocab_size,
            output_dim = embed_size,
            embeddings_initializer = RandomUniform(minval=-0.1, maxval=0.1),
            embeddings_regularizer = l2(1e-4),
            input_length = 1,
            name = col+'_embed',
            trainable=True)
        
        input_layers[col] = Input(shape=(1,), name=col+'_input')
        # From here on embed_layers[col] holds the embedded tensor, not the layer.
        embed_layers[col] = embed_layers[col](input_layers[col])
        embed_layers[col] = Reshape((embed_size,))(embed_layers[col])
    n_num = len(NUMERICAL)
    numerical_input = Input(shape=(n_num,), name='numerical_input')
    preds = concatenate([embed_layers[col] for col in cat] + [numerical_input])
    
    # hidden layers
    preds = Dense(128, activation='relu', name='dnn_dense1')(preds)
    preds = Dense(64, activation='relu', name='dnn_dense2')(preds)
    preds = Dense(32, activation='relu', name='dnn_dense3')(preds)
    # dropout
    preds = Dropout(0.5)(preds)
    # output layer
    preds = Dense(1, activation='sigmoid')(preds)
    
    # Input order must match the list built by dnn_input().
    input_list = [input_layers[col] for col in cat] + [numerical_input]

    model = Model(inputs=input_list, outputs=preds)
    opt = RMSprop(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])
    return model

def dnn_input(X):
    """Build the DNN input list from frame ``X``.

    Returns one column per name in the module-level ``cat`` list (in that
    order), followed by the sub-frame of all ``NUMERICAL`` columns.
    """
    features = [X[col] for col in cat]
    features.append(X.loc[:, NUMERICAL])
    return features

def dnn_validate(model, X):
    """Score ``model`` on frame ``X`` (inputs built by ``dnn_input``) and return the ROC AUC.

    The AUC is printed as well as returned.
    """
    scores = model.predict(dnn_input(X), batch_size=32768)
    auc = roc_auc_score(X['target'], scores)
    print(auc)
    return auc

def dnn_produce(model, val_auc):
    """Predict on ``df_test`` with ``model`` and write a gzip-compressed submission CSV.

    The file goes to ``../result/`` with the ``sub_dnn_`` prefix; its name
    carries the current timestamp and ``val_auc`` formatted with 5 decimals.
    Returns ``None``.
    """
    test_scores = model.predict(dnn_input(df_test), batch_size=32768, verbose=1)
    submission = pd.DataFrame({'id': df_test['id'], 'target': test_scores.ravel()})
    out_path = '../result/sub_dnn_' + time.strftime("%Y-%m-%d_%H_%M_%S") + '_%.5f.csv.gz' % (val_auc)
    submission.to_csv(out_path, compression='gzip', index=False)

In [11]:
# Build the baseline DNN and print its layer summary.
dnn = dnn_model()
dnn.summary()

   source_system_tab	vocab:       10, embed:    4
  source_screen_name	vocab:       22, embed:    8
         source_type	vocab:       13, embed:    4
           genre_ids	vocab:      609, embed:   16
         artist_name	vocab:    46373, embed:   16
            composer	vocab:    86438, embed:   32
            lyricist	vocab:    37876, embed:   16
            language	vocab:       11, embed:    4
                city	vocab:       21, embed:    8
              gender	vocab:        3, embed:    2
      registered_via	vocab:        6, embed:    4
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
source_system_tab_input (InputL (None, 1)            0                                            
__________________________________________________________________________________________________
source_screen_name_input (Input (None, 1)            0

In [14]:
early_stopping = EarlyStopping(monitor='val_acc', patience=5) # stop after 5 epochs without val_acc improvement
dnn_path = '../model/keras/dnn_model.h5'
# NOTE(review): no `monitor` is passed to ModelCheckpoint while EarlyStopping
# watches 'val_acc' -- confirm that both callbacks track the intended metric.
model_checkpoint = ModelCheckpoint(dnn_path, save_best_only=True, save_weights_only=True) # save the best model
# training
hist = dnn.fit(
    dnn_input(X_trn), X_trn['target'],
    validation_data=(dnn_input(X_val), X_val['target']), 
    epochs=100, batch_size=32768, shuffle=True,
    callbacks=[early_stopping, model_checkpoint]
)
# restore the checkpointed weights in place of the last-epoch weights
dnn.load_weights(dnn_path)

Train on 6639676 samples, validate on 737742 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


In [15]:
# Print the AUC on the validation split and on X_last;
# only the validation AUC is kept, for the submission file name.
val_auc = dnn_validate(dnn, X_val)
dnn_validate(dnn, X_last)
# generate the output for Kaggle submission
dnn_produce(dnn, val_auc)

0.735858533144
0.703385242235
