In [12]:
import os
import sys
import time
import json
from typing import Dict, List, Iterable

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import ast
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from datetime import datetime, timedelta
# from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import (DeepFM, DCN, DCNMix, xDeepFM, AFM, DIFM,
                                  basemodel)
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint

from src import preprocess, generate_datasets
# from src.train import random_features

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = torch.device("cpu")

## Preprocess

In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
songs_df = pd.read_csv('../data/songs.csv')
songs_extra_df = pd.read_csv('../data/song_extra_info.csv')
members_df = pd.read_csv('../data/members.csv')

In [3]:
tr_song_msno_df, val_song_msno_df, ts_song_msno_df, item2idx = preprocess(
    train_df, test_df, songs_df, members_df)

In [3]:
train_test_df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
# filter songs and members
songs_df = songs_df[songs_df['song_id'].isin(train_test_df['song_id'])]
songs_extra_df = songs_extra_df[songs_extra_df['song_id'].isin(
    train_test_df['song_id'])]
members_df = members_df[members_df['msno'].isin(train_test_df['msno'])]

### Songs

In [5]:
missing_song_ids = train_test_df[~train_test_df['song_id'].
                                 isin(songs_df['song_id'])][['song_id']]

In [6]:
songs_df = pd.concat((songs_df, missing_song_ids), ignore_index=True)

In [7]:
songs_df.columns

Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')

In [8]:
# encode song ids
song_id_encoder = LabelEncoder()
song_id_encoder.fit(train_test_df['song_id'])
songs_df['song_id'] = song_id_encoder.transform(songs_df['song_id'])
songs_extra_df['song_id'] = song_id_encoder.transform(
    songs_extra_df['song_id'])
train_df['song_id'] = song_id_encoder.transform(train_df['song_id'])
test_df['song_id'] = song_id_encoder.transform(test_df['song_id'])

In [9]:
# standardize song length
songlen_ss = StandardScaler()
songs_df['song_length'].fillna(songs_df['song_length'].median(), inplace=True)
songs_df['song_length'] = songlen_ss.fit_transform(
    songs_df['song_length'].to_numpy().reshape(-1, 1)).reshape(-1)

In [10]:
def map_idx(item_ls: List[str], item2idx: Dict[str, int]):
    for item in item_ls:
        if item not in item2idx:
            item2idx[item] = len(item2idx)

    return list(map(lambda item: item2idx[item], item_ls))

In [11]:
# genre ids
genre2idx = {'[PAD]': 0}
songs_df['genre_ids'].fillna('Unknown', inplace=True)
songs_df['genre_ids'] = songs_df['genre_ids'].astype(str).str.split('|')
songs_df['genre_ids'] = songs_df['genre_ids'].apply(map_idx,
                                                    item2idx=genre2idx)

In [12]:
def name_split(col_name: str):
    return songs_df[col_name].str.split(
        r"and|,|feat\.|featuring|&|\. |\||/|\\|;",
        regex=True).apply(lambda ls: list(
            map(lambda s: s.strip(), filter(lambda s: s.strip(), ls))))

In [13]:
songs_df['artist_name'].fillna("", inplace=True)
songs_df['composer'].fillna("", inplace=True)
songs_df['lyricist'].fillna("", inplace=True)
songs_df['is_featured'] = songs_df['artist_name'].apply(
    lambda x: 1 if 'feat' in str(x) else 0)

In [14]:
songs_df['artist_name'] = name_split('artist_name')
songs_df['composer'] = name_split('composer')
songs_df['lyricist'] = name_split('lyricist')

In [15]:
songs_df['num_artist'] = songs_df['artist_name'].apply(len)
songs_df['num_composer'] = songs_df['composer'].apply(len)
songs_df['num_lyricist'] = songs_df['lyricist'].apply(len)

In [16]:
for col in ['num_artist', 'num_composer', 'num_lyricist']:
    songs_df[col] = StandardScaler().fit_transform(
        songs_df[col].to_numpy().reshape(-1, 1)).reshape(-1)

In [17]:
artist2idx = {'[PAD]': 0}
songs_df['artist_name'] = songs_df['artist_name'].apply(map_idx,
                                                        item2idx=artist2idx)

In [18]:
for col in ['composer', 'lyricist']:
    songs_df[col] = LabelEncoder().fit_transform(
        songs_df[col].apply(lambda x: x[0] if x else np.nan))


In [19]:
songs_df['language'].fillna(-1, inplace=True)
songs_df['language'] = LabelEncoder().fit_transform(songs_df['language'])

In [59]:
item2idx = {'genre_ids': genre2idx, 'artist_name': artist2idx}

In [53]:
with open('../data/item2idx.json', 'w', encoding='utf-8') as f:
    json.dump(item2idx, f)

In [20]:
songs_df.to_csv('../data/prep_songs.csv', encoding='utf-8', index=False)

In [23]:
songs_df

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,is_featured,num_artist,num_composer,num_lyricist
0,95568,-0.007565,[1],[1],48018,17282,1,0,-0.176534,-0.049576,0.580873
1,341401,-0.345950,[2],[2],35491,13935,5,0,-0.176534,1.073085,0.580873
2,104696,-0.114229,[1],[3],51065,25802,5,0,-0.176534,-0.610907,-0.397917
3,271326,0.166725,[1],[4],46606,19715,1,0,-0.176534,-0.049576,0.580873
4,223595,-0.729310,[3],[5],37237,14608,8,0,-0.176534,-0.049576,0.580873
...,...,...,...,...,...,...,...,...,...,...,...
419915,249433,-0.104544,[15],[],51065,25802,0,0,-1.951470,-0.610907,-0.397917
419916,374537,-0.104544,[15],[],51065,25802,0,0,-1.951470,-0.610907,-0.397917
419917,168359,-0.104544,[15],[],51065,25802,0,0,-1.951470,-0.610907,-0.397917
419918,185731,-0.104544,[15],[],51065,25802,0,0,-1.951470,-0.610907,-0.397917


### Members

In [24]:
train_test_df[~train_test_df['msno'].isin(members_df['msno'])]

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,id


In [25]:
members_df.columns

Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date'],
      dtype='object')

In [26]:
# encode msno
msno_encoder = LabelEncoder()
msno_encoder.fit(members_df['msno'])
members_df['msno'] = msno_encoder.transform(members_df['msno'])
train_df['msno'] = msno_encoder.transform(train_df['msno'])
test_df['msno'] = msno_encoder.transform(test_df['msno'])

In [27]:
# age
members_df['bd'] = members_df['bd'].apply(lambda x: np.nan
                                          if x <= 5 or x >= 75 else x)
members_df['bd'].fillna(members_df['bd'].median(), inplace=True)
members_df['bd'] = MinMaxScaler().fit_transform(
    members_df['bd'].to_numpy().reshape(-1, 1)).reshape(-1)

In [28]:
# encode geograph info
columns = ['city', 'gender', 'registered_via']
for column in columns:
    column_encoder = LabelEncoder()
    members_df[column] = column_encoder.fit_transform(members_df[column])

In [29]:
# preprocess dates
members_df['registration_init_time'] = pd.to_datetime(
    members_df['registration_init_time'].astype(str))
members_df['expiration_date'] = pd.to_datetime(
    members_df['expiration_date'].astype(str))

In [30]:
members_df['registration_init_time'] = members_df[
    'registration_init_time'].apply(lambda x: np.nan
                                    if x < datetime(2000, 1, 1) else x)
members_df['registration_init_time'].fillna(
    members_df['registration_init_time'].min(), inplace=True)

In [31]:
members_df.loc[members_df['expiration_date'] <
               members_df['registration_init_time'],
               'expiration_date'] = np.nan
members_df['expiration_date'].fillna(members_df['expiration_date'].max(),
                                     inplace=True)

In [32]:
dur_mm = MinMaxScaler()
dur_col = (members_df['expiration_date'] - members_df['registration_init_time']
           ).apply(lambda x: 0 if x < timedelta(0) else x.days).fillna(0)
members_df['duration'] = dur_mm.fit_transform(dur_col.to_numpy().reshape(
    -1, 1)).reshape(-1)

In [33]:
members_df['registration_init_time'] = members_df[
    'registration_init_time'].apply(lambda x: x.timestamp())
members_df['registration_init_time'] = MinMaxScaler().fit_transform(
    members_df['registration_init_time'].to_numpy().reshape(-1, 1)).reshape(-1)
members_df['expiration_date'] = members_df['expiration_date'].apply(
    lambda x: x.timestamp())
members_df['expiration_date'] = MinMaxScaler().fit_transform(
    members_df['expiration_date'].to_numpy().reshape(-1, 1)).reshape(-1)

In [34]:
members_df.to_csv('../data/prep_members.csv', encoding='utf-8', index=False)

In [35]:
members_df

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date,duration
0,19016,0,0.298507,2,2,0.572427,0.807903,0.431734
1,17476,0,0.298507,2,2,0.870606,0.792508,0.140804
2,7977,0,0.298507,2,1,0.931597,0.795929,0.088755
3,27021,0,0.298507,2,3,0.885430,0.680636,0.000194
4,29129,0,0.298507,2,1,0.993011,0.790968,0.026801
...,...,...,...,...,...,...,...,...
34398,18725,0,0.298507,2,2,0.744812,0.806192,0.271703
34399,23698,2,0.164179,1,0,0.818297,0.786521,0.181977
34400,10811,0,0.298507,2,2,0.723422,0.805850,0.290930
34401,18326,0,0.298507,2,2,0.894748,0.807903,0.136143


### Train Test

In [40]:
train_test_df.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'id'],
      dtype='object')

In [41]:
# encode source*
columns = ['source_system_tab', 'source_screen_name', 'source_type']
for column in columns:
    column_encoder = LabelEncoder()
    column_encoder.fit(train_test_df[column])
    train_df[column] = column_encoder.transform(train_df[column])
    test_df[column] = column_encoder.transform(test_df[column])

In [42]:
tr_df, val_df = train_test_split(train_df, test_size=0.2, shuffle=False)

In [43]:
tr_song_df = tr_df.merge(songs_df, how='left', on='song_id')
tr_song_msno_df = tr_song_df.merge(members_df, how='left', on='msno')
val_song_df = val_df.merge(songs_df, how='left', on='song_id')
val_song_msno_df = val_song_df.merge(members_df, how='left', on='msno')
ts_song_df = test_df.merge(songs_df, how='left', on='song_id')
ts_song_msno_df = ts_song_df.merge(members_df, how='left', on='msno')

In [50]:
tr_song_msno_df.to_csv('../data/prep_tr_song_msno.csv',
                       encoding='utf-8',
                       index=False)
val_song_msno_df.to_csv('../data/prep_val_song_msno.csv',
                        encoding='utf-8',
                        index=False)
ts_song_msno_df.to_csv('../data/prep_ts_song_msno.csv',
                       encoding='utf-8',
                       index=False)

## Construct dataset

In [2]:
tr_song_msno_df = pd.read_csv("../data/prep_tr_song_msno.csv")
val_song_msno_df = pd.read_csv("../data/prep_val_song_msno.csv")
ts_song_msno_df = pd.read_csv("../data/prep_ts_song_msno.csv")

with open('../data/item2idx.json', 'r', encoding='utf-8') as f:
    item2idx = json.load(f)

In [83]:
bl_members_df = pd.read_csv("../data/final/members_add.csv")
bl_train_df = pd.read_csv("../data/final/train_add.csv")
bl_test_df = pd.read_csv("../data/final/test_add.csv")

In [91]:
full_df = pd.concat([bl_train_df, bl_test_df], ignore_index=True)

In [94]:
sparse_features = ['source']
dense_features = list(bl_train_df.columns)
dense_features.remove('source')
fixlen_feature_columns = [
    SparseFeat(feat, len(full_df[feat].unique()), embedding_dim=64)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

In [95]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [97]:
tr_df, val_df = train_test_split(bl_train_df, test_size=0.2, shuffle=False)

In [98]:
tr_model_input = {name: tr_df[name] for name in feature_names}
val_model_input = {name: val_df[name] for name in feature_names}

In [3]:
full_df = pd.concat([tr_song_msno_df, val_song_msno_df, ts_song_msno_df],
                    ignore_index=True)

In [4]:
sparse_features = [
    'msno',
    'song_id',
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'composer',
    'lyricist',
    'language',
    'is_featured',
    'city',
    'registered_via',
    'gender',
]
dense_features = [
    'song_length',
    'num_artist',
    'num_composer',
    'num_lyricist',
    'bd',
    'registration_init_time',
    'expiration_date',
    'duration',
]
varlen_features = [
    'genre_ids',
    'artist_name',
]
embed_dim = 64

In [20]:
random_features(sparse_features)

['gender',
 'song_id',
 'registered_via',
 'source_screen_name',
 'source_system_tab',
 'source_type',
 'is_featured']

In [5]:
(tr_model_input, val_model_input, ts_model_input, linear_feature_columns,
 dnn_feature_columns) = generate_datasets(tr_song_msno_df, val_song_msno_df,
                                          ts_song_msno_df, sparse_features,
                                          dense_features, varlen_features,
                                          item2idx)

In [6]:
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5,
               dnn_dropout=0.3,
               l2_reg_dnn=1e-5,
               dnn_use_bn=True,
               device=DEVICE)
es = EarlyStopping(monitor='val_auc',
                   min_delta=0,
                   verbose=1,
                   patience=1,
                   mode='max')
os.makedirs('../checkpoints/DeepFM/', exist_ok=True)
mdckpt = ModelCheckpoint(filepath='../checkpoints/DeepFM/model.ckpt',
                         monitor='val_auc',
                         verbose=1,
                         save_best_only=True,
                         mode='max')

In [7]:
model.compile(
    # optimizer="adam",
    optimizer=optim.RMSprop(model.parameters(), lr=1e-3),
    loss="binary_crossentropy",
    metrics=['auc'])
history = model.fit(tr_model_input,
                    tr_song_msno_df['target'].values,
                    batch_size=8192,
                    epochs=10,
                    verbose=2,
                    validation_data=(val_model_input,
                                     val_song_msno_df['target'].values),
                    shuffle=True,
                    callbacks=[es, mdckpt])

cuda
Train on 5901934 samples, validate on 1475571 samples, 721 steps per epoch
Epoch 1/10
59s - loss:  0.5474 - auc:  0.7925 - val_auc:  0.6809
Epoch 00001: val_auc improved from -inf to 0.68093, saving model to ../checkpoints/DeepFM/model.ckpt
Epoch 2/10
58s - loss:  0.4996 - auc:  0.8360 - val_auc:  0.6820
Epoch 00002: val_auc improved from 0.68093 to 0.68203, saving model to ../checkpoints/DeepFM/model.ckpt
Epoch 3/10
59s - loss:  0.4584 - auc:  0.8643 - val_auc:  0.6763
Epoch 00003: val_auc did not improve from 0.68203
Epoch 4/10
60s - loss:  0.4225 - auc:  0.8858 - val_auc:  0.6697
Epoch 00004: val_auc did not improve from 0.68203
Epoch 00004: early stopping


In [14]:
mdckpt.best

0.6820330033615547

In [75]:
fixlen_feature_columns = [
    SparseFeat(feat, len(full_df[feat].unique()), embedding_dim=embed_dim)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

In [76]:
varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat(feat,
                                vocabulary_size=len(item2idx[feat]),
                                embedding_dim=embed_dim),
                     maxlen=max(full_df[feat].apply(len)),
                     combiner='mean') for feat in varlen_features
]

In [77]:
linear_feature_columns = fixlen_feature_columns \
            + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [78]:
tr_model_input = {name: tr_song_msno_df[name] for name in feature_names}
val_model_input = {name: val_song_msno_df[name] for name in feature_names}
ts_model_input = {name: ts_song_msno_df[name] for name in feature_names}

for feat in varlen_features:
    tr_model_input[feat] = pad_sequences(tr_song_msno_df[feat],
                                         maxlen=max(full_df[feat].apply(len)),
                                         padding='post')
    val_model_input[feat] = pad_sequences(val_song_msno_df[feat],
                                          maxlen=max(full_df[feat].apply(len)),
                                          padding='post')
    ts_model_input[feat] = pad_sequences(ts_song_msno_df[feat],
                                         maxlen=max(full_df[feat].apply(len)),
                                         padding='post')

In [6]:
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5,
               dnn_dropout=0.3,
               l2_reg_dnn=1e-5,
               dnn_use_bn=True,
               device=DEVICE)

In [7]:
model.compile(
    optimizer="adam",
    # optimizer=optim.RMSprop(model.parameters(), lr=1e-3),
    loss="binary_crossentropy",
    metrics=['auc'])

In [8]:
history = model.fit(tr_model_input,
                    tr_song_msno_df['target'].values,
                    batch_size=8192,
                    epochs=1,
                    verbose=2,
                    validation_data=(val_model_input,
                                     val_song_msno_df['target'].values),
                    shuffle=True)

cuda
Train on 5902311 samples, validate on 1475571 samples, 721 steps per epoch
Epoch 1/1
62s - loss:  0.5563 - auc:  0.7837 - val_auc:  0.6790


In [None]:
# Best 0.6834
sparse_features = [
    'msno',
    'song_id',
    'source_system_tab',
    'source_screen_name',
    'source_type',
    # 'composer',
    # 'lyricist',
    'language',
    'is_featured',
    'city',
    'registered_via',
    'gender',
]
dense_features = [
    'song_length',
    # 'num_artist',
    # 'num_composer',
    # 'num_lyricist',
    'bd',
    'registration_init_time',
    'expiration_date',
    # 'duration',
]
varlen_features = [
    'genre_ids',
    'artist_name',
]

In [6]:
model_ls = []
model_fp = '../checkpoints/DeepFM'
for model in os.listdir(model_fp):
    if model.endswith('ckpt'):
        model_ls.append(torch.load(os.path.join(model_fp, model)))

In [10]:
def get_prediction(model_input):

    pred_dict = {}
    for i, model in enumerate(model_ls):
        pred_val = model.predict(model_input, batch_size=8192).reshape(-1)
        pred_dict[f"pred{i}"] = pred_val

    return pred_dict

In [11]:
tr_pred_dict = get_prediction(tr_model_input)
val_pred_dict = get_prediction(val_model_input)
ts_pred_dict = get_prediction(ts_model_input)

In [14]:
tr_pred_stack = np.stack(list(tr_pred_dict.values()), axis=1)
val_pred_stack = np.stack(list(val_pred_dict.values()), axis=1)

In [25]:
meta_model = LogisticRegression()
meta_model.fit(val_pred_stack, val_song_msno_df['target'])

In [26]:
prediction = meta_model.predict_proba(val_pred_stack)

In [27]:
roc_auc_score(val_song_msno_df['target'], prediction[:, 1])

0.6879797211683798

In [65]:
pred_ts = np.sum(list(pred_dict.values()), axis=0) / len(pred_dict)

In [77]:
ts_song_msno_df['target'] = pred_ts

In [78]:
output_df = ts_song_msno_df[['id', 'target']].drop_duplicates('id')

In [81]:
output_df.to_csv('test_prediction.csv', index=False, encoding='utf-8')

In [8]:
pd.DataFrame(pred_dict).corr()

Unnamed: 0,pred0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15
pred0,1.0,0.92395,0.887886,0.910228,0.919125,0.928767,0.916705,0.928943,0.915842,0.93422,0.93225,0.936022,0.938217,0.914521,0.927039,0.94096
pred1,0.92395,1.0,0.882249,0.913638,0.912756,0.935148,0.910476,0.932002,0.911478,0.923777,0.924025,0.929325,0.93007,0.898999,0.931192,0.922771
pred2,0.887886,0.882249,1.0,0.922365,0.924563,0.882732,0.925386,0.883554,0.923988,0.891195,0.882182,0.88368,0.884599,0.910313,0.8773,0.883791
pred3,0.910228,0.913638,0.922365,1.0,0.945177,0.917672,0.945584,0.915266,0.945189,0.910808,0.90609,0.906232,0.905534,0.931195,0.913895,0.91064
pred4,0.919125,0.912756,0.924563,0.945177,1.0,0.915142,0.950613,0.915165,0.949154,0.927371,0.916619,0.919545,0.917467,0.938479,0.914655,0.919246
pred5,0.928767,0.935148,0.882732,0.917672,0.915142,1.0,0.914367,0.941637,0.911621,0.926018,0.923274,0.921647,0.925454,0.899524,0.933576,0.931777
pred6,0.916705,0.910476,0.925386,0.945584,0.950613,0.914367,1.0,0.915383,0.948048,0.92404,0.911979,0.91551,0.914993,0.935914,0.913513,0.919866
pred7,0.928943,0.932002,0.883554,0.915266,0.915165,0.941637,0.915383,1.0,0.913324,0.92788,0.924361,0.925214,0.926504,0.901631,0.934167,0.935037
pred8,0.915842,0.911478,0.923988,0.945189,0.949154,0.911621,0.948048,0.913324,1.0,0.916979,0.910356,0.91646,0.912442,0.935406,0.910429,0.915969
pred9,0.93422,0.923777,0.891195,0.910808,0.927371,0.926018,0.92404,0.92788,0.916979,1.0,0.932475,0.93473,0.935523,0.91127,0.92533,0.938716


In [9]:
auc_dict = {}
for k, v in pred_dict.items():
    auc_dict[k] = roc_auc_score(val_song_msno_df['target'], v)
auc_dict['bagging'] = roc_auc_score(
    val_song_msno_df['target'],
    np.sum(list(pred_dict.values()), axis=0) / len(pred_dict))
auc_dict

{'pred0': 0.681809666577856,
 'pred1': 0.6805578354020632,
 'pred2': 0.6762644148157834,
 'pred3': 0.6806454624686432,
 'pred4': 0.6814876293453834,
 'pred5': 0.6807251935020673,
 'pred6': 0.6818436672593257,
 'pred7': 0.6804328971024493,
 'pred8': 0.6803450341404652,
 'pred9': 0.6804955639062039,
 'pred10': 0.6807305193884721,
 'pred11': 0.6815401223039594,
 'pred12': 0.6833899374850628,
 'pred13': 0.6801770102061063,
 'pred14': 0.6803899605533248,
 'pred15': 0.6820146260052102,
 'bagging': 0.6876708988939828}

In [23]:
model1.evaluate(val_model_input, val_song_msno_df['target'], batch_size=8192)

{'auc': 0.6833899374850628}