In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import xgboost as xgb

In [3]:
data_path = '../data/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs.csv')
members = pd.read_csv(data_path + 'members.csv')

song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
members['duration'] = (members['expiration_year'] - members['registration_year']) * 366\
                    + (members['expiration_month'] - members['registration_month']) * 31\
                    + (members['expiration_date'] - members['registration_date'])
member = members.drop(['registration_year', 'expiration_year'], axis=1)

members_cols = members.columns
train = train.merge(members[members_cols], on='msno', how='left')
test = test.merge(members[members_cols], on='msno', how='left')

train = train.fillna(-1)
test = test.fillna(-1)

# Preprocess dataset
cols = list(train.columns)
cols.remove('target')

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

        print(col + ': ' + str(len(train_vals)) + ', ' + str(len(test_vals)))

print(train.head())
print(test.head())

X = np.array(train.drop(['target'], axis=1))
y = train['target'].values

X_test = np.array(test.drop(['id'], axis=1))
ids = test['id'].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, \
    test_size=0.2, random_state=0)

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid) 
d_test = xgb.DMatrix(X_test)

eval_set = [(X_train, y_train), (X_valid, y_valid)]
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train model, evaluate and make predictions
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.75
params['max_depth'] = 16
params['silent'] = 1
params['eval_metric'] = 'auc'

model = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=20, \
    maximize=True, verbose_eval=5)

p_test = model.predict(d_test)

# Prepare submission
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test
print(len(ids), len(p_test))
subm.to_csv('submission.csv', index=False)

  5%|▍         | 1/21 [00:24<08:14, 24.71s/it]

msno: 30755, 25131


 10%|▉         | 2/21 [01:24<13:22, 42.23s/it]

song_id: 359966, 224753


 14%|█▍        | 3/21 [01:42<10:15, 34.18s/it]

source_system_tab: 10, 10


 19%|█▉        | 4/21 [02:03<08:46, 30.95s/it]

source_screen_name: 21, 23


 24%|██▍       | 5/21 [02:23<07:38, 28.65s/it]

source_type: 13, 13


 29%|██▊       | 6/21 [02:53<07:13, 28.92s/it]

artist_name: 40583, 27564


 33%|███▎      | 7/21 [03:11<06:22, 27.34s/it]

genre_ids: 573, 502


100%|██████████| 21/21 [03:24<00:00,  9.74s/it]

gender: 3, 3
    msno  song_id  source_system_tab  source_screen_name  source_type  target  \
0   9176    86884                  2                   8            7       1   
1  19273   260594                  4                   9            5       1   
2  19273   140755                  4                   9            5       1   
3  19273    27577                  4                   9            5       1   
4   9176    38706                  2                   8            7       1   

   artist_name  genre_ids  song_length  language    ...     gender  \
0         3785        308     206471.0      52.0    ...          0   
1        36868         98     284584.0      52.0    ...          1   
2        24602         98     225396.0      52.0    ...          1   
3        31652          7     255512.0      -1.0    ...          1   
4         5191          3     187802.0      52.0    ...          0   

   registered_via  registration_init_time  expiration_date  registration_year  




[0]	train-auc:0.723997	valid-auc:0.709273
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[5]	train-auc:0.798951	valid-auc:0.757861
[10]	train-auc:0.848383	valid-auc:0.789145
[15]	train-auc:0.86581	valid-auc:0.791084
[20]	train-auc:0.884339	valid-auc:0.795012
[25]	train-auc:0.898766	valid-auc:0.795542
[30]	train-auc:0.909542	valid-auc:0.796093
[35]	train-auc:0.919392	valid-auc:0.795387
[40]	train-auc:0.927595	valid-auc:0.795019
[45]	train-auc:0.93485	valid-auc:0.793741
[50]	train-auc:0.941913	valid-auc:0.793438
Stopping. Best iteration:
[31]	train-auc:0.911176	valid-auc:0.796266

2556790 2556790
