In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import xgboost as xgb



In [3]:
data_path = '../data/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs.csv')
members = pd.read_csv(data_path + 'members.csv')

song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
member = members.drop(['registration_year', 'expiration_year'], axis=1)

members_cols = members.columns
train = train.merge(members[members_cols], on='msno', how='left')
test = test.merge(members[members_cols], on='msno', how='left')

train = train.fillna(-1)
test = test.fillna(-1)

# Preprocess dataset
cols = list(train.columns)
cols.remove('target')

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

        print(col + ': ' + str(len(train_vals)) + ', ' + str(len(test_vals)))

print(train.head())
print(test.head())

X = np.array(train.drop(['target'], axis=1))
y = train['target'].values

X_test = np.array(test.drop(['id'], axis=1))
ids = test['id'].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, \
    test_size=0.2, random_state=0)

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid) 
d_test = xgb.DMatrix(X_test)

eval_set = [(X_train, y_train), (X_valid, y_valid)]
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train model, evaluate and make predictions
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.7
params['max_depth'] = 10
params['silent'] = 1
params['eval_metric'] = 'auc'

model = xgb.train(params, d_train, 105, watchlist, early_stopping_rounds=20, \
    maximize=True, verbose_eval=10)

p_test = model.predict(d_test)

# Prepare submission
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test
print(len(ids), len(p_test))
subm.to_csv('submission.csv', index=False)

  5%|▌         | 1/20 [00:42<13:19, 42.07s/it]

msno: 30755, 25131


 10%|█         | 2/20 [02:15<20:18, 67.71s/it]

song_id: 359966, 224753


 15%|█▌        | 3/20 [02:42<15:18, 54.00s/it]

source_system_tab: 10, 10


 20%|██        | 4/20 [03:12<12:48, 48.06s/it]

source_screen_name: 21, 23


 25%|██▌       | 5/20 [03:41<11:03, 44.21s/it]

source_type: 13, 13


 30%|███       | 6/20 [04:32<10:36, 45.43s/it]

artist_name: 40583, 27564


 35%|███▌      | 7/20 [04:59<09:16, 42.78s/it]

genre_ids: 573, 502


100%|██████████| 20/20 [05:19<00:00, 15.99s/it]

gender: 3, 3
    msno  song_id  source_system_tab  source_screen_name  source_type  target  \
0   9176    86884                  2                   8            7       1   
1  19273   260594                  4                   9            5       1   
2  19273   140755                  4                   9            5       1   
3  19273    27577                  4                   9            5       1   
4   9176    38706                  2                   8            7       1   

   artist_name  genre_ids  song_length  language        ...         bd  \
0         3785        308     206471.0      52.0        ...          0   
1        36868         98     284584.0      52.0        ...         24   
2        24602         98     225396.0      52.0        ...         24   
3        31652          7     255512.0      -1.0        ...         24   
4         5191          3     187802.0      52.0        ...          0   

   gender  registered_via  registration_init_time  expi




[0]	train-auc:0.677461	valid-auc:0.676231
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.729333	valid-auc:0.723042
[20]	train-auc:0.755395	valid-auc:0.745264
[30]	train-auc:0.772012	valid-auc:0.758588
[40]	train-auc:0.784599	valid-auc:0.768582
[50]	train-auc:0.792818	valid-auc:0.773607
[60]	train-auc:0.800681	valid-auc:0.778979
[70]	train-auc:0.80708	valid-auc:0.783057
[80]	train-auc:0.81297	valid-auc:0.78659
[90]	train-auc:0.818247	valid-auc:0.789474
[100]	train-auc:0.822554	valid-auc:0.791283
2556790 2556790
