In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import xgboost as xgb

import pickle



In [2]:
data_path = '../data/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs1.csv')
members = pd.read_csv(data_path + 'members.csv')

song_cols = ['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',\
            'lyricist', 'language', 'song_name', 'song_country', 'song_year']

train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
member = members.drop(['registration_year', 'expiration_year'], axis=1)

members_cols = members.columns
train = train.merge(members[members_cols], on='msno', how='left')
test = test.merge(members[members_cols], on='msno', how='left')

train = train.fillna(-1)
test = test.fillna(-1)

In [13]:
for x in list(songs):
    print (x)

id
song_id
song_length
genre_ids
artist_name
composer
lyricist
language
song_name
isrc
song_country
song_year


In [15]:
train['artist_name'][:4]

0           Bastille
1    Various Artists
2                Nas
3           Soundway
Name: artist_name, dtype: object

In [3]:
#<class 'pandas.core.series.Series'> object
#pandas.core.frame.DataFrame

In [3]:
# Preprocess dataset
cols = list(train.columns)
cols.remove('target')

for col in tqdm(cols):
    try:
        if train[col].dtype == 'object':
            train[col] = train[col].apply(str)
            test[col] = test[col].apply(str)

            le = LabelEncoder()
            train_vals = list(train[col].unique())
            test_vals = list(test[col].unique())
            le.fit(train_vals + test_vals)
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

            print(col + ': ' + str(len(train_vals)) + ', ' + str(len(test_vals)))
    except:
        print (col, type(train[col]))
        break



  4%|▍         | 1/25 [00:23<09:26, 23.59s/it]

msno: 30755, 25131


  8%|▊         | 2/25 [01:23<15:57, 41.63s/it]

song_id: 359966, 224753


 12%|█▏        | 3/25 [01:40<12:20, 33.67s/it]

source_system_tab: 10, 10


 16%|█▌        | 4/25 [02:00<10:31, 30.08s/it]

source_screen_name: 21, 23


 20%|██        | 5/25 [02:21<09:26, 28.31s/it]

source_type: 13, 13


 28%|██▊       | 7/25 [02:41<06:56, 23.13s/it]

genre_ids: 573, 502


 32%|███▏      | 8/25 [03:32<07:32, 26.60s/it]

artist_name: 40583, 27564


 36%|███▌      | 9/25 [05:20<09:30, 35.65s/it]

composer: 76065, 52307


 40%|████      | 10/25 [05:57<08:55, 35.73s/it]

lyricist: 33890, 24911


 48%|████▊     | 12/25 [07:52<08:31, 39.37s/it]

song_name: 234112, 154708


 52%|█████▏    | 13/25 [08:12<07:34, 37.90s/it]

song_country: 110, 97


100%|██████████| 25/25 [08:27<00:00, 20.30s/it]

gender: 3, 3





In [6]:
print (train.head())
    

    msno  song_id  source_system_tab  source_screen_name  source_type  target  \
0   9176    86884                  2                   8            7       1   
1  19273   260594                  4                   9            5       1   
2  19273   140755                  4                   9            5       1   
3  19273    27577                  4                   9            5       1   
4   9176    38706                  2                   8            7       1   

   song_length  genre_ids  artist_name  composer        ...         bd  \
0     206471.0        308         3785     16654        ...          0   
1     284584.0         98        36868        71        ...         24   
2     225396.0         98        24602     51541        ...         24   
3     255512.0          7        31652     41992        ...         24   
4     187802.0          3         5191      9702        ...          0   

   gender  registered_via  registration_init_time  expiration_date  

In [19]:
print(train.head())
print(test.head())

X = np.array(train.drop(['target'], axis=1))
y = train['target'].values

X_test = np.array(test.drop(['id'], axis=1))
ids = test['id'].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, \
    test_size=0.2, random_state=0)


d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid) 
d_test = xgb.DMatrix(X_test)

eval_set = [(X_train, y_train), (X_valid, y_valid)]
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    msno  song_id  source_system_tab  source_screen_name  source_type  target  \
0   9176    86884                  2                   8            7       1   
1  19273   260594                  4                   9            5       1   
2  19273   140755                  4                   9            5       1   
3  19273    27577                  4                   9            5       1   
4   9176    38706                  2                   8            7       1   

   song_length  genre_ids  artist_name  composer        ...         bd  \
0     206471.0        308         3785     16654        ...          0   
1     284584.0         98        36868        71        ...         24   
2     225396.0         98        24602     51541        ...         24   
3     255512.0          7        31652     41992        ...         24   
4     187802.0          3         5191      9702        ...          0   

   gender  registered_via  registration_init_time  expiration_date  

In [None]:
for x in list(train)
    print (x)

In [20]:
# Train model, evaluate and make predictions
def model(eta, max_depth):
    params = {}
    params['objective'] = 'binary:logistic'
    params['eta'] = eta #0.7
    params['max_depth'] = max_depth #5
    params['silent'] = 1
    params['eval_metric'] = 'auc'
    params['subsample'] = 1.0 #1.0
    params['min_child_weight'] = 5 #5
    params['colsample_bytree'] = 0.2 # 0.2
    evals_result = {}
    
    model = xgb.train(params, d_train, 105, watchlist, early_stopping_rounds=20, \
        maximize=True, verbose_eval=10, evals_result = evals_result)

    p_test = model.predict(d_test)
    return [p_test, [evals_result['train']['auc'][-1], evals_result['valid']['auc'][-1]]]

The history saving thread hit an unexpected error (OperationalError('unable to open database file',)).History will not be written to the database.


In [58]:
etas = [0.01 * (2 ** x) for x in range(1, 8)] # 0.01 to 1.28
subsamples = [0.1 * x for x in range(5, 11)]
min_child_weights = [ x for x in range(2, 10)] 
colsample_bytrees = [0.1 * x for x in range(1, 11)]

In [91]:
p_tests = []
weights = []
for max_depth in range(15, 21):
    for eta in [0.3]:
        p_test, res = model(eta, max_depth)
        with open('%s_%s.pickle' % (eta*10, max_depth),\
                                      'wb')as handle:
            pickle.dump(p_test, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open('%s_%s_res.pickle' % (eta*10, max_depth),\
                                      'wb')as handle:
            pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)
        p_tests += p_test,
        weights += res,

[0]	train-auc:0.672375	valid-auc:0.669357
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.739736	valid-auc:0.723105
[20]	train-auc:0.770706	valid-auc:0.746783
[30]	train-auc:0.798802	valid-auc:0.767644
[40]	train-auc:0.810921	valid-auc:0.773276
[50]	train-auc:0.823404	valid-auc:0.77987
[60]	train-auc:0.834247	valid-auc:0.785858
[70]	train-auc:0.843202	valid-auc:0.790514
[80]	train-auc:0.850653	valid-auc:0.79307
[90]	train-auc:0.857859	valid-auc:0.795769
[100]	train-auc:0.86311	valid-auc:0.79654
[0]	train-auc:0.67369	valid-auc:0.67017
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.749925	valid-auc:0.728443
[20]	train-auc:0.783072	valid-auc:0.752503
[30]	train-auc:0.813068	valid-auc:0.773582
[40]	train-auc:0.825832	valid-auc:0.77869
[50]	train-auc:0.839455	valid-auc:

In [92]:
weights = [x[1] for x in weights]
print (weights)

[0.79709, 0.801606, 0.804713, 0.805065, 0.806413, 0.806967]


In [93]:
total = np.sum(weights)
print (total)
result = (np.array(p_tests).T * np.array(weights)).T

4.821854


In [94]:
print (np.shape(result))
mat = np.sum(np.matrix(result), axis = 0)
p_test = [x / total for x in (mat.tolist()[0])]
print (np.shape(p_test))

(6, 2556790)
(2556790,)


In [95]:
for x in p_test[:10]:
    print (x)

0.575256256761
0.594704083322
0.0494060050799
0.164167855608
0.171569517369
0.0749725261829
0.172234587915
0.756745259223
0.0690728427169
0.755896027506


In [96]:
# with open('p_tests.pickle', 'wb') as handle:
#     pickle.dump(p_tests, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('weights.pickle', 'wb') as handle:
#     pickle.dump(weights, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [97]:
print (np.sum(p_test))

1124953.3455


In [98]:
# Prepare submission
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test
print(len(ids), len(p_test))
subm.to_csv('../submissions/submission.csv', index=False)

2556790 2556790
