In [5]:
#引用
import numpy as np
import pandas as pd
import lightgbm as lgb

print('Loading data...')
data_path = './data/'
train = pd.read_csv(data_path + 'train.csv', dtype={'msno' : 'category',
                                                'source_system_tab' : 'category',
                                                  'source_screen_name' : 'category',
                                                  'source_type' : 'category',
                                                  'target' : np.uint8,
                                                  'song_id' : 'category'})
test = pd.read_csv(data_path + 'test.csv', dtype={'msno' : 'category',
                                                'source_system_tab' : 'category',
                                                'source_screen_name' : 'category',
                                                'source_type' : 'category',
                                                'song_id' : 'category'})
songs = pd.read_csv(data_path + 'songs.csv',dtype={'genre_ids': 'category',
                                                  'language' : 'category',
                                                  'artist_name' : 'category',
                                                  'composer' : 'category',
                                                  'lyricist' : 'category',
                                                  'song_id' : 'category'})
members = pd.read_csv(data_path + 'members.csv',dtype={'city' : 'category',
                                                      'bd' : np.uint8,
                                                      'gender' : 'category',
                                                      'registered_via' : 'category'})
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')

#預處理
print('Data preprocessing...')
song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
members = members.drop(['registration_init_time'], axis=1)

#計算歌曲年份
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan
        
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

train = train.merge(songs_extra, on = 'song_id', how = 'left')
test = test.merge(songs_extra, on = 'song_id', how = 'left')

#Garbage collection 還不清楚
import gc
del members, songs; gc.collect();

for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

X = train.drop(['target'], axis=1)
y = train['target'].values

X_test = test.drop(['id'], axis=1)
ids = test['id'].values

del train, test; gc.collect();

d_train = lgb.Dataset(X, y)
watchlist = [d_train]

#模型參數 可以自己調一下
#light gbm基於決策樹
print('Training LGBM model...')
params = {}
params['learning_rates']=0.2
params['application'] = 'binary'
params['max_depth'] = 8          #最多長幾層 太深容易overfitting
params['num_leaves'] = 2**8      #樹的葉子數量
params['verbosity'] = 0
params['metric'] = 'auc'

model = lgb.train(params, train_set=d_train, num_boost_round=200, valid_sets=watchlist, verbose_eval=5)

#預測
print('Making predictions and saving them...')
p_test = model.predict(X_test)

#輸出
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test
subm.to_csv('submission.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')
print('Done!')

Loading data...
Data preprocessing...
Training LGBM model...
[5]	training's auc: 0.695636
[10]	training's auc: 0.71049
[15]	training's auc: 0.7176
[20]	training's auc: 0.722873
[25]	training's auc: 0.726978
[30]	training's auc: 0.730433
[35]	training's auc: 0.733264
[40]	training's auc: 0.735801
[45]	training's auc: 0.737898
[50]	training's auc: 0.739522
[55]	training's auc: 0.740636
[60]	training's auc: 0.7424
[65]	training's auc: 0.743485
[70]	training's auc: 0.744442
[75]	training's auc: 0.745609
[80]	training's auc: 0.746808
[85]	training's auc: 0.747932
[90]	training's auc: 0.749114
[95]	training's auc: 0.750269
[100]	training's auc: 0.75177
[105]	training's auc: 0.752671
[110]	training's auc: 0.754021
[115]	training's auc: 0.754611
[120]	training's auc: 0.755571
[125]	training's auc: 0.7563
[130]	training's auc: 0.757008
[135]	training's auc: 0.757725
[140]	training's auc: 0.758469
[145]	training's auc: 0.759108
[150]	training's auc: 0.759756
[155]	training's auc: 0.760413
[160]	