In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv', dtype={'msno': 'category',
                                        'source_system_tab': 'category',
                                        'source_screen_name': 'category',
                                        'source_type': 'category',
                                        'target': np.uint8,
                                        'song_id': 'category'})
test = pd.read_csv('test.csv', dtype={'msno': 'category',
                                        'source_system_tab': 'category',
                                        'source_screen_name': 'category',
                                        'source_type': 'category',
                                        'song_id': 'category'})
songs = pd.read_csv('songs.csv', dtype={'genre_ids': 'category',
                                        'language': 'category',
                                        'artist_name': 'category',
                                        'composer': 'category',
                                        'lyricist': 'category',
                                        'song_id': 'category'})
members = pd.read_csv('members.csv',dtype={'city' : 'category',
                                        'bd' : np.uint8,
                                        'gender' : 'category',
                                        'registered_via' : 'category'})
songs_extra = pd.read_csv('song_extra_info.csv')

In [3]:
train.isnull().sum()

msno                       0
song_id                    0
source_system_tab      24849
source_screen_name    414804
source_type            21539
target                     0
dtype: int64

In [4]:
members.isnull().sum()

msno                          0
city                          0
bd                            0
gender                    19902
registered_via                0
registration_init_time        0
expiration_date               0
dtype: int64

In [5]:
songs.isnull().sum()

song_id              0
song_length          0
genre_ids        94116
artist_name          0
composer       1071354
lyricist       1945268
language             1
dtype: int64

In [8]:
songs_extra.isnull().sum()

song_id         0
name            2
isrc       136548
dtype: int64

In [9]:
song_cols = ['song_id','artist_name','genre_ids','song_length','language']
train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

In [10]:
members['registration_init_time'] = pd.to_datetime(members['registration_init_time'],format='%Y%m%d')

members['registration_year'] = members['registration_init_time'].dt.year
members['registration_month'] = members['registration_init_time'].dt.month
members['registration_day'] = members['registration_init_time'].dt.day

In [11]:
members['expiration_date'] = pd.to_datetime(members['expiration_date'],format='%Y%m%d')

members['expiration_year'] = members['expiration_date'].dt.year
members['expiration_month'] = members['expiration_date'].dt.month
members['expiration_date'] = members['expiration_date'].dt.day

In [12]:
members.drop(['registration_init_time','expiration_date'],axis=1,inplace=True)

In [13]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        np.nan

In [14]:
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc','name'], axis=1, inplace=True)

In [15]:
train = train.merge(members,on='msno',how='left')
test = test.merge(members,on='msno',how='left')

In [16]:
train = train.merge(songs_extra,on='song_id',how='left')
test = test.merge(songs_extra,on='song_id',how='left')

In [17]:
import gc
del members,songs; gc.collect();

In [18]:
for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [19]:
X = train.drop(['target'],axis=1)
y = train['target'].values

In [20]:
X_test = test.drop(['id'],axis=1)
ids = test['id'].values

In [21]:
del train, test; gc.collect();

In [22]:
import lightgbm as lgb

In [23]:
d_train = lgb.Dataset(X, y)
watchlist = [d_train]

In [24]:
params = {}
params['learning_rate'] = 0.2
params['application'] = 'binary'
params['max_depth'] = 8
params['num_leaves'] = 2**8
params['verbosity'] = 0
params['metric'] = 'auc'

In [25]:
model = lgb.train(params,train_set=d_train,num_boost_round=50,valid_sets=watchlist)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [26]:
p_test = model.predict(X_test)

In [27]:
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test

In [28]:
subm

Unnamed: 0,id,target
0,0,0.654274
1,1,0.632100
2,2,0.229069
3,3,0.182519
4,4,0.185155
...,...,...
2556785,2556785,0.208441
2556786,2556786,0.440084
2556787,2556787,0.440084
2556788,2556788,0.415723


In [29]:
subm.to_csv('submission.csv',index=False,float_format='%.5f')