In [7]:
# The line below sets the environment
# variable CUDA_VISIBLE_DEVICES
get_ipython().magic('env CUDA_VISIBLE_DEVICES =  ')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path
import random
import time
from collections import OrderedDict
import io
from datetime import datetime
import gc # garbage collector
import sklearn
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import math
import sys
from collections import defaultdict
import re
import logging
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
get_ipython().magic('matplotlib inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')

env: CUDA_VISIBLE_DEVICES=
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Write a pandas DataFrame to disk as a gzip-compressed CSV
- df.to_csv('dfsavename.csv.gz', compression='gzip')

## Read from disk
- df = pd.read_csv('dfsavename.csv.gz', compression='gzip')

## Useful magics
- %%timeit for the whole cell
- %timeit for the specific line
- %%latex to render the cell as a block of latex
- %prun and %%prun

In [8]:
# Root directory of the dataset; every path below is derived from it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/WSDM/'
# HDF5 store read later in this notebook (keys 'train_data', 'test_data', 'test_id').
HDF_FILENAME = DATASET_PATH + 'datas.h5'
# Submission file name template; presumably '{}' is filled in with str.format — confirm at the call site.
SUBMISSION_FILENAME = DATASET_PATH + 'submission_{}.csv'
# Opened with pd.HDFStore further down, despite the '.csv' extension.
VALIDATION_INDICE = DATASET_PATH + 'validation_indice.csv'

In [9]:
def set_logging(logger_name, logger_file_name):
    """Return a DEBUG-level logger that writes to a file and to the console.

    Parameters
    ----------
    logger_name : str
        Name passed to ``logging.getLogger``.
    logger_file_name : str
        Path of the log file. It is opened with ``mode='w'``, so an existing
        file is truncated.

    Returns
    -------
    logging.Logger
        The configured logger, carrying exactly one file handler (timestamped
        format) and one stream handler (bare message).
    """
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for a given name, so calling
    # this function again (e.g. re-running the notebook cell) would stack a
    # second pair of handlers and emit every message twice. Drop and close any
    # handlers left over from a previous call first.
    for stale_handler in list(log.handlers):
        log.removeHandler(stale_handler)
        stale_handler.close()

    # create formatter and add it to the handlers
    print_formatter = logging.Formatter('%(message)s')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s_%(levelname)s: %(message)s')

    # create file handler which logs even debug messages
    fh = logging.FileHandler(logger_file_name, mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(file_formatter)
    log.addHandler(fh)
    # both output to console and file
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(print_formatter)
    log.addHandler(consoleHandler)

    return log

In [10]:
# Create the notebook-wide logger; its log file is written under DATASET_PATH.
log = set_logging('MUSIC', DATASET_PATH + 'music_gbm.log')
log.info('here is an info message.')

here is an info message.
here is an info message.


In [11]:
# TRAIN_FILE = DATASET_PATH + 'train.csv'
# TEST_FILE = DATASET_PATH + 'test.csv'
# MEMBER_FILE = DATASET_PATH + 'members.csv'
# SONG_FILE = DATASET_PATH + 'fix_songs.csv'
# SONG_EXTRA_FILE = DATASET_PATH + 'song_extra_info.csv'

# train_data = pd.read_csv(TRAIN_FILE)
# test_data = pd.read_csv(TEST_FILE)
# member_data = pd.read_csv(MEMBER_FILE)
# song_data = pd.read_csv(SONG_FILE)
# song_extra_data = pd.read_csv(SONG_EXTRA_FILE)

# songs_all = pd.merge(left = song_data, right = song_extra_data, how = 'left', on='song_id')
# train_with_mem = pd.merge(left = train_data, right = member_data, how = 'left', on='msno')
# train_all = pd.merge(left = train_with_mem, right = songs_all, how = 'left', on='song_id')
# test_with_mem = pd.merge(left = test_data, right = member_data, how = 'left', on='msno')
# test_all = pd.merge(left = test_with_mem, right = songs_all, how = 'left', on='song_id')
# del train_with_mem, test_with_mem; gc.collect()

# def convert_unicode_to_str(df):
#     df.columns = df.columns.astype(str)
#     types = df.apply(lambda x: pd.api.types.infer_dtype(df.values))
#     #print(types)#mixed-integer
#     for col in types[types == 'mixed-integer'].index:
#         df[col] = df[col].astype(str)
#     for col in types[types == 'mixed'].index:
#         df[col] = df[col].astype(str)
#     return df

# store = pd.HDFStore(HDF_FILENAME)
# store['train_data'] = convert_unicode_to_str(train_all)
# store['test_data'] = convert_unicode_to_str(test_all)
# store['song_data'] = convert_unicode_to_str(songs_all)
# store['test_id'] = test_data.id
# store.close()

In [12]:
# store_test = pd.HDFStore(HDF_FILENAME)
# train = store_test['train_data'][0:100]
# test = store_test['test_data'][0:100]
# test_id =  store_test['test_id'][0:100]
# store_test.close()
# Load the merged train/test frames and the test ids from the HDF5 store.
# The context manager closes the store even if a key lookup fails, and
# mode='r' prevents a missing file from being silently created empty.
with pd.HDFStore(HDF_FILENAME, mode='r') as store_test:
    train = store_test['train_data']
    test = store_test['test_data']
    test_id = store_test['test_id']

In [13]:
def split_country(input_data):
    """Derive a country code from the first two characters of each ISRC.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with an ``isrc`` column. Non-string values and the literal
        string ``'nan'`` are treated as missing.

    Returns
    -------
    pandas.Series
        Two-letter prefixes aligned with ``input_data``'s index, NaN where the
        ISRC is missing. The prefixes ``'QM'`` and ``'US'`` are folded into
        ``'QZ'``.
    """
    def get_country(isrc):
        if isinstance(isrc, str) and isrc != 'nan':
            return isrc[0:2]
        return np.nan

    # Bug fix: read from the frame that was passed in. The previous version
    # used the global ``train`` here, so split_country(test) returned
    # countries computed from the training rows.
    countries = input_data['isrc'].apply(get_country)

    # Identity mapping for every observed prefix, then merge the two aliases.
    # NOTE(review): presumably QM/QZ/US are all United States ISRC prefixes — confirm.
    country_map = {code: code for code in countries.dropna().unique()}
    country_map['QM'] = 'QZ'
    country_map['US'] = 'QZ'
    return countries.map(country_map)

In [14]:
# Add a 'country' column to both frames using split_country.
train['country'] = split_country(train)
test['country'] = split_country(test)

In [15]:
def isrc_to_year(isrc, pivot=17):
    """Convert the two-digit year embedded in an ISRC into a four-digit year.

    Parameters
    ----------
    isrc : object
        ISRC string whose characters 5-6 (0-based) hold the two-digit year.
        Non-string values and the literal string ``'nan'`` count as missing.
    pivot : int, optional
        Largest two-digit year still mapped to the 2000s; anything greater is
        mapped to the 1900s. Defaults to 17, the value previously hard-coded.

    Returns
    -------
    int or float
        The four-digit year, or ``np.nan`` when the ISRC is missing or its
        year field cannot be parsed as an integer.
    """
    if not isinstance(isrc, str) or isrc == 'nan':
        return np.nan
    try:
        # Parse once instead of three times; a short or non-numeric field
        # yields NaN rather than aborting the whole ``apply``.
        two_digit_year = int(isrc[5:7])
    except ValueError:
        return np.nan
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year
        
# Derive 'song_year' from the ISRC, then drop the raw 'isrc' column from both frames.
# NOTE: because of the in-place drop this cell is not idempotent — a second run
# fails with KeyError on 'isrc'.
train['song_year'] = train['isrc'].apply(isrc_to_year)
test['song_year'] = test['isrc'].apply(isrc_to_year)
train.drop(['isrc'], axis = 1, inplace = True)
test.drop(['isrc'], axis = 1, inplace = True)

In [16]:
def split_reg_date(input_data):
    """Split ``registration_init_time`` (YYYYMMDD) into year, month and day columns.

    The frame is modified in place and also returned. The new columns are
    ``registration_year``, ``registration_month`` and ``registration_day``,
    each downcast to the smallest unsigned integer dtype that fits.
    """
    # (column suffix, start, stop) character positions inside the YYYYMMDD text.
    pieces = (('year', 0, 4), ('month', 4, 6), ('day', 6, 8))
    for suffix, start, stop in pieces:
        target = 'registration_' + suffix
        input_data[target] = input_data['registration_init_time'].apply(
            lambda raw: int(str(raw)[start:stop])
        )
        input_data[target] = pd.to_numeric(input_data[target], downcast='unsigned')

    return input_data

In [17]:
def split_expir_date(input_data):
    """Split ``expiration_date`` (YYYYMMDD) into year, month and day columns.

    The frame is modified in place and also returned. The new columns are
    ``expiration_year``, ``expiration_month`` and ``expiration_day``, each
    downcast to the smallest unsigned integer dtype that fits.
    """
    def digits(start, stop):
        # Characters [start:stop) of each raw value, parsed as an integer.
        return input_data['expiration_date'].apply(lambda raw: int(str(raw)[start:stop]))

    input_data['expiration_year'] = pd.to_numeric(digits(0, 4), downcast='unsigned')
    input_data['expiration_month'] = pd.to_numeric(digits(4, 6), downcast='unsigned')
    input_data['expiration_day'] = pd.to_numeric(digits(6, 8), downcast='unsigned')

    return input_data

In [18]:
def date_to_day(input_data):
    """Add a ``days`` column: whole days between registration and expiration.

    ``registration_init_time`` and ``expiration_date`` are parsed as
    ``%Y%m%d`` and overwritten with datetimes; the frame is modified in place
    and also returned.
    """
    # Convert the registration and expiration dates to datetimes.
    input_data['registration_init_time'] = pd.to_datetime(input_data['registration_init_time'], format="%Y%m%d")
    input_data['expiration_date'] = pd.to_datetime(input_data['expiration_date'], format="%Y%m%d")

    # Vectorised day count; replaces a Python-level loop over every timedelta.
    membership_span = input_data['expiration_date'] - input_data['registration_init_time']
    input_data['days'] = membership_span.dt.days

    return input_data

In [19]:
# The split_* helpers slice the raw YYYYMMDD values, so they run before
# date_to_day overwrites those two columns with datetimes.
train = split_reg_date(train)
test = split_reg_date(test)
train = split_expir_date(train)
test = split_expir_date(test)

train = date_to_day(train)
test = date_to_day(test)

# The raw date columns are dropped once the derived features exist.
train.drop('registration_init_time',axis=1,inplace=True)
train.drop('expiration_date',axis=1,inplace=True)
test.drop('registration_init_time',axis=1,inplace=True)
test.drop('expiration_date',axis=1,inplace=True)

In [20]:
# Replace the string 'nan' with '235415' and convert song_length to an unsigned integer.
# NOTE(review): 235415 is presumably a typical song length used as the fill value — confirm its origin.
train['song_length'] = pd.to_numeric(train['song_length'].replace('nan', '235415'), downcast='unsigned')
test['song_length'] = pd.to_numeric(test['song_length'].replace('nan', '235415'), downcast='unsigned')

In [21]:
# Print the dtype of every training column.
for col in train.columns: print(col, ':', train[col].dtype)

msno : object
song_id : object
source_system_tab : object
source_screen_name : object
source_type : object
target : object
city : object
bd : object
gender : object
registered_via : object
song_length : uint32
genre_ids : object
artist_name : object
composer : object
lyricist : object
language : object
name : object
country : object
song_year : float64
registration_year : uint16
registration_month : uint8
registration_day : uint8
expiration_year : uint16
expiration_month : uint8
expiration_day : uint8
days : int64


In [22]:
# Convert every column that is object-typed in train to 'category' in both frames.
# Only columns present in test (other than 'id') are visited, so 'target' is left
# untouched if test has no such column.
# Each frame is converted on its own, so the category sets of train and test may differ.
for col in [col for col in test.columns if col != 'id' ]:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [23]:
# # encode registered_via, the less number of occurrences are merged into the top item which has the max number of occurrences
# registered_via_hist = pd.concat([train['registered_via'], test['registered_via']], axis = 0).value_counts()
# registered_via_map = dict(zip(registered_via_hist.index, [int(s) for s in registered_via_hist.index.values]))
# registered_via_map[registered_via_hist.index[-1]] = int(str(registered_via_hist.index.values[0]))
# train['registered_via'] = train['registered_via'].map(registered_via_map)
# test['registered_via'] = test['registered_via'].map(registered_via_map)

In [24]:
# # encode language, fill nan with most occurrences item
# language_hist = pd.concat([train['language'], test['language']], axis = 0).value_counts()
# language_map = dict(zip(language_hist.index, [int(float(s)) for s in language_hist.index.values if s != 'nan']))
# language_map['nan'] = int(float(str(language_hist.index.values[0])))
# train['language'] = train['language'].map(language_map)
# test['language'] = test['language'].map(language_map)

In [25]:
# # encode country, fill nan with most occurrences item
# country_hist = pd.concat([train['country'], test['country']], axis = 0).value_counts()
# merge_per = 0.25
# country_map = dict(zip(country_hist.index, list(range(len(country_hist)))))
# for key in list(country_hist[-int(len(country_hist)*merge_per):].index):
#     country_map[key] = int(len(country_hist)*(1-merge_per)) + 1
# train['country'] = train['country'].map(country_map)
# test['country'] = test['country'].map(country_map)

In [26]:
# msno : category ; uinque values: 30755
# song_id : category ; uinque values: 359966
# - source_system_tab : category ; uinque values: 10
# - source_screen_name : category ; uinque values: 21
# - source_type : category ; uinque values: 13
# - target : object ; uinque values: 2
# - city : category ; uinque values: 21
# - bd : category ; uinque values: 92
# - gender : category ; uinque values: 3
# - registered_via : category ; uinque values: 5
# song_length : uint32 ; uinque values: 60271
# genre_ids : category ; uinque values: 573
# artist_name : category ; uinque values: 40587
# composer : category ; uinque values: 76072
# lyricist : category ; uinque values: 33895
# - language : category ; uinque values: 11
# name : category ; uinque values: 234144
# - country : category ; uinque values: 107
# - song_year : float64 ; uinque values: 100
# - registration_year : uint16 ; uinque values: 14
# - registration_month : uint8 ; uinque values: 12
# - registration_date : uint8 ; uinque values: 31
# - expiration_year : uint16 ; uinque values: 18
# - expiration_month : uint8 ; uinque values: 12

In [27]:
def one_hot_transform(input_train_data, input_test_data, columns_to_transform):
    """Integer-encode the given columns consistently across train and test.

    Despite the name, no one-hot expansion happens: each listed column is
    replaced, in place, by the codes of a ``LabelEncoder`` fitted on the
    unique values of that column in both frames.

    Returns the two (mutated) frames as ``(train, test)``.
    """
    for column in columns_to_transform:
        # Fit on the values seen in either frame so both share one code table.
        observed_values = list(input_train_data[column].unique()) + list(input_test_data[column].unique())
        encoder = LabelEncoder()
        encoder.fit(observed_values)
        input_train_data[column] = encoder.transform(input_train_data[column])
        input_test_data[column] = encoder.transform(input_test_data[column])
    return input_train_data, input_test_data

In [28]:
#train, test = one_hot_transform(train, test, ['source_system_tab', 'source_screen_name', 'source_type', 'city', 'gender', 'name'])#, 'artist_name', 'composer', 'lyricist'])

In [29]:
# TODO: whether song_id should be merged like this or not? 231475 reserved and 188364 merged
def encode_with_merge(input_train, input_test, columns, merge_value):
    """Integer-encode columns, collapsing values of a given frequency into one code.

    Parameters
    ----------
    input_train, input_test : pandas.DataFrame
        Frames to encode; the listed columns are overwritten in place.
    columns : sequence of str
        Columns to encode.
    merge_value : sequence of int
        One entry per column. Values whose combined train+test occurrence
        count equals this number all receive the same shared code.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(input_train, input_test)`` after encoding. Values with any other
        count get the codes ``0..k-1`` in descending-frequency order, and the
        shared code is ``k + 1``.
    """
    for index, col in enumerate(columns):
        # Bug fix: count over train AND test. Concatenating train with itself
        # doubled every count (so a merge_value of 1 never matched) and left
        # test-only values without any code.
        values_hist = pd.concat([input_train[col], input_test[col]], axis = 0).value_counts()
        is_merged = values_hist == merge_value[index]
        reserve_rows = values_hist[~is_merged]
        merge_rows = values_hist[is_merged]

        reserve_dict = dict(zip(reserve_rows.index, range(len(reserve_rows))))
        merge_dict = dict.fromkeys(merge_rows.index, len(reserve_rows) + 1)

        map_dict = {**reserve_dict, **merge_dict}

        # Bug fix: a leftover statement that referenced the undefined names
        # ``language_map`` / ``language_hist`` was removed; it raised
        # NameError on every call.
        input_train[col] = input_train[col].map(map_dict)
        input_test[col] = input_test[col].map(map_dict)
    return input_train, input_test

In [30]:
#train, test = encode_with_merge(train, test, ['msno', 'song_id', 'genre_ids'], [1, 1, 1])
# print(train.head())
# print(test.head())

In [31]:
# Keep references (not copies) to the prepared frames so later cells can restart from them.
train_org, test_org = train, test

In [32]:
# Work on deep copies so the *_org frames stay untouched by the steps below.
train = train_org.copy(deep=True)
test = test_org.copy(deep=True)

In [33]:
# Load the stored validation indices. The context manager closes the store
# even if the key lookup fails; mode='r' avoids creating the file when it is missing.
# NOTE: validation_list is only used by the commented-out alternatives below.
with pd.HDFStore(VALIDATION_INDICE, mode='r') as store_test:
    validation_list = store_test['keep_index']['index'].values
train['target'] = pd.to_numeric(train['target'], downcast='signed')

# Hold out a fixed block of 100000 rows near the end of train for validation.
# The same list serves as positions for iloc and as labels for drop, which
# only agree while train still has its default 0..n-1 index.
VALIDATION_START, VALIDATION_STOP = 7277417, 7377417
validation_rows = list(range(VALIDATION_START, VALIDATION_STOP))
#validation_use = train.iloc[validation_list].copy(deep=True).reset_index(drop=True)
validation_use = train.iloc[validation_rows].copy(deep=True).reset_index(drop=True)
#train_use = train.drop(validation_list)
train_use = train.drop(validation_rows)
# train['target'] = pd.to_numeric(train['target'], downcast='signed')
# validation_use = train[50:].copy(deep=True).reset_index(drop=True)
# train_use = train.drop(list(range(50,100)))

In [34]:
# for col in train_use.columns: print(col, ':', train_use[col].dtype, '; uinque values:', len(train_use[col].value_counts()))

In [35]:
def log_transform(train_data, validation_data, test_data):
    """Replace ``song_length`` with ``log(song_length + 1)`` in all three frames.

    Each frame is modified in place; the same three objects are returned as
    ``(train_data, validation_data, test_data)``.
    """
    for frame in (train_data, validation_data, test_data):
        # Cast to the smallest float dtype first, then shift by one and take the log.
        length_as_float = pd.to_numeric(frame['song_length'], downcast='float')
        frame['song_length'] = np.log(length_as_float + 1)
    return train_data, validation_data, test_data
# Not idempotent: log_transform overwrites 'song_length', so re-running this
# line applies the logarithm a second time.
train_use, validation_use, test = log_transform(train_use, validation_use, test)

In [36]:
def cal_composer_hot_rate(train_data, val_data, test_data):
    """Add a composer popularity score and reduce ``composer`` to its first name.

    Composer strings from all three frames are split into individual names
    (separators: ``|``, ``/``, backslash and the ideographic comma ``、``).
    Each name's score is ``log(count + 1)`` over the first three names of
    every row in train + validation + test. The key ``'nan'`` gets the mean of
    all scores, computed with ``'nan'`` itself counted as 0.

    Parameters
    ----------
    train_data, val_data, test_data : pandas.DataFrame
        Frames with a ``composer`` column. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of ``(train_data, val_data, test_data)`` in which ``composer``
        holds only the first name and ``composer_score`` holds the highest
        score among the row's first three names. Both columns are appended at
        the end, in that order, and the original row index is preserved.
    """
    column = 'composer'
    max_names = 3
    # Same pattern as before, written as a raw string: optional whitespace
    # around '|', backslash or '/'.
    separator = r'\s*[|\\/]\s*'

    def split_names(frame):
        # One column per name (0..max_names-1), aligned with ``frame``'s index.
        normalized = frame[column].astype(str).str.replace(u'、', '|')
        parts = normalized.str.split(separator, n=max_names, expand=True)
        # Guarantee exactly ``max_names`` columns even when no row contains
        # that many names (previously a KeyError), and drop the unsplit
        # remainder column.
        return parts.reindex(columns=range(max_names)).astype(object)

    pooled = pd.concat(
        [train_data[[column]], val_data[[column]], test_data[[column]]],
        axis=0, join="outer", ignore_index=True,
    )
    pooled_names = split_names(pooled)
    every_name = pd.concat([pooled_names[position] for position in range(max_names)], axis=0)

    composer_hot = np.log(every_name.value_counts() + 1)
    #composer_hot = temp_data.value_counts()
    # Score for the 'nan' placeholder: zero it first so it does not inflate
    # the mean, then use the mean of all scores.
    composer_hot['nan'] = 0.
    composer_hot['nan'] = composer_hot.mean()

    def encoder_each(input_data, hot_hist):
        names = split_names(input_data)
        # Bug fix: look scores up with ``map`` so the result keeps the frame's
        # own index. The earlier merge-based lookup renumbered rows 0..n-1, so
        # assigning back misaligned (and dropped) values whenever the input
        # index had gaps, as the training frame does after the validation
        # rows are removed.
        scores = pd.concat(
            [names[position].map(hot_hist) for position in range(max_names)],
            axis=1,
        )
        encoded = input_data.drop(column, axis=1)
        encoded['composer_score'] = scores.max(axis=1)
        #df_temp['composer_score'] = df_temp['composer_0_score']
        encoded[column] = names[0]
        return encoded

    train_data = encoder_each(train_data, composer_hot)
    val_data = encoder_each(val_data, composer_hot)
    test_data = encoder_each(test_data, composer_hot)

    return train_data, val_data, test_data
    

In [37]:
# Replace 'composer' with its first name and add 'composer_score' to all three frames.
train_use, validation_use, test = cal_composer_hot_rate(train_use, validation_use, test)

In [38]:
def cal_lyricist_hot_rate(train_data, val_data, test_data):
    """Add a lyricist popularity score and reduce ``lyricist`` to its first name.

    Lyricist strings from all three frames are split into individual names
    (separators: ``|``, ``/``, backslash and the ideographic comma ``、``).
    Each name's score is ``log(count + 1)`` over the first three names of
    every row in train + validation + test. The key ``'nan'`` gets the mean of
    all scores, computed with ``'nan'`` itself counted as 0.

    Parameters
    ----------
    train_data, val_data, test_data : pandas.DataFrame
        Frames with a ``lyricist`` column. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of ``(train_data, val_data, test_data)`` in which ``lyricist``
        holds only the first name and ``lyricist_score`` holds that first
        name's score. Both columns are appended at the end, in that order,
        and the original row index is preserved.
    """
    column = 'lyricist'
    max_names = 3
    # Same pattern as before, written as a raw string: optional whitespace
    # around '|', backslash or '/'.
    separator = r'\s*[|\\/]\s*'

    def split_names(frame):
        # One column per name (0..max_names-1), aligned with ``frame``'s index.
        normalized = frame[column].astype(str).str.replace(u'、', '|')
        parts = normalized.str.split(separator, n=max_names, expand=True)
        # Guarantee exactly ``max_names`` columns even when no row contains
        # that many names (previously a KeyError).
        return parts.reindex(columns=range(max_names)).astype(object)

    pooled = pd.concat(
        [train_data[[column]], val_data[[column]], test_data[[column]]],
        axis=0, join="outer", ignore_index=True,
    )
    pooled_names = split_names(pooled)
    #temp_data = df_temp['lyricist_0']
    every_name = pd.concat([pooled_names[position] for position in range(max_names)], axis=0)

    lyricist_hot = np.log(every_name.value_counts() + 1)
    # Score for the 'nan' placeholder: zero it first so it does not inflate
    # the mean, then use the mean of all scores.
    lyricist_hot['nan'] = 0.
    lyricist_hot['nan'] = lyricist_hot.mean()

    def encoder_each(input_data, hot_hist):
        first_name = split_names(input_data)[0]
        encoded = input_data.drop(column, axis=1)
        # Bug fix: ``map`` keeps the frame's own index. The earlier
        # merge-based lookup renumbered rows 0..n-1, which misaligned the
        # assignment for frames whose index has gaps.
        encoded['lyricist_score'] = first_name.map(hot_hist)
        encoded[column] = first_name
        return encoded

    train_data = encoder_each(train_data, lyricist_hot)
    val_data = encoder_each(val_data, lyricist_hot)
    test_data = encoder_each(test_data, lyricist_hot)

    return train_data, val_data, test_data

In [39]:
# Replace 'lyricist' with its first name and add 'lyricist_score' to all three frames.
train_use, validation_use, test = cal_lyricist_hot_rate(train_use, validation_use, test)

In [40]:
def cal_artist_hot_rate(train_data, val_data, test_data):
    """Add an artist popularity score and reduce ``artist_name`` to its first name.

    Artist strings from all three frames are split into individual names
    (separators: ``|``, ``/``, backslash and the ideographic comma ``、``).
    Each name's score is ``log(count + 1)`` over the first three names of
    every row in train + validation + test. The key ``'nan'`` gets the mean of
    all scores, computed with ``'nan'`` itself counted as 0.

    Parameters
    ----------
    train_data, val_data, test_data : pandas.DataFrame
        Frames with an ``artist_name`` column. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of ``(train_data, val_data, test_data)`` in which
        ``artist_name`` holds only the first name and ``artist_name_score``
        holds that first name's score. Both columns are appended at the end,
        in that order, and the original row index is preserved.
    """
    column = 'artist_name'
    max_names = 3
    # Same pattern as before, written as a raw string: optional whitespace
    # around '|', backslash or '/'.
    separator = r'\s*[|\\/]\s*'

    def split_names(frame):
        # One column per name (0..max_names-1), aligned with ``frame``'s index.
        normalized = frame[column].astype(str).str.replace(u'、', '|')
        parts = normalized.str.split(separator, n=max_names, expand=True)
        # Guarantee exactly ``max_names`` columns even when no row contains
        # that many names (previously a KeyError).
        return parts.reindex(columns=range(max_names)).astype(object)

    pooled = pd.concat(
        [train_data[[column]], val_data[[column]], test_data[[column]]],
        axis=0, join="outer", ignore_index=True,
    )
    pooled_names = split_names(pooled)
    #temp_data = df_temp['artist_name_0']
    every_name = pd.concat([pooled_names[position] for position in range(max_names)], axis=0)

    artist_hot = np.log(every_name.value_counts() + 1)
    # Score for the 'nan' placeholder: zero it first so it does not inflate
    # the mean, then use the mean of all scores.
    artist_hot['nan'] = 0.
    artist_hot['nan'] = artist_hot.mean()

    def encoder_each(input_data, hot_hist):
        first_name = split_names(input_data)[0]
        encoded = input_data.drop(column, axis=1)
        # Bug fix: ``map`` keeps the frame's own index. The earlier
        # merge-based lookup renumbered rows 0..n-1, which misaligned the
        # assignment for frames whose index has gaps.
        encoded['artist_name_score'] = first_name.map(hot_hist)
        encoded[column] = first_name
        return encoded

    train_data = encoder_each(train_data, artist_hot)
    val_data = encoder_each(val_data, artist_hot)
    test_data = encoder_each(test_data, artist_hot)

    return train_data, val_data, test_data

In [41]:
# Replace 'artist_name' with its first name and add 'artist_name_score' to all three frames.
train_use, validation_use, test = cal_artist_hot_rate(train_use, validation_use, test)

In [42]:
# Show the column names of the training frame after the score features were added.
print(train_use.head().columns)

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'city', 'bd', 'gender', 'registered_via',
       'song_length', 'genre_ids', 'language', 'name', 'country', 'song_year',
       'registration_year', 'registration_month', 'registration_day',
       'expiration_year', 'expiration_month', 'expiration_day', 'days',
       'composer_score', 'composer', 'lyricist_score', 'lyricist',
       'artist_name_score', 'artist_name'],
      dtype='object')


In [43]:
# temp_data = pd.concat([train_use[['composer']], validation_use[['composer']], test[['composer']]], axis=0, join="inner")

# temp_data['composer'].apply(lambda x : len(x.replace(u'、','|').split('|'))).value_counts().plot()

In [44]:
# df = train_use.head(100000).copy(deep=True)
# print(df['lyricist'].value_counts())

In [47]:
time_wnd = [2018, 0]
#time_wnd = [2018, 0]#, 2000, 2010, 2014, 2018]
def cal_song_listen_times(train_data, test_data, val_data):
    """Add per-song popularity features computed over train + validation + test.

    For every pair of consecutive entries in the module-level ``time_wnd``
    (taken as a half-open ``[low, high)`` range of ``song_year``, in either
    order), two columns are merged onto each frame by ``song_id``:

    * ``popular_<i>``    - ``log(rows + 1)`` for the song inside the window
    * ``num_people_<i>`` - ``log(distinct msno + 1)`` for the song

    Songs with no row inside the window (including rows whose ``song_year``
    is NaN) get NaN.

    Parameters
    ----------
    train_data, test_data, val_data : pandas.DataFrame
        Frames with ``song_id``, ``song_year`` and ``msno`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data, val_data)`` with the new columns. The left
        merge returns new frames with a fresh 0..n-1 index.
    """
    key_columns = ['song_id', 'song_year', 'msno']
    all_data = pd.concat(
        [train_data[key_columns], val_data[key_columns], test_data[key_columns]],
        axis=0, join="inner",
    )
    for index in range(len(time_wnd) - 1):
        begin_time, end_time = sorted((time_wnd[index], time_wnd[index + 1]))
        # Vectorised window filter; NaN years compare False and are excluded,
        # exactly as with the previous element-wise lambda.
        in_window = (all_data['song_year'] >= begin_time) & (all_data['song_year'] < end_time)
        listeners = all_data.loc[in_window, ['song_id', 'msno']].groupby('song_id')['msno']

        popular_name = 'popular_{}'.format(index)
        num_people_name = 'num_people_{}'.format(index)
        popularity = pd.DataFrame(
            {
                popular_name: np.log(listeners.count() + 1),
                num_people_name: np.log(listeners.nunique() + 1),
            },
            columns=[popular_name, num_people_name],
        ).reset_index(drop=False)

        train_data = train_data.merge(popularity, on='song_id', how ='left')
        test_data = test_data.merge(popularity, on='song_id', how ='left')
        val_data = val_data.merge(popularity, on='song_id', how ='left')
    return train_data, test_data, val_data
def cal_song_listen_times_seperate(train_data, test_data, val_data, windows=None):
    """Add per-song popularity features, computed separately within each frame.

    Unlike the pooled variant, each frame's counts come from its own rows
    only. For every pair of consecutive entries in ``windows`` (a half-open
    ``[low, high)`` range of ``song_year``, in either order) two columns are
    merged on by ``song_id``:

    * ``popular_<i>``    - ``log(rows + 1)`` for the song inside the window
    * ``num_people_<i>`` - ``log(distinct msno + 1)`` for the song

    Parameters
    ----------
    train_data, test_data, val_data : pandas.DataFrame
        Frames with ``song_id``, ``song_year`` and ``msno`` columns.
    windows : sequence of int, optional
        Year boundaries. Defaults to the module-level ``time_wnd``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, val)`` with the feature columns of every window.
    """
    if windows is None:
        windows = time_wnd

    def cal_each_of_them(input_data):
        keys = input_data[['song_id', 'song_year', 'msno']]
        # Bug fix: accumulate onto ``result``. Previously every window merged
        # onto the untouched input again, so with more than one window only
        # the last window's columns survived.
        result = input_data
        for index in range(len(windows) - 1):
            begin_time, end_time = sorted((windows[index], windows[index + 1]))
            in_window = (keys['song_year'] >= begin_time) & (keys['song_year'] < end_time)
            listeners = keys.loc[in_window, ['song_id', 'msno']].groupby('song_id')['msno']

            popular_name = 'popular_{}'.format(index)
            num_people_name = 'num_people_{}'.format(index)
            popularity = pd.DataFrame(
                {
                    popular_name: np.log(listeners.count() + 1),
                    num_people_name: np.log(listeners.nunique() + 1),
                },
                columns=[popular_name, num_people_name],
            ).reset_index(drop=False)
            result = result.merge(popularity, on='song_id', how ='left')
        return result

    return cal_each_of_them(train_data), cal_each_of_them(test_data), cal_each_of_them(val_data)

# time_wnd = [2018, 0, 2000, 2010, 2014, 2018]
# def cal_song_listen_times(train_data, test_data):
#     test_data['song_id'] = pd.to_numeric(test_data['song_id'], downcast='unsigned')
#     for index, _ in enumerate(time_wnd[:-1]):
#         begin_time, end_time = time_wnd[index] < time_wnd[index+1] and (time_wnd[index], time_wnd[index+1]) or (time_wnd[index+1], time_wnd[index])
# #         begin_time = time_wnd[index]
# #         end_time = time_wnd[index+1]
#         select_data = train_data[train_data['song_year'].map(lambda x: x>=begin_time and x < end_time)]
        
#         select_data['target'] = pd.to_numeric(select_data['target'], downcast='signed')
        
#         grouped = select_data[['song_id', 'target']].groupby(['song_id'])

#         count_song = grouped.agg(['count'])
#         mean_repeat_song = grouped['target'].mean()

#         popularity = pd.concat([np.log(count_song+1), mean_repeat_song, np.log(count_song.multiply(mean_repeat_song, axis=0)+1)], axis=1, join="inner")
#         popularity.columns = ['popular_{}'.format(index), 'mean_repeat_{}'.format(index), 'replay_prob_{}'.format(index)]
#         popularity = popularity.reset_index(drop=False)
#         test_data = test_data.merge(popularity, on='song_id', how ='left')
#         train_data = train_data.merge(popularity, on='song_id', how ='left')
#     return train_data, test_data


# time_wnd = [2018, 0, 2000, 2010, 2014, 2018]
# def cal_song_listen_times(train_data):
#     train_data['song_id'] = pd.to_numeric(train_data['song_id'], downcast='unsigned')
#     for index, _ in enumerate(time_wnd[:-1]):
#         begin_time, end_time = time_wnd[index] < time_wnd[index+1] and (time_wnd[index], time_wnd[index+1]) or (time_wnd[index+1], time_wnd[index])
# #         begin_time = time_wnd[index]
# #         end_time = time_wnd[index+1]
#         select_data = train_data[train_data['song_year'].map(lambda x: x>=begin_time and x < end_time)]
        
#         select_data['target'] = pd.to_numeric(select_data['target'], downcast='signed')
        
#         grouped = select_data[['song_id', 'target']].groupby(['song_id'])

#         count_song = grouped.agg(['count'])
#         mean_repeat_song = grouped['target'].mean()

#         popularity = pd.concat([np.log(count_song+1), mean_repeat_song, np.log(count_song.multiply(mean_repeat_song, axis=0)+1)], axis=1, join="inner")
#         popularity.columns = ['popular_{}'.format(index), 'mean_repeat_{}'.format(index), 'replay_prob_{}'.format(index)]
#         popularity = popularity.reset_index(drop=False)
#         train_data = train_data.merge(popularity, on='song_id', how ='left')
#     return test_data

In [48]:
# Note the (train, test, validation) order of both the arguments and the results.
train_use, test, validation_use = cal_song_listen_times(train_use, test, validation_use)
# train = cal_song_listen_times(train)
# test = cal_song_listen_times(test)

In [None]:
#for col in train_use.columns: print(col, ':', train_use[col].dtype, '; uinque values:', len(train_use[col].value_counts()))

In [49]:
people_time_wnd = [2018, 0]
#people_time_wnd = [2018, 0]#, 2000, 2010, 2014, 2018]
def get_people_active(train_data, test_data, val_data):
    """Add per-user activity features computed over train + validation + test.

    For every pair of consecutive entries in the module-level
    ``people_time_wnd`` (taken as a half-open ``[low, high)`` range of
    ``song_year``, in either order), two columns are merged onto each frame
    by ``msno``:

    * ``active_<i>``   - ``log(rows + 1)`` for the user inside the window
    * ``num_song_<i>`` - ``log(distinct song_id + 1)`` for the user

    Users with no row inside the window get NaN.

    Parameters
    ----------
    train_data, test_data, val_data : pandas.DataFrame
        Frames with ``song_id``, ``song_year`` and ``msno`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data, val_data)`` with the new columns. The left
        merge returns new frames with a fresh 0..n-1 index.
    """
    key_columns = ['song_id', 'song_year', 'msno']
    all_data = pd.concat(
        [train_data[key_columns], val_data[key_columns], test_data[key_columns]],
        axis=0, join="inner",
    )
    for index in range(len(people_time_wnd) - 1):
        begin_time, end_time = sorted((people_time_wnd[index], people_time_wnd[index + 1]))
        # Vectorised window filter; NaN years compare False and are excluded,
        # exactly as with the previous element-wise lambda.
        in_window = (all_data['song_year'] >= begin_time) & (all_data['song_year'] < end_time)
        songs_per_user = all_data.loc[in_window, ['song_id', 'msno']].groupby('msno')['song_id']

        active_name = 'active_{}'.format(index)
        num_song_name = 'num_song_{}'.format(index)
        popularity = pd.DataFrame(
            {
                active_name: np.log(songs_per_user.count() + 1),
                num_song_name: np.log(songs_per_user.nunique() + 1),
            },
            columns=[active_name, num_song_name],
        ).reset_index(drop=False)

        train_data = train_data.merge(popularity, on='msno', how ='left')
        test_data = test_data.merge(popularity, on='msno', how ='left')
        val_data = val_data.merge(popularity, on='msno', how ='left')
    return train_data, test_data, val_data
def get_people_active_seperate(train_data, test_data, val_data, windows=None):
    """Add per-user activity features, computed separately within each frame.

    Unlike the pooled variant, each frame's counts come from its own rows
    only. For every pair of consecutive entries in ``windows`` (a half-open
    ``[low, high)`` range of ``song_year``, in either order) two columns are
    merged on by ``msno``:

    * ``active_<i>``   - ``log(rows + 1)`` for the user inside the window
    * ``num_song_<i>`` - ``log(distinct song_id + 1)`` for the user

    Parameters
    ----------
    train_data, test_data, val_data : pandas.DataFrame
        Frames with ``song_id``, ``song_year`` and ``msno`` columns.
    windows : sequence of int, optional
        Year boundaries. Defaults to the module-level ``people_time_wnd``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, val)`` with the feature columns of every window.
    """
    if windows is None:
        windows = people_time_wnd

    def cal_each_of_them(input_data):
        keys = input_data[['song_id', 'song_year', 'msno']]
        # Bug fix: accumulate onto ``result``. Previously every window merged
        # onto the untouched input again, so with more than one window only
        # the last window's columns survived.
        result = input_data
        for index in range(len(windows) - 1):
            begin_time, end_time = sorted((windows[index], windows[index + 1]))
            in_window = (keys['song_year'] >= begin_time) & (keys['song_year'] < end_time)
            songs_per_user = keys.loc[in_window, ['song_id', 'msno']].groupby('msno')['song_id']

            active_name = 'active_{}'.format(index)
            num_song_name = 'num_song_{}'.format(index)
            popularity = pd.DataFrame(
                {
                    active_name: np.log(songs_per_user.count() + 1),
                    num_song_name: np.log(songs_per_user.nunique() + 1),
                },
                columns=[active_name, num_song_name],
            ).reset_index(drop=False)
            result = result.merge(popularity, on='msno', how ='left')
        return result

    return cal_each_of_them(train_data), cal_each_of_them(test_data), cal_each_of_them(val_data)

# test = test.reset_index(drop=False)
# #test['msno'] = test['msno'].astype(int)   
# train['target'] = pd.to_numeric(train['target'], downcast='signed')

# grouped = train[['msno', 'target']].groupby(['msno'])

# count_msno = grouped.agg(['count'])
# mean_repeat_msno = grouped['target'].mean()

# popularity = pd.concat([np.log(count_msno+1), mean_repeat_msno, np.log(count_msno.multiply(mean_repeat_msno, axis=0)+1)], axis=1, join="inner")
# popularity.columns = ['ms_popular', 'ms_mean_repeat', 'ms_replay_prob']
# popularity = popularity.reset_index(drop=False)
# test = test.merge(popularity, on='msno', how ='left')
# train = train.merge(popularity, on='msno', how ='left')

In [50]:
# Note the (train, test, validation) order of both the arguments and the results.
train_use, test, validation_use = get_people_active(train_use, test, validation_use)

In [None]:
def measure_by_different_city_lang_country(train_data, test_data, val_data):
    """Count how many distinct cities, countries and languages each creator appears with.

    The three splits are stacked, and for every value of ``composer``,
    ``lyricist`` and ``artist_name`` the number of distinct ``city``,
    ``country`` and ``language`` values seen alongside it is computed.

    Args:
        train_data, test_data, val_data: DataFrames that each contain the
            columns composer, lyricist, artist_name, city, country, language.

    Returns:
        dict mapping each creator column name to a DataFrame indexed by that
        column's values, with columns ``<creator>_by_city``,
        ``<creator>_by_country`` and ``<creator>_by_language``.
    """
    creator_cols = ['composer', 'lyricist', 'artist_name']
    target_cols = ['city', 'country', 'language']
    used_cols = creator_cols + target_cols

    stacked = pd.concat([train_data[used_cols], val_data[used_cols], test_data[used_cols]], axis=0, join="outer")

    count_dict = dict()
    for col in creator_cols:
        diversity = stacked.groupby([col])[target_cols].nunique()
        # One column per counted target, named after the creator column and the target.
        diversity.columns = ['{}_by_{}'.format(col, target) for target in target_cols]
        count_dict[col] = diversity

    return count_dict

In [51]:
# Cast every categorical feature to the pandas category dtype in all three splits.
categorical_cols = [
    'source_system_tab', 'source_screen_name', 'source_type', 'city', 'gender',
    'name', 'artist_name', 'composer', 'lyricist', 'msno', 'song_id', 'genre_ids',
    'country', 'language', 'registered_via',
]
for col in categorical_cols:
    for frame in (train_use, validation_use, test):
        frame[col] = frame[col].astype('category')

In [52]:
# Report the dtype and the number of distinct values of every training column.
for col in train_use.columns:
    print(col, ':', train_use[col].dtype, '; uinque values:', len(train_use[col].value_counts()))

msno : category ; uinque values: 30571
song_id : category ; uinque values: 357496
source_system_tab : category ; uinque values: 10
source_screen_name : category ; uinque values: 21
source_type : category ; uinque values: 13
target : int8 ; uinque values: 2
city : category ; uinque values: 21
bd : category ; uinque values: 92
gender : category ; uinque values: 3
registered_via : category ; uinque values: 5
song_length : float32 ; uinque values: 60028
genre_ids : category ; uinque values: 573
language : category ; uinque values: 11
name : category ; uinque values: 234144
country : category ; uinque values: 107
song_year : float64 ; uinque values: 100
registration_year : uint16 ; uinque values: 14
registration_month : uint8 ; uinque values: 12
registration_day : uint8 ; uinque values: 31
expiration_year : uint16 ; uinque values: 18
expiration_month : uint8 ; uinque values: 12
expiration_day : uint8 ; uinque values: 31
days : int64 ; uinque values: 4319
composer_score : float64 ; uinque va

In [53]:
# Report the dtype and the number of distinct values of every test column.
for col in test.columns:
    print(col, ':', test[col].dtype, '; uinque values:', len(test[col].value_counts()))

id : object ; uinque values: 2556790
msno : category ; uinque values: 25131
song_id : category ; uinque values: 224753
source_system_tab : category ; uinque values: 10
source_screen_name : category ; uinque values: 23
source_type : category ; uinque values: 13
city : category ; uinque values: 21
bd : category ; uinque values: 89
gender : category ; uinque values: 3
registered_via : category ; uinque values: 6
song_length : float32 ; uinque values: 45659
genre_ids : category ; uinque values: 502
language : category ; uinque values: 11
name : category ; uinque values: 154716
country : category ; uinque values: 94
song_year : float64 ; uinque values: 100
registration_year : uint16 ; uinque values: 14
registration_month : uint8 ; uinque values: 12
registration_day : uint8 ; uinque values: 31
expiration_year : uint16 ; uinque values: 16
expiration_month : uint8 ; uinque values: 12
expiration_day : uint8 ; uinque values: 31
days : int64 ; uinque values: 4240
composer_score : float64 ; uinque

In [54]:
# Same per-column dtype / distinct-value report as two cells above, re-run on the training split.
for col in train_use.columns:
    print(col, ':', train_use[col].dtype, '; uinque values:', len(train_use[col].value_counts()))

msno : category ; uinque values: 30571
song_id : category ; uinque values: 357496
source_system_tab : category ; uinque values: 10
source_screen_name : category ; uinque values: 21
source_type : category ; uinque values: 13
target : int8 ; uinque values: 2
city : category ; uinque values: 21
bd : category ; uinque values: 92
gender : category ; uinque values: 3
registered_via : category ; uinque values: 5
song_length : float32 ; uinque values: 60028
genre_ids : category ; uinque values: 573
language : category ; uinque values: 11
name : category ; uinque values: 234144
country : category ; uinque values: 107
song_year : float64 ; uinque values: 100
registration_year : uint16 ; uinque values: 14
registration_month : uint8 ; uinque values: 12
registration_day : uint8 ; uinque values: 31
expiration_year : uint16 ; uinque values: 18
expiration_month : uint8 ; uinque values: 12
expiration_day : uint8 ; uinque values: 31
days : int64 ; uinque values: 4319
composer_score : float64 ; uinque va

In [55]:
# Sanity check: the submission cells pair test_id with one prediction per test row, so both lengths should match.
print(len(test_id), len(test))

2556790 2556790


In [56]:


# Single LightGBM run: fit on train_use, report AUC on validation_use, score the test split.
predictions = np.zeros(shape=[len(test)])

train_data = lgb.Dataset(train_use.drop(['target'],axis=1),label=train_use['target'])
val_data = lgb.Dataset(validation_use.drop(['target'],axis=1),label=validation_use['target'])

# NOTE(review): 'num_rounds' below and the positional 100 passed to lgb.train both
# set the number of boosting rounds; confirm which one LightGBM honours.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.1 ,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 128,
    'max_depth': 10,
    'num_rounds': 200,
    'metric' : 'auc',
    } 

bst = lgb.train(params, train_data, 100, valid_sets=[val_data])
# predictions starts at zero, so the += simply stores this one model's scores.
predictions+=bst.predict(test.drop(['id'],axis=1))
print('cur fold finished.')

# NOTE(review): the timestamp puts a space and ':' characters into the file name;
# confirm the target filesystem accepts them.
submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)





[1]	valid_0's auc: 0.633637
[2]	valid_0's auc: 0.641298
[3]	valid_0's auc: 0.642856
[4]	valid_0's auc: 0.643825
[5]	valid_0's auc: 0.645342
[6]	valid_0's auc: 0.646425
[7]	valid_0's auc: 0.646916
[8]	valid_0's auc: 0.647135
[9]	valid_0's auc: 0.64819
[10]	valid_0's auc: 0.649298
[11]	valid_0's auc: 0.650209
[12]	valid_0's auc: 0.651037
[13]	valid_0's auc: 0.652429
[14]	valid_0's auc: 0.653274
[15]	valid_0's auc: 0.653858
[16]	valid_0's auc: 0.654175
[17]	valid_0's auc: 0.654961
[18]	valid_0's auc: 0.655386
[19]	valid_0's auc: 0.656219
[20]	valid_0's auc: 0.656772
[21]	valid_0's auc: 0.657357
[22]	valid_0's auc: 0.657511
[23]	valid_0's auc: 0.658149
[24]	valid_0's auc: 0.658683
[25]	valid_0's auc: 0.659072
[26]	valid_0's auc: 0.659686
[27]	valid_0's auc: 0.660407
[28]	valid_0's auc: 0.661173
[29]	valid_0's auc: 0.661536
[30]	valid_0's auc: 0.662121
[31]	valid_0's auc: 0.662484
[32]	valid_0's auc: 0.662864
[33]	valid_0's auc: 0.663241
[34]	valid_0's auc: 0.663462
[35]	valid_0's auc: 0.66

In [None]:
def param_tune_with_val(params, tune_param, param_list, data_list, val_data, less_prefered = False):
    """Try every candidate value of one parameter and return the best one.

    One model is trained with ``lgb.train`` per candidate and scored with
    ROC AUC on the validation labels.

    Args:
        params: base parameter dict for ``lgb.train``. It is copied for every
            candidate, so the caller's dict is never modified.
        tune_param: key of the parameter being varied.
        param_list: non-empty sequence of candidate values.
        data_list: ``{'train': {'x': ..., 'y': ...}, 'validation': {'x': ..., 'y': ...}}``;
            the ``'x'`` entries are handed to ``lgb.train`` as training set and
            validation set, ``data_list['validation']['y']`` holds the true labels.
        val_data: validation features passed to ``model.predict``.
        less_prefered: when True the candidate with the lowest score wins,
            otherwise the one with the highest.

    Returns:
        The winning candidate (the first candidate if none beats the initial bound).
    """
    best_metric = sys.float_info.max if less_prefered else -sys.float_info.max
    best_param = param_list[0]

    for par_value in param_list:
        # Fresh copy per candidate: neither this function nor lgb.train can leak changes to the caller.
        trial_params = dict(params)
        trial_params[tune_param] = par_value
        model = lgb.train(trial_params, data_list['train']['x'],
                          valid_sets=[data_list['validation']['x']],
                          feature_name='auto')

        val_predprob = model.predict(val_data)
        auroc_score = metrics.roc_auc_score(data_list['validation']['y'], val_predprob)

        improved = auroc_score < best_metric if less_prefered else auroc_score > best_metric
        if improved:
            best_metric = auroc_score
            best_param = par_value
    log.info('best param for {}: {}, metric: {}'.format(tune_param, best_param, best_metric))
    return best_param

In [None]:
#{'top_k': 20, 'feature_fraction': 0.8, 'bagging_freq': 1, 'min_data_in_bin': 3, 'min_sum_hessian_in_leaf': 0.001, 'bagging_fraction': 0.9, 'max_depth': 12, 'num_leaves': 100, 'learning_rate': 0.01, 'objective': 'binary', 'lambda_l2': 0.01, 'feature_fraction_seed': 1024, 'min_data_in_leaf': 15, 'max_bin': 100, 'verbose': 0, 'bagging_seed': 6666, 'max_cat_to_onehot': 4, 'metric': 'auc', 'lambda_l1': 1e-05, 'num_threads': 16, 'boosting': 'gbdt', 'min_split_gain': 0.3}

#{'bagging_seed': 6666, 'lambda_l1': 1e-05, 'lambda_l2': 0.01, 'metric': 'auc', 'bagging_freq': 1, 'min_sum_hessian_in_leaf': 0.001, 'feature_fraction': 0.8, 'feature_fraction_seed': 1024, 'num_leaves': 90, 'boosting': 'gbdt', 'verbose': 0, 'min_data_in_leaf': 15, 'top_k': 20, 'objective': 'binary', 'min_data_in_bin': 3, 'num_threads': 16, 'max_cat_to_onehot': 4, 'max_depth': 10, 'bagging_fraction': 0.9, 'learning_rate': 0.01, 'max_bin': 80, 'min_split_gain': 0.3}

In [None]:
def search_for_best_params(train, validation, test):
    """Greedy one-parameter-at-a-time LightGBM search scored on the validation split.

    Parameters are visited in the order of ``params_to_eval``. For each one,
    every candidate is tried via ``param_tune_with_val`` with all other
    parameters held at their current value, and the winner is kept before
    moving on. Parameters with a single candidate are simply set.

    Args:
        train: DataFrame with a 'target' column; every other column is a feature.
        validation: DataFrame with the same layout, used for scoring.
        test: not used by the search; kept so existing callers keep working.

    Returns:
        dict of LightGBM parameters with each tuned key set to its selected value.
    """
    # Built once and shared by the validation Dataset and every predict call.
    valid_features = np.array(validation.drop(['target'], axis=1))

    y_train = train['target'].values
    y_valid = validation['target'].values

    # NOTE(review): both Datasets are created once and reused for every candidate;
    # confirm that dataset-level parameters such as max_bin really change between trials.
    X_train = lgb.Dataset(np.array(train.drop(['target'], axis=1)), label=y_train)
    X_valid = lgb.Dataset(valid_features, label=y_valid)

    data_list = {'train':{'x':X_train,'y':y_train}, 'validation':{'x':X_valid,'y':y_valid}}

    # Candidate values per parameter, in tuning order.
    params_to_eval = OrderedDict(
        (
        ('num_boost_round', range(100,400,50)),
        ('num_leaves', range(80,160,10)), # number of leaves in one tree
        ('max_depth', range(8,18,1)),
        ('min_data_in_leaf', range(10,18,2)),
        ('min_sum_hessian_in_leaf', [0.001]),# too high will lead to under-fitting
        ('min_split_gain',[0.3]),# the minimum loss reduction required to make a split
        ('bagging_fraction',[0.9]),# [i/10.0 for i in range(6,10)]
        ('feature_fraction',[0.8]),# typical: 0.5-1
        ('max_bin', range(80,200,10)),
        ('lambda_l2',[0.01]),
        ('lambda_l1',[1e-5]),
        ('learning_rate',[0.01]), # typical: 0.01-0.2
        )
      )

    # Starting point; tuned keys are overwritten as the search progresses.
    initial_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_boost_round': 200,
        'learning_rate': 0.1 ,
        'verbose': 0,
        'num_leaves': 120,
        'num_threads':16,
        'max_depth': 14,
        'min_data_in_leaf': 16, #minimal number of data in one leaf. Can be used to deal with over-fitting
        'min_sum_hessian_in_leaf': 1e-3, #minimal sum hessian in one leaf. Like min_data_in_leaf, it can be used to deal with over-fitting
        'feature_fraction': 0.8, #colsample_bytree
        'feature_fraction_seed': 1024,
        'bagging_fraction': 0.9, #subsample
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration
        'bagging_seed': 6666,
        'early_stopping_rounds':10,
        'lambda_l1': 1e-5, #L1 regularization
        'lambda_l2': 0.01, #L2 regularization
        'max_cat_to_onehot': 4, #when number of categories of one feature smaller than or equal to max_cat_to_onehot, one-vs-other split algorithm will be used
        'top_k': 20, #set this to larger value for more accurate result, but it will slow down the training speed
        'min_split_gain': 0.3, #the minimal gain to perform split
        'max_bin': 140, #max number of bins that feature values will be bucketed in. Small number of bins may reduce training accuracy but may increase general power (deal with over-fitting)
        'min_data_in_bin': 3, #min number of data inside one bin, use this to avoid one-data-one-bin (may over-fitting)
        'metric' : 'auc',
    }

    # Only parameters named in this list are processed; shorten it to restrict the search.
    tuned_param_name = ['num_boost_round', 'num_leaves', 'max_depth', 'min_data_in_leaf', 'min_sum_hessian_in_leaf',
                        'min_split_gain', 'bagging_fraction', 'feature_fraction', 'max_bin', 'lambda_l2', 'lambda_l1', 'learning_rate']
    for par_name, par_list in params_to_eval.items():
        if par_name in tuned_param_name:
            log.info('tunning {}...'.format(par_name))
            if len(par_list) > 1:
                initial_params[par_name] = param_tune_with_val(initial_params, par_name, par_list, data_list, valid_features)
            else:
                initial_params[par_name] = par_list[0]

    return initial_params

In [None]:
# Run the greedy parameter search and log both the selected parameters and the wall-clock time it took.
start_time = time.time()
best_param = search_for_best_params(train_use, validation_use, test)
log.info(best_param)
time_elapsed = time.time() - start_time
log.info('time used: {:.3f}sec'.format(time_elapsed))

In [None]:
# Train LightGBM with a fixed parameter set on numpy copies of the splits and write a submission.
# NOTE(review): 'num_boost_round' and 'early_stopping_rounds' are supplied inside params
# rather than as lgb.train arguments; confirm the installed LightGBM picks them up from there.
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_boost_round': 140,
        'learning_rate': 0.01 ,
        'verbose': 0,
        'num_leaves': 90,
        'num_threads':16,
        'max_depth': 9,
        'min_data_in_leaf': 15, #minimal number of data in one leaf. Can be used to deal with over-fitting
        'min_sum_hessian_in_leaf': 1e-3, #minimal sum hessian in one leaf. Like min_data_in_leaf, it can be used to deal with over-fitting
        'feature_fraction': 0.8, #colsample_bytree
        'feature_fraction_seed': 1024,
        'bagging_fraction': 0.9, #subsample
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration
        'bagging_seed': 6666,
        'early_stopping_rounds':10,   
        'lambda_l1': 1e-5, #L1 regularization
        'lambda_l2': 0.01, #L2 regularization
        'max_cat_to_onehot': 4, #when number of categories of one feature smaller than or equal to max_cat_to_onehot, one-vs-other split algorithm will be used
        'top_k': 20, #set this to larger value for more accurate result, but it will slow down the training speed
        'min_split_gain': 0.3, #the minimal gain to perform split
        'max_bin': 70, #max number of bins that feature values will be bucketed in. Small number of bins may reduce training accuracy but may increase general power (deal with over-fitting)
        'min_data_in_bin': 3, #min number of data inside one bin, use this to avoid one-data-one-bin (may over-fitting)       
        'metric' : 'auc',
    } 
# Despite the X_ prefix, X_train and X_valid are lgb.Dataset objects that already carry their labels.
X_train = lgb.Dataset(np.array(train_use.drop(['target'], axis=1)), label=train_use['target'].values)
X_valid = lgb.Dataset(np.array(validation_use.drop(['target'], axis=1)), label=validation_use['target'].values)
X_test = np.array(test.drop(['id'], axis=1))
model = lgb.train(params, X_train, valid_sets=[X_valid])
pred = model.predict(X_test)

# NOTE(review): the timestamp puts a space and ':' characters into the file name;
# confirm the target filesystem accepts them.
submission = pd.DataFrame({'id': test_id, 'target': pred})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)


In [None]:
# Convert the splits to numpy arrays and try three max_depth values.
# NOTE(review): this cell does not match the param_tune_with_val defined in this notebook:
#   - that function's fifth and sixth parameters are val_data and less_prefered, so
#     'auc' is received as the validation features and 20 as a truthy less_prefered;
#   - it trains with lgb.train, while the keys below (n_estimators, gamma, subsample,
#     colsample_bytree, min_child_weight) are xgboost-style names and the 'x' entries
#     of data_list are plain arrays.
# It looks like a leftover from an earlier xgboost-based helper; confirm before running.
X_train = np.array(train_use.drop(['target'], axis=1))
y_train = train_use['target'].values

X_valid = np.array(validation_use.drop(['target'], axis=1))
y_valid = validation_use['target'].values

X_test = np.array(test.drop(['id'], axis=1))

# d_train = xgb.DMatrix(X_train)
# d_valid = xgb.DMatrix(X_valid) 
# d_test = xgb.DMatrix(X_test)

data_list = {'train':{'x':X_train,'y':y_train}, 'validation':{'x':X_valid,'y':y_valid}}
# Train model, evaluate and make predictions
params={
    'n_estimators':500,
    'objective': 'binary:logistic',
    'learning_rate': 0.75,
    'gamma':0.1,
    'subsample':0.8,
    'colsample_bytree':0.3,
    'min_child_weight':3,
    'max_depth':16,
    'seed':1024,
    }

param_tune_with_val(params, 'max_depth', [5,1,6], data_list, 'auc', 20)

# model = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=20, \
#     maximize=True, verbose_eval=5)



In [None]:
# The low-level xgboost API is needed here; the notebook's import cell only brings in XGBClassifier.
import xgboost as xgb

X_train = np.array(train_use.drop(['target'], axis=1))
y_train = train_use['target'].values

X_valid = np.array(validation_use.drop(['target'], axis=1))
y_valid = validation_use['target'].values

X_test = np.array(test.drop(['id'], axis=1))

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(X_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train model, evaluate and make predictions
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.75
params['max_depth'] = 16
params['silent'] = 1
params['eval_metric'] = 'auc'

model = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=20, \
    maximize=True, verbose_eval=5)

# xgb.train returns a Booster: it predicts from DMatrix objects and, with the
# binary:logistic objective, outputs the positive-class probability directly
# (there is no predict_proba). Class labels are obtained by thresholding at 0.5.
train_predprob = model.predict(d_train)
train_predictions = (train_predprob > 0.5).astype(int)

val_predprob = model.predict(d_valid)
val_predictions = (val_predprob > 0.5).astype(int)

#Print model report:
print("\nModel Report")
print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, train_predictions))
print("Train AUC Score (Train): %f" % metrics.roc_auc_score(y_train, train_predprob))
print("ValAccuracy : %.4g" % metrics.accuracy_score(y_valid, val_predictions))
print("Validation AUC Score (Train): %f" % metrics.roc_auc_score(y_valid, val_predprob))

# Split counts per feature, taken from the trained Booster itself.
feat_imp = pd.Series(model.get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

p_test = model.predict(d_test)

In [None]:
# Baseline XGBClassifier fitted and reported through modelfit.
# NOTE(review): in document order this cell comes before both modelfit definitions,
# and an identical copy of it follows them further down; on a fresh top-to-bottom run
# this one would be reached before modelfit exists.
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train_use.drop(['target'],axis=1), train_use['target'], validation_use.drop(['target'],axis=1), validation_use['target'])

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='Disbursed'):
    """Fit an xgboost sklearn-style classifier and report its training accuracy and AUC.

    NOTE(review): the next cell defines another ``modelfit`` with a different
    signature; when both cells are run, that later definition replaces this one.

    Args:
        alg: xgboost sklearn-style estimator (e.g. ``XGBClassifier``).
        dtrain: DataFrame holding the predictor columns and the label column.
        predictors: list of column names used as features.
        useTrainCV: when True, ``xgb.cv`` with early stopping picks
            ``n_estimators`` before the final fit.
        cv_folds: number of cross-validation folds.
        early_stopping_rounds: patience passed to ``xgb.cv``.
        target: name of the label column in ``dtrain``; the default keeps the
            column name that was previously hard-coded.

    Returns:
        None. Prints the report and draws a feature-importance bar chart.
    """
    if useTrainCV:
        # Low-level API, only needed for the cross-validated round selection.
        import xgboost as xgb
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # Newer xgboost exposes the fitted Booster as get_booster(); older releases as booster().
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [None]:
def modelfit(alg, train, label, validation, val_label, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an xgboost sklearn-style classifier and report train/validation accuracy and AUC.

    Args:
        alg: xgboost sklearn-style estimator (e.g. ``XGBClassifier``).
        train: DataFrame of training features.
        label: Series of training labels.
        validation: DataFrame of validation features.
        val_label: Series of validation labels.
        useTrainCV: when True, ``xgb.cv`` with early stopping picks
            ``n_estimators`` before the final fit.
        cv_folds: number of cross-validation folds.
        early_stopping_rounds: patience passed to ``xgb.cv``.

    Returns:
        None. Prints the report and draws a feature-importance bar chart.
    """
    if useTrainCV:
        # Low-level API, only needed for the cross-validated round selection.
        import xgboost as xgb
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train.values, label=label.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(train, label, eval_metric='auc')

    #Predict training set:
    train_predictions = alg.predict(train)
    train_predprob = alg.predict_proba(train)[:,1]

    val_predictions = alg.predict(validation)
    val_predprob = alg.predict_proba(validation)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Train Accuracy : %.4g" % metrics.accuracy_score(label.values, train_predictions))
    print("Train AUC Score (Train): %f" % metrics.roc_auc_score(label, train_predprob))
    print("ValAccuracy : %.4g" % metrics.accuracy_score(val_label.values, val_predictions))
    print("Validation AUC Score (Train): %f" % metrics.roc_auc_score(val_label, val_predprob))

    # Newer xgboost exposes the fitted Booster as get_booster(); older releases as booster().
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Baseline XGBClassifier; modelfit (the five-argument version defined just above)
# cross-validates the number of trees, fits, and prints train/validation metrics.
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train_use.drop(['target'],axis=1), train_use['target'], validation_use.drop(['target'],axis=1), validation_use['target'])

In [None]:
import lightgbm as lgb

# Single LightGBM model: fit on train_use, report AUC on validation_use, score the test split.
train_data = lgb.Dataset(train_use.drop(['target'],axis=1), label=train_use['target'])
val_data = lgb.Dataset(validation_use.drop(['target'],axis=1), label=validation_use['target'])

# NOTE(review): 'num_rounds' below and the positional 100 passed to lgb.train both
# set the number of boosting rounds; confirm which one LightGBM honours.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.1 ,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 128,
    'max_depth': 10,
    'num_rounds': 200,
    'metric' : 'auc',
    }

bst = lgb.train(params, train_data, 100, valid_sets=[val_data])
# Only one model is trained here, so its probabilities are used as they are;
# dividing by 3 only makes sense when three fold models have been summed.
predictions = bst.predict(test.drop(['id'],axis=1))
print('finished.')

submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)

In [None]:
import lightgbm as lgb


# Train one LightGBM model per fold and average their test predictions.
kf = KFold(n_splits=3)

predictions = np.zeros(shape=[len(test)])

# Fold-invariant inputs, prepared once.
fold_features = train.drop(['target'],axis=1)
fold_test_features = test.drop(['id'],axis=1)

# NOTE(review): 'num_rounds' below and the positional 100 passed to lgb.train both
# set the number of boosting rounds; confirm which one LightGBM honours.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.1 ,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 128,
    'max_depth': 10,
    'num_rounds': 200,
    'metric' : 'auc',
    }

for train_indices,val_indices in kf.split(train) :
    # kf.split yields row positions, so select with iloc; loc would look them up as index labels.
    train_data = lgb.Dataset(fold_features.iloc[train_indices],label=train['target'].iloc[train_indices])
    val_data = lgb.Dataset(fold_features.iloc[val_indices],label=train['target'].iloc[val_indices])

    # Each fold gets its own copy of the parameters.
    bst = lgb.train(dict(params), train_data, 100, valid_sets=[val_data])
    predictions+=bst.predict(fold_test_features)
    print('cur fold finished.')
    del bst

# Average over however many folds were actually run.
predictions = predictions/kf.get_n_splits()

submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)

In [None]:
# Preprocess songs data
# genre_ids holds '|'-separated integer ids; a missing value becomes an empty list,
# which leaves that song's one-hot row all zeros.
songs_genres = np.array(songs['genre_ids']\
    .apply(lambda x: [] if pd.isnull(x) else [int(v) for v in str(x).split('|')]))
# Sorted list of every distinct genre id, plus a lookup from id to its column position.
genres_list = sorted({genre for s_genres in songs_genres for genre in s_genres})
genre_position = {genre: g_i for g_i, genre in enumerate(genres_list)}
print('Number of genres: ' + str(len(genres_list)))

ohe_genres = np.zeros((len(songs_genres), len(genres_list)))
for s_i, s_genres in enumerate(songs_genres):
    for genre in s_genres:
        ohe_genres[s_i, genre_position[genre]] = 1

# One indicator column per genre, then drop the raw multi-valued column.
for g_i, g in enumerate(genres_list):
    songs['genre_' + str(g)] = ohe_genres[:, g_i]
print(songs.head())
songs = songs.drop(['genre_ids'], axis=1)

song_cols = songs.columns

# Preprocess dataset
# Missing values become -1 before encoding, so they end up as their own label.
train = train.fillna(-1)
test = test.fillna(-1)

# Every column except the label is a candidate for encoding.
cols = list(train.columns)
cols.remove('target')

# NOTE(review): tqdm is not among the imports in the notebook's first cell; confirm it is imported before this cell runs.
for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        # Fit on the values of both frames so train and test share one integer coding.
        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

        # Column name followed by the distinct-value counts in train and test.
        print(col + ': ' + str(len(train_vals)) + ', ' + str(len(test_vals)))


In [None]:
########################################
## import packages
########################################

# NOTE(review): this binds the name `datetime` to the module, replacing the
# `datetime` class that the first cell imports with `from datetime import datetime`.
# Cells that call `datetime.now()` will fail if they are run after this one;
# nothing in this cell uses `datetime`.
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD

########################################
## load the data
########################################

# Only the user id, the song id and (for train) the label are used by this model.
train = pd.read_csv('./data/train.csv')
uid, sid, target = train.msno, train.song_id, train.target

test = pd.read_csv('./data/test.csv')
id_test, uid_test, sid_test = test.id, test.msno, test.song_id

########################################
## encoding
########################################

# Each encoder is fitted on the train and test ids together, so both share one integer vocabulary.
usr_encoder = LabelEncoder()
usr_encoder.fit(pd.concat([uid, uid_test]))
uid = usr_encoder.transform(uid)
uid_test = usr_encoder.transform(uid_test)

sid_encoder = LabelEncoder()
sid_encoder.fit(pd.concat([sid, sid_test]))
sid = sid_encoder.transform(sid)
sid_test = sid_encoder.transform(sid_test)

# Embedding table sizes: largest code seen in either split, plus one.
u_cnt = int(max(uid.max(), uid_test.max()) + 1)
s_cnt = int(max(sid.max(), sid_test.max()) + 1)

########################################
## train-validation split
########################################

# One random permutation of the row positions: first 85% for training, the rest for validation.
perm = np.random.permutation(len(train))
trn_cnt = int(len(train) * 0.85)
trn_rows, val_rows = perm[:trn_cnt], perm[trn_cnt:]
uid_trn, uid_val = uid[trn_rows], uid[val_rows]
sid_trn, sid_val = sid[trn_rows], sid[val_rows]
target_trn, target_val = target[trn_rows], target[val_rows]

########################################
## define the model
########################################

def get_model():
    """Build and compile the user/song embedding network.

    Each id goes through its own 64-dimensional embedding table. The two
    vectors and their dot product are concatenated, passed through a 128-unit
    ReLU layer with dropout 0.5, and a single sigmoid unit produces the output.
    Table sizes come from the module-level ``u_cnt`` and ``s_cnt``.

    Returns:
        A Keras ``Model`` taking ``[user ids, song ids]``, compiled with
        RMSprop (lr=1e-3), binary cross-entropy loss and the accuracy metric.
    """
    embed_dim = 64

    def embedding_table(vocab_size):
        # Small uniform initialisation plus L2 regularisation on the embedding weights.
        return Embedding(vocab_size,
                         embed_dim,
                         embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
                         embeddings_regularizer=l2(1e-4),
                         input_length=1,
                         trainable=True)

    def embed(table):
        # One integer id in, one flat embed_dim vector out.
        ids = Input(shape=(1,), dtype='int32')
        return ids, Reshape((embed_dim,))(table(ids))

    user_embeddings = embedding_table(u_cnt)
    song_embeddings = embedding_table(s_cnt)

    uid_input, embedded_usr = embed(user_embeddings)
    sid_input, embedded_song = embed(song_embeddings)

    interaction = dot([embedded_usr, embedded_song], axes=1)
    merged = concatenate([embedded_usr, embedded_song, interaction])

    hidden = Dense(128, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[uid_input, sid_input], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-3), metrics=['acc'])

    return model

########################################
## train the model
########################################
   
model = get_model()
# Stop once validation accuracy has not improved for 5 epochs.
early_stopping =EarlyStopping(monitor='val_acc', patience=5)
model_path = 'bst_model.h5'
# NOTE(review): no monitor is given here, so the checkpoint uses the Keras default,
# while early stopping watches val_acc; confirm both track the intended metric.
model_checkpoint = ModelCheckpoint(model_path, save_best_only=True, \
        save_weights_only=True)

hist = model.fit([uid_trn, sid_trn], target_trn, validation_data=([uid_val, sid_val], \
        target_val), epochs=100, batch_size=32768, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])
# Score with the checkpointed weights rather than those of the last epoch.
model.load_weights(model_path)

preds_val = model.predict([uid_val, sid_val], batch_size=32768)
val_auc = roc_auc_score(target_val, preds_val)

########################################
## make the submission
########################################

preds_test = model.predict([uid_test, sid_test], batch_size=32768, verbose=1)
# The validation AUC is embedded in the submission file name.
sub = pd.DataFrame({'id': id_test, 'target': preds_test.ravel()})
sub.to_csv('./sub_%.5f.csv'%(val_auc), index=False)

In [None]:
# Linear algebra:
import numpy as np
import pandas as pd
# Graphics:
import matplotlib.pyplot as plt
import seaborn as sns  
# Frameworks:
import lightgbm as lgb # LightGBM
# Utils:
import gc # garbage collector
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

IDIR = '../input/' # main path

# Raw tables, read in the same order as before.
members, songs, song_extra_info, train, test = (
    pd.read_csv(IDIR + csv_name)
    for csv_name in ('members.csv', 'songs.csv', 'song_extra_info.csv',
                     'train.csv', 'test.csv')
)

# Enrich each interaction row with song info, extra song info and user info
# (all left joins, so every train/test row is kept), then drop the join key.
train_full = (
    train
    .merge(songs, on='song_id', how='left')
    .merge(song_extra_info, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .drop(['song_id'], axis=1)
)
test_full = (
    test
    .merge(songs, on='song_id', how='left')
    .merge(song_extra_info, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .drop(['song_id'], axis=1)
)

# Stack train and test into one frame so encodings are fitted on both.
# 'set' marks the origin (0 = train, 1 = test); test rows get a placeholder
# target of -1.
train_full['set'] = 0
test_full['set'] = 1
test_full['target'] = -1
all_aug = pd.concat([train_full, test_full], axis=0)

# Release the per-split copies; only `all_aug` is used from here on.
del train_full, test_full
gc.collect();



# String-valued columns: fill missing values with 'NA' and label-encode them
# to integer codes. Every column left in `all_aug` (apart from the few dropped
# explicitly) is handed to lgb.Dataset further down, and LightGBM does not
# accept object-dtype pandas columns, so each of these has to be encoded, not
# just genre_ids.
categorical_cols = ['source_system_tab', 'source_screen_name', 'source_type',
                    'genre_ids', 'artist_name', 'composer', 'lyricist',
                    'name', 'isrc', 'gender']
label_encoders = {}
for cat_col in categorical_cols:
    # Built-in `str` instead of the `np.str` alias, which newer NumPy removed.
    cat_values = all_aug[cat_col].fillna('NA').astype(str)
    encoder = LabelEncoder()
    codes = encoder.fit_transform(cat_values)
    # Codes run from 0 to n_classes - 1. int16 is enough for small vocabularies
    # (e.g. genre_ids); columns with more distinct values than int16 can hold
    # are stored as int32 so the cast cannot wrap around.
    fits_int16 = len(encoder.classes_) <= np.iinfo(np.int16).max + 1
    all_aug[cat_col] = codes.astype(np.int16 if fits_int16 else np.int32)
    label_encoders[cat_col] = encoder

# Kept under its original name for any code that still refers to it.
genre_ids_le = label_encoders['genre_ids']

# Rows with no language get the sentinel -2 so the integer cast below succeeds.
all_aug['language'] = all_aug['language'].fillna(-2)

# Down-cast the numeric song/member columns to narrow integer types,
# in the same order as the individual casts they replace.
int_casts = [
    ('language', np.int8),
    ('city', np.int8),
    ('bd', np.int16),
    ('registered_via', np.int8),
    ('registration_init_time', np.int32),
    ('expiration_date', np.int32),
]
for cast_col, cast_dtype in int_casts:
    all_aug[cast_col] = all_aug[cast_col].astype(cast_dtype)

# Summary of the combined frame after the casts.
all_aug.info(max_cols=0)
all_aug.head(2)


# Membership span in days between registration and expiration.
# Subtracting the raw integer columns directly does not give a duration when
# they are date codes (20170101 - 20161231 is 8870, not 1), so both are parsed
# as dates first.
# NOTE(review): assumes both columns hold YYYYMMDD-coded integers, as the int32
# casts above and the KKBox members.csv schema suggest -- confirm.
# Values that do not parse become NaT and yield NaN in the result.
reg_dates = pd.to_datetime(all_aug['registration_init_time'].astype(str),
                           format='%Y%m%d', errors='coerce')
exp_dates = pd.to_datetime(all_aug['expiration_date'].astype(str),
                           format='%Y%m%d', errors='coerce')
# `.values` assigns positionally; the concatenated frame carries repeated
# index labels, so label-based alignment is deliberately avoided here.
all_aug['exp_reg_time'] = (exp_dates - reg_dates).dt.days.values
del reg_dates, exp_dates



gc.collect();

# Training rows are those tagged set == 0; compute the mask once and reuse it.
is_train = all_aug['set'] == 0

# Features are every column except the label, the user/row identifiers and the
# train/test marker; the label is the 'target' column of the same rows.
d_train = lgb.Dataset(
    all_aug[is_train].drop(['target', 'msno', 'id', 'set'], axis=1),
    label=all_aug.loc[is_train, 'target'],
)
# User ids of the training rows, kept aside since they are not model features.
ids_train = all_aug.loc[is_train, 'msno']

# Booster configuration shared by the cross-validation run and the final fit.
# NOTE(review): learning_rate=1.0 applies no shrinkage at all, which is unusual
# for gradient boosting -- confirm this is intended and not a typo for 0.1.
lgb_params = dict(
    learning_rate=1.0,
    max_depth=15,
    num_leaves=250,
    objective='binary',
    metric={'auc'},
    feature_fraction=0.8,
    bagging_fraction=0.75,
    bagging_freq=5,
    max_bin=100,
)

# 3-fold stratified CV with early stopping (50 rounds without improvement),
# logging every 100 rounds.
cv_result_lgb = lgb.cv(
    params=lgb_params,
    train_set=d_train,
    num_boost_round=5000,
    nfold=3,
    stratified=True,
    early_stopping_rounds=50,
    verbose_eval=100,
    show_stdv=True,
)

# The length of the recorded AUC history is taken as the number of rounds
# for the final model.
num_boost_rounds_lgb = len(cv_result_lgb['auc-mean'])
print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))



%%time
ROUNDS = num_boost_rounds_lgb
print('light GBM train :-)')
bst = lgb.train(lgb_params, d_train, ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
# del d_train
gc.collect()


# Two side-by-side panels: feature importances of the fitted booster (left)
# and the cross-validated AUC curve with a +/- one-stdv band (right).
fig, (ax_imp, ax_cv) = plt.subplots(1, 2, figsize=(10, 5))

# Feature names are read from the booster itself, so the plot does not depend
# on the training Dataset object still being around or already constructed.
feature_imp = pd.Series(bst.feature_importance(),
                        index=bst.feature_name()).sort_values(ascending=False)
sns.barplot(x=feature_imp.values, y=feature_imp.index.values,
            orient='h', color='g', ax=ax_imp)
ax_imp.set_title('Feature importance')
ax_imp.set_xlabel('Importance')

# Variable names are kept for compatibility; the values are the per-round
# mean and standard deviation of AUC reported by lgb.cv.
train_scores = np.array(cv_result_lgb['auc-mean'])
train_stds = np.array(cv_result_lgb['auc-stdv'])
ax_cv.plot(train_scores, color='green')
ax_cv.fill_between(range(len(train_scores)),
                   train_scores - train_stds, train_scores + train_stds,
                   alpha=0.1, color='green')
# Title previously misspelled the library name as 'LightGMB'.
ax_cv.set_title('LightGBM CV-results')
ax_cv.set_xlabel('Boosting round')
ax_cv.set_ylabel('AUC')

fig.tight_layout()
plt.show()