In [None]:
# The line below sets the environment
# variable CUDA_VISIBLE_DEVICES
get_ipython().magic('env CUDA_VISIBLE_DEVICES =  ')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path
import random
import time
from collections import OrderedDict
import io
from datetime import datetime
import gc # garbage collector
import sklearn
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import math
import sys
from collections import defaultdict
import re
import logging
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
get_ipython().magic('matplotlib inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')

## Write a pandas DataFrame to disk as a gzip-compressed CSV
- df.to_csv('dfsavename.csv.gz', compression='gzip')

## Read from disk
- df = pd.read_csv('dfsavename.csv.gz', compression='gzip')

## Useful magics
- %%timeit for the whole cell
- %timeit for the specific line
- %%latex to render the cell as a block of latex
- %prun and %%prun

In [None]:
# Root directory holding the dataset and every artifact derived from it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/WSDM/'
# HDF5 store from which 'train_data', 'test_data' and 'test_id' are read.
HDF_FILENAME = DATASET_PATH + 'datas.h5'
# Submission file template; '{}' is filled with a timestamp when a submission is written.
SUBMISSION_FILENAME = DATASET_PATH + 'submission_{}.csv'
# Store holding the row positions reserved for validation.
# NOTE(review): despite the '.csv' suffix this path is opened with pd.HDFStore further down.
VALIDATION_INDICE = DATASET_PATH + 'validation_indice.csv'

In [None]:
def set_logging(logger_name, logger_file_name):
    """Return a DEBUG-level logger that writes to a file and to the console.

    The function is safe to call repeatedly (e.g. when the notebook cell is
    re-run): handlers attached to the named logger by an earlier call are
    removed and closed first, so each message is emitted exactly once per
    destination instead of once per call.

    Args:
        logger_name: name passed to ``logging.getLogger``.
        logger_file_name: path of the log file; it is opened in ``'w'`` mode,
            so an existing file is truncated.

    Returns:
        The configured ``logging.Logger``.
    """
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so without
    # this cleanup every call would stack another pair of handlers on it.
    for stale_handler in list(log.handlers):
        log.removeHandler(stale_handler)
        stale_handler.close()

    # create formatter and add it to the handlers
    print_formatter = logging.Formatter('%(message)s')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s_%(levelname)s: %(message)s')

    # create file handler which logs even debug messages
    fh = logging.FileHandler(logger_file_name, mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(file_formatter)
    log.addHandler(fh)
    # both output to console and file
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(print_formatter)
    log.addHandler(consoleHandler)

    return log

In [None]:
# Notebook-wide logger; messages go to the console and to music_xgboost.log under DATASET_PATH.
log = set_logging('MUSIC', DATASET_PATH + 'music_xgboost.log')
# Smoke-test message to confirm the logger is wired up.
log.info('here is an info message.')

In [None]:
# TRAIN_FILE = DATASET_PATH + 'train.csv'
# TEST_FILE = DATASET_PATH + 'test.csv'
# MEMBER_FILE = DATASET_PATH + 'members.csv'
# SONG_FILE = DATASET_PATH + 'fix_songs.csv'
# SONG_EXTRA_FILE = DATASET_PATH + 'song_extra_info.csv'

# train_data = pd.read_csv(TRAIN_FILE)
# test_data = pd.read_csv(TEST_FILE)
# member_data = pd.read_csv(MEMBER_FILE)
# song_data = pd.read_csv(SONG_FILE)
# song_extra_data = pd.read_csv(SONG_EXTRA_FILE)

# songs_all = pd.merge(left = song_data, right = song_extra_data, how = 'left', on='song_id')
# train_with_mem = pd.merge(left = train_data, right = member_data, how = 'left', on='msno')
# train_all = pd.merge(left = train_with_mem, right = songs_all, how = 'left', on='song_id')
# test_with_mem = pd.merge(left = test_data, right = member_data, how = 'left', on='msno')
# test_all = pd.merge(left = test_with_mem, right = songs_all, how = 'left', on='song_id')
# del train_with_mem, test_with_mem; gc.collect()

# def convert_unicode_to_str(df):
#     df.columns = df.columns.astype(str)
#     types = df.apply(lambda x: pd.api.types.infer_dtype(df.values))
#     #print(types)#mixed-integer
#     for col in types[types == 'mixed-integer'].index:
#         df[col] = df[col].astype(str)
#     for col in types[types == 'mixed'].index:
#         df[col] = df[col].astype(str)
#     return df

# store = pd.HDFStore(HDF_FILENAME)
# store['train_data'] = convert_unicode_to_str(train_all)
# store['test_data'] = convert_unicode_to_str(test_all)
# store['song_data'] = convert_unicode_to_str(songs_all)
# store['test_id'] = test_data.id
# store.close()

In [None]:
# Load the merged train/test frames and the test ids from the HDF5 store.
# The context manager closes the store even when one of the reads fails, and
# read-only mode guarantees this cell can never create or modify the file.
# For a quick smoke run, slice each object with [0:100] after reading it.
with pd.HDFStore(HDF_FILENAME, mode='r') as store_test:
    train = store_test['train_data']
    test = store_test['test_data']
    test_id = store_test['test_id']

In [None]:
def split_country(input_data):
    """Extract the two-letter country prefix from the 'isrc' column.

    Args:
        input_data: DataFrame with an 'isrc' column. Missing codes may be real
            NaN values or the literal string 'nan'.

    Returns:
        A Series aligned with ``input_data`` holding the first two characters
        of each ISRC, with 'QM' and 'US' both rewritten to 'QZ'. Rows without
        a usable ISRC are NaN.
    """
    def get_country(isrc):
        if isinstance(isrc, str) and isrc != 'nan':
            return isrc[0:2]
        return np.nan

    # Read from the frame that was passed in; the previous version always read
    # the global `train`, so the result for any other frame was built from
    # train rows.
    countries = input_data['isrc'].apply(get_country)

    # Identity mapping for every observed prefix, with the three prefixes
    # collapsed into the single 'QZ' bucket.
    country_map = {code: code for code in countries.dropna().unique()}
    country_map['QM'] = 'QZ'
    country_map['US'] = 'QZ'
    return countries.map(country_map)

In [None]:
# Derive the ISRC country prefix for both frames (must run before 'isrc' is dropped).
train['country'] = split_country(train)
test['country'] = split_country(test)

In [None]:
def isrc_to_year(isrc, pivot_year=17):
    """Convert the two-digit year embedded in an ISRC into a four-digit year.

    Characters 5-6 (zero-based) of the code are read as the year of reference.

    Args:
        isrc: ISRC string. NaN, the literal string 'nan', and codes whose year
            field is not exactly two decimal digits yield NaN.
        pivot_year: two-digit years greater than this value are placed in the
            1900s, all others in the 2000s. Defaults to 17.

    Returns:
        The four-digit year as an int, or ``np.nan`` when no year can be read.
    """
    if not isinstance(isrc, str) or isrc == 'nan':
        return np.nan

    year_digits = isrc[5:7]
    # Truncated or malformed codes used to raise ValueError inside int();
    # treat them like a missing code instead of aborting the whole apply().
    if len(year_digits) != 2 or not year_digits.isdecimal():
        return np.nan

    two_digit_year = int(year_digits)
    century = 1900 if two_digit_year > pivot_year else 2000
    return century + two_digit_year
        
# Derive the release year from the ISRC, then drop the raw code from both frames.
train['song_year'] = train['isrc'].apply(isrc_to_year)
test['song_year'] = test['isrc'].apply(isrc_to_year)
train.drop(['isrc'], axis = 1, inplace = True)
test.drop(['isrc'], axis = 1, inplace = True)

In [None]:
def split_reg_date(input_data):
    """Add registration_year / registration_month / registration_day columns.

    Each value of 'registration_init_time' is rendered with ``str`` and sliced
    as YYYYMMDD; every resulting column is downcast to the smallest unsigned
    integer dtype. The frame is modified in place and also returned.
    """
    date_parts = (
        ('registration_year', slice(0, 4)),
        ('registration_month', slice(4, 6)),
        ('registration_day', slice(6, 8)),
    )
    for column_name, span in date_parts:
        # `span=span` binds the current slice to the lambda at definition time.
        parsed = input_data['registration_init_time'].apply(
            lambda raw, span=span: int(str(raw)[span]))
        input_data[column_name] = pd.to_numeric(parsed, downcast='unsigned')

    return input_data

In [None]:
def split_expir_date(input_data):
    """Add expiration_year / expiration_month / expiration_day columns.

    Each value of 'expiration_date' is rendered with ``str`` and sliced as
    YYYYMMDD; every resulting column is downcast to the smallest unsigned
    integer dtype. The frame is modified in place and also returned.
    """
    date_parts = (
        ('expiration_year', slice(0, 4)),
        ('expiration_month', slice(4, 6)),
        ('expiration_day', slice(6, 8)),
    )
    for column_name, span in date_parts:
        # `span=span` binds the current slice to the lambda at definition time.
        parsed = input_data['expiration_date'].apply(
            lambda raw, span=span: int(str(raw)[span]))
        input_data[column_name] = pd.to_numeric(parsed, downcast='unsigned')

    return input_data

In [None]:
def date_to_day(input_data):
    """Convert both date columns to datetimes and add the span between them.

    'registration_init_time' and 'expiration_date' are parsed with the
    ``%Y%m%d`` format and overwritten in place; a new 'days' column holds the
    whole number of days from registration to expiration.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Convert the registration and expiration times.
    for column in ('registration_init_time', 'expiration_date'):
        input_data[column] = pd.to_datetime(input_data[column], format="%Y%m%d")

    membership_span = input_data['expiration_date'] - input_data['registration_init_time']
    # Vectorized accessor instead of a Python-level loop over every row.
    input_data['days'] = membership_span.dt.days

    return input_data

In [None]:
# The split_* helpers slice the raw YYYYMMDD values as strings, so they have to
# run before date_to_day overwrites those columns with datetimes.
train = split_reg_date(train)
test = split_reg_date(test)
train = split_expir_date(train)
test = split_expir_date(test)

# Adds the 'days' column (expiration minus registration).
train = date_to_day(train)
test = date_to_day(test)

# The raw date columns are no longer needed once the derived features exist.
train.drop('registration_init_time',axis=1,inplace=True)
train.drop('expiration_date',axis=1,inplace=True)
test.drop('registration_init_time',axis=1,inplace=True)
test.drop('expiration_date',axis=1,inplace=True)

In [None]:
# Entries equal to the string 'nan' are replaced by '235415.0' before the numeric conversion.
# NOTE(review): 235415.0 looks like a precomputed typical song length -- confirm where it comes from.
train['song_length'] = pd.to_numeric(train['song_length'].replace('nan', '235415.0'), downcast='unsigned')
test['song_length'] = pd.to_numeric(test['song_length'].replace('nan', '235415.0'), downcast='unsigned')

In [None]:
# Inspect the dtype of every training column before encoding.
for col in train.columns: print(col, ':', train[col].dtype)

In [None]:
# Cast every object-typed column shared by train and test to 'category'.
# 'id' is skipped because it is looked up in train, which the loop assumes has
# every other test column. The dtype check is done on train only; test is cast
# along with it.
for col in [col for col in test.columns if col != 'id' ]:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [None]:
# Encode registered_via as integers. Counts are taken over train and test together;
# only the single least frequent value (last entry of value_counts) is merged into
# the most frequent one (first entry).
registered_via_hist = pd.concat([train['registered_via'], test['registered_via']], axis = 0).value_counts()
registered_via_map = dict(zip(registered_via_hist.index, [int(s) for s in registered_via_hist.index.values]))
registered_via_map[registered_via_hist.index[-1]] = int(str(registered_via_hist.index.values[0]))
train['registered_via'] = train['registered_via'].map(registered_via_map)
test['registered_via'] = test['registered_via'].map(registered_via_map)

In [None]:
# Encode language as integers; the placeholder 'nan' is replaced by the most
# frequent real language. Counts are taken over train and test together.
language_hist = pd.concat([train['language'], test['language']], axis = 0).value_counts()
# Filter the keys once and derive each value from its own key. The previous
# version filtered only the value list, so every key that followed 'nan' in
# frequency order was paired with its neighbour's value.
language_keys = [key for key in language_hist.index.values if key != 'nan']
language_map = {key: int(float(key)) for key in language_keys}
# language_keys is ordered by descending frequency, so element 0 is the most
# common real language even when 'nan' itself is the most common entry.
language_map['nan'] = int(float(language_keys[0]))
train['language'] = train['language'].map(language_map)
test['language'] = test['language'].map(language_map)

In [None]:
# Encode country as integer ranks by descending frequency (train and test counted
# together); the least frequent `merge_per` share of countries is collapsed into one
# shared code. Missing countries are not filled here: value_counts skips NaN, so
# they stay NaN after the map.
# NOTE(review): when int(len(country_hist)*merge_per) is 0 the slice [-0:] selects
# every country, so all of them would be merged -- only safe with >= 4 distinct values.
country_hist = pd.concat([train['country'], test['country']], axis = 0).value_counts()
merge_per = 0.25
country_map = dict(zip(country_hist.index, list(range(len(country_hist)))))
for key in list(country_hist[-int(len(country_hist)*merge_per):].index):
    country_map[key] = int(len(country_hist)*(1-merge_per)) + 1
train['country'] = train['country'].map(country_map)
test['country'] = test['country'].map(country_map)

In [None]:
# msno : category ; uinque values: 30755
# song_id : category ; uinque values: 359966
# - source_system_tab : category ; uinque values: 10
# - source_screen_name : category ; uinque values: 21
# - source_type : category ; uinque values: 13
# - target : object ; uinque values: 2
# - city : category ; uinque values: 21
# - bd : category ; uinque values: 92
# - gender : category ; uinque values: 3
# - registered_via : category ; uinque values: 5
# song_length : uint32 ; uinque values: 60271
# genre_ids : category ; uinque values: 573
# artist_name : category ; uinque values: 40587
# composer : category ; uinque values: 76072
# lyricist : category ; uinque values: 33895
# - language : category ; uinque values: 11
# name : category ; uinque values: 234144
# - country : category ; uinque values: 107
# - song_year : float64 ; uinque values: 100
# - registration_year : uint16 ; uinque values: 14
# - registration_month : uint8 ; uinque values: 12
# - registration_date : uint8 ; uinque values: 31
# - expiration_year : uint16 ; uinque values: 18
# - expiration_month : uint8 ; uinque values: 12

In [None]:
def one_hot_transform(input_train_data, input_test_data, columns_to_transform):
    """Replace each listed column by integer labels shared between both frames.

    Despite the name, no one-hot expansion takes place: for every column a
    LabelEncoder is fitted on the union of the values seen in train and test
    and the column is overwritten in place with the encoded labels.

    Returns:
        The ``(input_train_data, input_test_data)`` pair that was passed in.
    """
    for column in columns_to_transform:
        encoder = LabelEncoder()
        seen_values = list(input_train_data[column].unique())
        seen_values = seen_values + list(input_test_data[column].unique())
        encoder.fit(seen_values)
        # Train is encoded first, then test, matching the fit vocabulary above.
        for frame in (input_train_data, input_test_data):
            frame[column] = encoder.transform(frame[column])
    return input_train_data, input_test_data

In [None]:
# Integer-label the listed categorical columns with a vocabulary shared by train and test.
train, test = one_hot_transform(train, test, ['source_system_tab', 'source_screen_name', 'source_type', 'city', 'gender', 'bd', 'name', 'artist_name', 'composer', 'lyricist'])

In [None]:
# TODO: wether song_id should be merged like this or not? 231475 reserved and 188364 merged
def encode_with_merge(input_train, input_test, columns, merge_value):
    """Frequency-rank encode columns, merging rare values into one shared code.

    For each column the occurrences are counted over train and test together.
    Values whose count equals the matching entry of ``merge_value`` all receive
    the single code ``len(reserved) + 1``; every other value gets its rank in
    descending-frequency order (0, 1, 2, ...). The code ``len(reserved)`` is
    therefore left unused.

    Args:
        input_train: training DataFrame, modified in place.
        input_test: test DataFrame, modified in place.
        columns: names of the columns to encode.
        merge_value: one occurrence count per entry of ``columns``; values
            seen exactly that many times are merged.

    Returns:
        The ``(input_train, input_test)`` pair that was passed in.
    """
    for index, col in enumerate(columns):
        # Count over train AND test. The previous version concatenated train
        # with itself, which doubled every count (so a merge_value of 1 could
        # never match) and left test-only values unmapped.
        values_hist = pd.concat([input_train[col], input_test[col]], axis = 0).value_counts()
        is_merged = values_hist == merge_value[index]
        reserve_rows = values_hist[~is_merged]
        merge_rows = values_hist[is_merged]

        reserve_dict = dict(zip(list(reserve_rows.index), list(range(len(reserve_rows)))))
        merge_dict = dict.fromkeys(merge_rows.index, len(reserve_rows)+1)

        map_dict = {**reserve_dict, **merge_dict}

        input_train[col] = input_train[col].map(map_dict)
        input_test[col] = input_test[col].map(map_dict)
    return input_train, input_test

In [None]:
# Frequency-encode the listed columns, merging values whose count equals 1
# (the merge_value passed for each column) into one shared code.
train, test = encode_with_merge(train, test, ['msno', 'song_id', 'genre_ids'], [1, 1, 1])
# print(train.head())
# print(test.head())

In [None]:
# pd.to_numeric(train['language'], downcast='signed')

In [None]:
# Report dtype and number of distinct non-null values for every test column.
for col in test.columns: print(col, ':', test[col].dtype, '; uinque values:', len(test[col].value_counts()))

In [None]:
# Report dtype and number of distinct non-null values for every training column.
for col in train.columns: print(col, ':', train[col].dtype, '; uinque values:', len(train[col].value_counts()))

In [None]:
# Split off the rows reserved for validation.
# The context manager closes the store even if the read fails; read-only mode
# guarantees the index file is never created or modified from here.
with pd.HDFStore(VALIDATION_INDICE, mode='r') as store_test:
    validation_list = store_test['keep_index']['index'].values
train['target'] = pd.to_numeric(train['target'], downcast='signed')
# validation_list is used as row POSITIONS (iloc). Translate the positions to
# index labels before dropping so both halves of the split agree even when the
# index of `train` is not a plain 0..n-1 range.
validation_use = train.iloc[validation_list].copy(deep=True).reset_index(drop=True)
train_use = train.drop(train.index[validation_list])
# For a quick smoke run on a 100-row sample, use rows 50..99 as validation instead.

In [None]:
def param_tune_with_val(params, tune_param, param_list, data_list, eval_metric, early_stopping_rounds, less_prefered = False):
    """Pick the best value of one XGBClassifier parameter on a validation set.

    One model is trained per candidate value. Candidates are ranked by the
    ROC AUC of the validation predictions; ``eval_metric`` is only forwarded
    to ``fit`` (where it drives early stopping) and does not change the
    ranking score.

    Args:
        params: base keyword arguments for ``XGBClassifier``. Not modified.
        tune_param: name of the parameter being tuned.
        param_list: non-empty indexable sequence of candidate values.
        data_list: ``{'train': {'x': ..., 'y': ...}, 'validation': {'x': ..., 'y': ...}}``.
        eval_metric: metric name passed to ``fit``.
        early_stopping_rounds: early-stopping patience passed to ``fit``.
        less_prefered: when True the smallest score wins instead of the largest.

    Returns:
        The candidate value with the best validation score; the first
        candidate wins ties.
    """
    best_metric = sys.float_info.max if less_prefered else -sys.float_info.max
    best_param = param_list[0]

    # Work on a copy so the caller's dict is not left holding the last
    # candidate that happened to be tried.
    trial_params = dict(params)

    for par_value in param_list:
        trial_params[tune_param] = par_value

        model = XGBClassifier(**trial_params)
        model.fit(data_list['train']['x'], data_list['train']['y'],
                eval_set=[(data_list['validation']['x'], data_list['validation']['y'])],
                eval_metric=eval_metric, early_stopping_rounds = early_stopping_rounds)

        val_predprob = model.predict_proba(data_list['validation']['x'])[:,1]
        auroc_score = metrics.roc_auc_score(data_list['validation']['y'], val_predprob)

        if less_prefered:
            improved = auroc_score < best_metric
        else:
            improved = auroc_score > best_metric
        if improved:
            best_metric = auroc_score
            best_param = par_value
    log.info('best param for {}: {}, metric: {}'.format(tune_param, best_param, best_metric))
    return best_param

In [None]:
def search_for_best_params(train, validation, test):
    """Greedy, one-parameter-at-a-time search over XGBClassifier settings.

    Parameters are visited in the order of ``params_to_eval``. For each one,
    the best candidate found by ``param_tune_with_val`` is written into the
    running parameter set before the next parameter is tuned; a parameter with
    a single candidate is simply set to that value.

    Args:
        train: training DataFrame containing a 'target' column.
        validation: validation DataFrame containing a 'target' column.
        test: unused; kept so existing callers keep working. (The previous
            version converted it to a dense array that was never read.)

    Returns:
        dict of the tuned XGBClassifier keyword arguments.
    """
    X_train = np.array(train.drop(['target'], axis=1))
    y_train = train['target'].values

    X_valid = np.array(validation.drop(['target'], axis=1))
    y_valid = validation['target'].values

    data_list = {'train':{'x':X_train,'y':y_train}, 'validation':{'x':X_valid,'y':y_valid}}

    # Insertion order is the tuning order.
    params_to_eval = OrderedDict(
        ( ('max_depth', range(12,16,1)),# typical: 3-10
        ('min_child_weight', range(1,6,2)),# too high will lead to under-fitting
        ('gamma',[i/10.0 for i in range(0,5)]),# the minimum loss reduction required to make a split
        ('subsample',[i/10.0 for i in range(6,10)]),# typical: 0.5-1
        ('colsample_bytree',[i/10.0 for i in range(6,10)]),# typical: 0.5-1
        ('reg_lambda',[0.01, 0.1, 1]),
        ('reg_alpha',[1e-5, 1e-2, 0.1, 1, 100]),
        ('learning_rate',[0.01]), # typical: 0.01-0.2
        ('n_estimators',range(400,800,100)) )
      )

    # Starting point; each entry is overwritten as soon as its parameter is tuned.
    initial_params={
            'n_estimators':400,
            'objective': 'binary:logistic',
            'learning_rate': 0.1,
            'gamma':0.1,
            'subsample':0.8,
            'colsample_bytree':0.8,
            'min_child_weight':1,
            'max_depth':8,
            'reg_lambda':1,
            'reg_alpha':0,
            'seed':1024,
            'nthread':10,
            'silent':True
        }
    # Only parameters in this list are tuned; the full list is ['n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'max_depth', 'learning_rate', 'gamma']
    tuned_param_name = ['n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'max_depth', 'learning_rate', 'gamma']
    for par_name, par_list in params_to_eval.items():
        if par_name in tuned_param_name:
            log.info('tunning {}...'.format(par_name))
            if len(par_list) > 1:
                initial_params[par_name] = param_tune_with_val(initial_params, par_name, par_list, data_list, 'auc', 3)
            else:
                initial_params[par_name] = par_list[0]

    return initial_params

In [None]:
# Run the greedy parameter search and log the result together with the wall-clock time it took.
start_time = time.time()
best_param = search_for_best_params(train_use, validation_use, test)
log.info(best_param)
time_elapsed = time.time() - start_time
log.info('time used: {:.3f}sec'.format(time_elapsed))

In [None]:
# Manual experiment: compare three max_depth values with a fixed parameter set.
X_train = np.array(train_use.drop(['target'], axis=1))
y_train = train_use['target'].values

X_valid = np.array(validation_use.drop(['target'], axis=1))
y_valid = validation_use['target'].values

# NOTE(review): X_test is built here but not used anywhere in this cell.
X_test = np.array(test.drop(['id'], axis=1))

# d_train = xgb.DMatrix(X_train)
# d_valid = xgb.DMatrix(X_valid) 
# d_test = xgb.DMatrix(X_test)

data_list = {'train':{'x':X_train,'y':y_train}, 'validation':{'x':X_valid,'y':y_valid}}
# Train model, evaluate and make predictions
params={
    'n_estimators':500,
    'objective': 'binary:logistic',
    'learning_rate': 0.75,
    'gamma':0.1,
    'subsample':0.8,
    'colsample_bytree':0.3,
    'min_child_weight':3,
    'max_depth':16,
    'seed':1024,
    }

# The returned best value is discarded; the outcome is only visible in the log.
param_tune_with_val(params, 'max_depth', [5,1,6], data_list, 'auc', 20)

# model = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=20, \
#     maximize=True, verbose_eval=5)



In [None]:
# Train with the native xgboost API, report train/validation metrics and
# predict the test set.
X_train = np.array(train_use.drop(['target'], axis=1))
y_train = train_use['target'].values

X_valid = np.array(validation_use.drop(['target'], axis=1))
y_valid = validation_use['target'].values

X_test = np.array(test.drop(['id'], axis=1))

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(X_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train model, evaluate and make predictions
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.75
params['max_depth'] = 16
params['silent'] = 1
params['eval_metric'] = 'auc'

model = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=20, \
    maximize=True, verbose_eval=5)

# xgb.train returns a Booster: it predicts on DMatrix objects only and has no
# predict_proba. With the binary:logistic objective, predict() already yields
# the probability of the positive class, so class labels are derived by
# thresholding at 0.5.
train_predprob = model.predict(d_train)
train_predictions = (train_predprob > 0.5).astype(int)

val_predprob = model.predict(d_valid)
val_predictions = (val_predprob > 0.5).astype(int)

#Print model report:
print("\nModel Report")
print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, train_predictions))
print("Train AUC Score (Train): %f" % metrics.roc_auc_score(y_train, train_predprob))
print("ValAccuracy : %.4g" % metrics.accuracy_score(y_valid, val_predictions))
print("Validation AUC Score (Train): %f" % metrics.roc_auc_score(y_valid, val_predprob))

# The importance scores come from the Booster trained above (the previous
# code referenced an undefined name `alg`).
feat_imp = pd.Series(model.get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

p_test = model.predict(d_test)

In [None]:
# Baseline XGBClassifier evaluated through modelfit.
# NOTE(review): modelfit is only defined in later cells, so on a fresh kernel this
# cell raises NameError when run in order; an identical cell follows the definitions.
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train_use.drop(['target'],axis=1), train_use['target'], validation_use.drop(['target'],axis=1), validation_use['target'])

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='Disbursed'):
    """Fit ``alg`` on ``dtrain[predictors]`` and report training-set metrics.

    NOTE(review): a later cell defines another ``modelfit`` with a different
    signature; once that cell has run, it replaces this definition.

    Args:
        alg: an xgboost scikit-learn estimator (e.g. ``XGBClassifier``).
        dtrain: DataFrame holding both the predictor and the label columns.
        predictors: list of predictor column names.
        useTrainCV: when True, ``xgb.cv`` with early stopping picks the number
            of boosting rounds and ``alg`` is updated with it before fitting.
        cv_folds: number of cross-validation folds.
        early_stopping_rounds: early-stopping patience for ``xgb.cv``.
        target: name of the label column. Defaults to 'Disbursed', the column
            that used to be hard-coded (the CV branch referenced an undefined
            name ``target`` instead).
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # `verbose_eval=False` is the supported way to silence xgb.cv; the
        # old `show_progress` keyword is rejected by current releases.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # `booster()` was renamed to `get_booster()`; support both spellings.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
def modelfit(alg, train, label, validation, val_label, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` and report accuracy / ROC AUC on train and validation data.

    Args:
        alg: an xgboost scikit-learn estimator (e.g. ``XGBClassifier``).
        train: DataFrame of training features.
        label: Series of training labels.
        validation: DataFrame of validation features.
        val_label: Series of validation labels.
        useTrainCV: when True, ``xgb.cv`` with early stopping picks the number
            of boosting rounds and ``alg`` is updated with it before fitting.
        cv_folds: number of cross-validation folds.
        early_stopping_rounds: early-stopping patience for ``xgb.cv``.

    Side effects: prints a model report and draws a bar chart of the feature
    importance scores on the current matplotlib figure.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train.values, label=label.values)
        # `verbose_eval=False` is the supported way to silence xgb.cv; the
        # old `show_progress` keyword is rejected by current releases.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(train, label, eval_metric='auc')

    #Predict training set:
    train_predictions = alg.predict(train)
    train_predprob = alg.predict_proba(train)[:,1]

    val_predictions = alg.predict(validation)
    val_predprob = alg.predict_proba(validation)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Train Accuracy : %.4g" % metrics.accuracy_score(label.values, train_predictions))
    print("Train AUC Score (Train): %f" % metrics.roc_auc_score(label, train_predprob))
    print("ValAccuracy : %.4g" % metrics.accuracy_score(val_label.values, val_predictions))
    print("Validation AUC Score (Train): %f" % metrics.roc_auc_score(val_label, val_predprob))

    # `booster()` was renamed to `get_booster()`; support both spellings.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Baseline XGBClassifier; modelfit tunes n_estimators via cross-validation and
# prints train/validation accuracy and AUC.
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train_use.drop(['target'],axis=1), train_use['target'], validation_use.drop(['target'],axis=1), validation_use['target'])

In [None]:
import lightgbm as lgb

# Single LightGBM model trained on train_use and monitored on validation_use.
train_data = lgb.Dataset(train_use.drop(['target'],axis=1), label=train_use['target'])
val_data = lgb.Dataset(validation_use.drop(['target'],axis=1), label=validation_use['target'])

# NOTE(review): 'num_rounds' below and the positional 100 passed to lgb.train
# both specify the number of boosting rounds -- confirm which one is intended.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.1 ,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 128,
    'max_depth': 10,
    'num_rounds': 200,
    'metric' : 'auc',
    }

bst = lgb.train(params, train_data, 100, valid_sets=[val_data])
# Only one model is trained here, so its output is used as is. The previous
# version divided it by 3 (a leftover from the 3-fold averaging cell), which
# shrank every predicted probability to a third of its value.
predictions = bst.predict(test.drop(['id'],axis=1))
print('finished.')

submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)

In [None]:
import lightgbm as lgb

# K-fold LightGBM: train one model per fold and average the test predictions.
N_FOLDS = 3
kf = KFold(n_splits=N_FOLDS)

predictions = np.zeros(shape=[len(test)])

# Built once instead of once per fold per Dataset.
fold_features = train.drop(['target'],axis=1)
fold_labels = train['target']
test_features = test.drop(['id'],axis=1)

for train_indices,val_indices in kf.split(train) :
    # KFold yields row POSITIONS, so select with iloc; label-based .loc only
    # gives the same rows while the index is a plain 0..n-1 range.
    train_data = lgb.Dataset(fold_features.iloc[train_indices],label=fold_labels.iloc[train_indices])
    val_data = lgb.Dataset(fold_features.iloc[val_indices],label=fold_labels.iloc[val_indices])

    # A fresh dict per fold, so a training call that modifies the dict it is
    # given cannot affect the next fold.
    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 128,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc',
        }

    bst = lgb.train(params, train_data, 100, valid_sets=[val_data])
    predictions+=bst.predict(test_features)
    print('cur fold finished.')
    del bst

# Average over the folds actually trained rather than a hard-coded 3.
predictions = predictions/N_FOLDS

submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv(SUBMISSION_FILENAME.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),index=False)

In [None]:
# Preprocess songs data: expand the '|'-separated genre_ids into one 0/1
# column per genre.
# NOTE(review): `songs` is only loaded in a later cell of this notebook.
# Songs without a genre (NaN or the string 'nan') get an empty list instead of
# failing inside int().
songs_genres = songs['genre_ids'].apply(
    lambda x: [int(v) for v in str(x).split('|')] if pd.notnull(x) and str(x) != 'nan' else [])
# Distinct genres across all songs, and each genre's column position. (numpy
# arrays have neither .unique() nor .find(), which the previous code called.)
genres_list = sorted({genre for s_genres in songs_genres for genre in s_genres})
genre_index = {genre: g_i for g_i, genre in enumerate(genres_list)}
print('Number of genres: ' + str(len(genres_list)))

ohe_genres = np.zeros((len(songs_genres), len(genres_list)))
for s_i, s_genres in enumerate(songs_genres):
    for genre in s_genres:
        ohe_genres[s_i, genre_index[genre]] = 1

for g_i, g in enumerate(genres_list):
    songs['genre_' + str(g)] = ohe_genres[:, g_i]
print(songs.head())
songs = songs.drop(['genre_ids'], axis=1)

song_cols = songs.columns

# Preprocess dataset: fill missing values with -1 and label-encode every
# object-typed column with a vocabulary shared by train and test.
train = train.fillna(-1)
test = test.fillna(-1)

cols = list(train.columns)
cols.remove('target')

# Plain iteration: tqdm is never imported in this notebook, so wrapping the
# loop in tqdm() raised NameError.
for col in cols:
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

        print(col + ': ' + str(len(train_vals)) + ', ' + str(len(test_vals)))


In [None]:
########################################
## import packages
########################################

import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD

########################################
## load the data
########################################

train = pd.read_csv('./data/train.csv')
uid = train.msno
sid = train.song_id
target = train.target

test = pd.read_csv('./data/test.csv')
id_test = test.id
uid_test = test.msno
sid_test = test.song_id

########################################
## encoding
########################################

# Each encoder is fitted on the union of train and test ids so that every id
# in either file gets a label. pd.concat replaces Series.append, which was
# removed in pandas 2.0.
usr_encoder = LabelEncoder()
usr_encoder.fit(pd.concat([uid, uid_test]))
uid = usr_encoder.transform(uid)
uid_test = usr_encoder.transform(uid_test)

sid_encoder = LabelEncoder()
sid_encoder.fit(pd.concat([sid, sid_test]))
sid = sid_encoder.transform(sid)
sid_test = sid_encoder.transform(sid_test)

# Embedding table sizes: the largest label in either set plus one.
u_cnt = int(max(uid.max(), uid_test.max()) + 1)
s_cnt = int(max(sid.max(), sid_test.max()) + 1)

########################################
## train-validation split
########################################

# Random 85% / 15% split.
# NOTE(review): no seed is set, so the split differs from run to run.
perm = np.random.permutation(len(train))
trn_cnt = int(len(train) * 0.85)
uid_trn = uid[perm[:trn_cnt]]
uid_val = uid[perm[trn_cnt:]]
sid_trn = sid[perm[:trn_cnt]]
sid_val = sid[perm[trn_cnt:]]
# Index the label values positionally, like the encoded id arrays above.
target_trn = target.values[perm[:trn_cnt]]
target_val = target.values[perm[trn_cnt:]]

########################################
## define the model
########################################

def get_model():
    """Build and compile the user/song embedding network.

    Each id is looked up in its own 64-dimensional embedding table (sized by
    the module-level ``u_cnt`` / ``s_cnt``). The two vectors and their dot
    product are concatenated, passed through a 128-unit ReLU layer with 50%
    dropout, and reduced to a single sigmoid output. The model is compiled
    with binary cross-entropy, RMSprop (lr 1e-3) and the 'acc' metric.

    Returns:
        The compiled Keras ``Model`` taking ``[user_ids, song_ids]`` as input.
    """
    embedding_dim = 64

    def build_embedding(vocabulary_size):
        # Both tables share every setting except their vocabulary size.
        return Embedding(vocabulary_size,
                embedding_dim,
                embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
                embeddings_regularizer=l2(1e-4),
                input_length=1,
                trainable=True)

    user_embeddings = build_embedding(u_cnt)
    song_embeddings = build_embedding(s_cnt)

    # User branch: (batch, 1) ids -> (batch, 1, 64) lookup -> (batch, 64) vector.
    uid_input = Input(shape=(1,), dtype='int32')
    user_vector = user_embeddings(uid_input)
    user_vector = Reshape((embedding_dim,))(user_vector)

    # Song branch, same shape handling.
    sid_input = Input(shape=(1,), dtype='int32')
    song_vector = song_embeddings(sid_input)
    song_vector = Reshape((embedding_dim,))(song_vector)

    interaction = dot([user_vector, song_vector], axes=1)
    merged = concatenate([user_vector, song_vector, interaction])

    hidden = Dense(128, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[uid_input, sid_input], outputs=output)

    optimizer = RMSprop(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model

########################################
## train the model
########################################
   
model = get_model()
# Stop once validation accuracy has not improved for 5 epochs.
early_stopping =EarlyStopping(monitor='val_acc', patience=5)
model_path = 'bst_model.h5'
# Keep only the best weights seen so far on disk.
# NOTE(review): no `monitor` is passed here while early stopping watches 'val_acc' --
# confirm both callbacks are meant to track the same quantity.
model_checkpoint = ModelCheckpoint(model_path, save_best_only=True, \
        save_weights_only=True)

hist = model.fit([uid_trn, sid_trn], target_trn, validation_data=([uid_val, sid_val], \
        target_val), epochs=100, batch_size=32768, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])
# Restore the checkpointed weights before evaluating and predicting.
model.load_weights(model_path)

# Validation ROC AUC; it is embedded in the submission file name below.
preds_val = model.predict([uid_val, sid_val], batch_size=32768)
val_auc = roc_auc_score(target_val, preds_val)

########################################
## make the submission
########################################

preds_test = model.predict([uid_test, sid_test], batch_size=32768, verbose=1)
sub = pd.DataFrame({'id': id_test, 'target': preds_test.ravel()})
sub.to_csv('./sub_%.5f.csv'%(val_auc), index=False)

In [None]:
# Linear algebra:
import numpy as np
import pandas as pd
# Graphics:
import matplotlib.pyplot as plt
import seaborn as sns  
# Frameworks:
import lightgbm as lgb # LightGBM
# Utils:
import gc # garbage collector
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

IDIR = '../input/' # main path
members = pd.read_csv(IDIR + 'members.csv')
songs = pd.read_csv(IDIR + 'songs.csv')
song_extra_info = pd.read_csv(IDIR + 'song_extra_info.csv')
train = pd.read_csv(IDIR + 'train.csv')
test = pd.read_csv(IDIR + 'test.csv')

# All merges below are left joins, so every train/test row is kept even when
# no matching song or member exists.
# Adding songs' info:
train_aug1 = pd.merge(left=train, right=songs, on='song_id', how='left')
test_aug1 = pd.merge(left=test, right=songs, on='song_id', how='left')
# Adding extra info about songs:
train_aug2 = pd.merge(left=train_aug1, right=song_extra_info, on='song_id', how='left')
test_aug2 = pd.merge(left=test_aug1, right=song_extra_info, on='song_id', how='left')
del train_aug1, test_aug1
# Adding users' info:
train_aug3 = pd.merge(left=train_aug2, right=members, on='msno', how='left')
test_aug3 = pd.merge(left=test_aug2, right=members, on='msno', how='left')
del train_aug2, test_aug2
# Merging train and test data; the 'set' column marks the origin (0 = train, 1 = test)
# and test rows get the placeholder target -1:
train_aug3.drop(['song_id'], axis=1, inplace=True)
train_aug3['set'] = 0
test_aug3.drop(['song_id'], axis=1, inplace=True)
test_aug3['set'] = 1
test_aug3['target'] = -1
all_aug = pd.concat([train_aug3, test_aug3], axis=0)
del train_aug3, test_aug3
gc.collect();



# source_system_tab/source_screen_name/source_type/genre_ids/artist_name/composer/lyricist/name/isrc/gender:
# fill missing values with 'NA' and label-encode them.
# NOTE(review): only genre_ids is actually encoded below; the other columns
# named above are left untouched by this cell.
# genre_ids encoding:
all_aug['genre_ids'] = all_aug.genre_ids.fillna('NA')
# Builtin `str` replaces the `np.str` alias, which NumPy has removed.
all_aug['genre_ids'] = all_aug.genre_ids.astype(str)
genre_ids_le = LabelEncoder()
genre_ids_le.fit(all_aug.genre_ids)
all_aug['genre_ids'] = genre_ids_le.transform(all_aug.genre_ids).astype(np.int16)

# language encoding (-2 marks a missing language):
all_aug['language'] = all_aug.language.fillna(-2)
all_aug['language'] = all_aug.language.astype(np.int8)

# city encoding:
all_aug['city'] = all_aug.city.astype(np.int8)
# bd encoding:
all_aug['bd'] = all_aug.bd.astype(np.int16)

# registered_via encoding:
all_aug['registered_via'] = all_aug.registered_via.astype(np.int8)
# registration_init_time encoding:
all_aug['registration_init_time'] = all_aug.registration_init_time.astype(np.int32)
# expiration_date encoding:
all_aug['expiration_date'] = all_aug.expiration_date.astype(np.int32)
# Info:
all_aug.info(max_cols=0)
all_aug.head(2)


# Difference of the two integer-typed date columns.
# NOTE(review): if these hold YYYYMMDD integers the result is not a number of
# days -- confirm this is the intended feature.
all_aug['exp_reg_time'] = all_aug.expiration_date - all_aug.registration_init_time



gc.collect();
# Training matrix: rows with set == 0, without the label and identifier columns.
# The .pop calls act on temporary filtered copies, so all_aug itself is unchanged.
d_train = lgb.Dataset(all_aug[all_aug.set == 0].drop(['target', 'msno', 'id', 'set'], axis=1), 
                      label=all_aug[all_aug.set == 0].pop('target'))
ids_train = all_aug[all_aug.set == 0].pop('msno')

# NOTE(review): learning_rate 1.0 is unusually aggressive for boosting -- confirm it is intended.
lgb_params = {
    'learning_rate': 1.0,
    'max_depth': 15,
    'num_leaves': 250, 
    'objective': 'binary',
    'metric': {'auc'},
    'feature_fraction': 0.8,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'max_bin': 100}
# 3-fold stratified CV with early stopping.
cv_result_lgb = lgb.cv(lgb_params, 
                       d_train, 
                       num_boost_round=5000, 
                       nfold=3, 
                       stratified=True, 
                       early_stopping_rounds=50, 
                       verbose_eval=100, 
                       show_stdv=True)

# The length of the recorded 'auc-mean' history is used as the round count for the final model.
num_boost_rounds_lgb = len(cv_result_lgb['auc-mean'])
print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))



%%time
# Train the final model on the full training matrix for the round count found by CV.
ROUNDS = num_boost_rounds_lgb
print('light GBM train :-)')
bst = lgb.train(lgb_params, d_train, ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
# del d_train
gc.collect()


# Left panel: feature importances of the final model; right panel: mean CV AUC
# per boosting round with a +/- one standard deviation band.
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
feature_imp = pd.Series(dict(zip(d_train.feature_name, 
                                 bst.feature_importance()))).sort_values(ascending=False)
sns.barplot(x=feature_imp.values, y=feature_imp.index.values, orient='h', color='g')
plt.subplot(1,2,2)
# Despite the `train_` prefix these are the cross-validation scores returned by lgb.cv.
train_scores = np.array(cv_result_lgb['auc-mean'])
train_stds = np.array(cv_result_lgb['auc-stdv'])
plt.plot(train_scores, color='green')
plt.fill_between(range(len(cv_result_lgb['auc-mean'])), 
                 train_scores - train_stds, train_scores + train_stds, 
                 alpha=0.1, color='green')
plt.title('LightGMB CV-results')
plt.show()