In [19]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
from lib import load_train, load_test

In [4]:
train = load_train(nrows=100000)
test = load_test()

In [5]:
test_supplement = pd.read_csv('data/test_supplement.csv')

In [16]:
train.groupby('ip').agg({'click_time': np.fft})

TypeError: 'module' object is not callable

In [3]:
def process_date(df, col):
    df['%s_day'%col] = df[col].apply(lambda x: x.day)
    df['%s_hour'%col] = df[col].apply(lambda x: x.hour)
    df['%s_minute'%col] = df[col].apply(lambda x: x.minute)
    df['%s_second'%col] = df[col].apply(lambda x: x.second)
    
    return df

In [9]:
train = process_date(train, 'click_time')
test = process_date(test, 'click_time')

NameError: name 'process_date' is not defined

In [None]:
X = train[['ip', 'app', 'device', 'os', 'channel']]
y = train['is_attributed']

In [10]:
KFOLD = 10
tscv = TimeSeriesSplit(n_splits=KFOLD)

rocs = []
for train_index, val_index in tscv.split(train):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]


    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # specify your configurations as a dict
#     params = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': {'l2', 'auc'},
#         'num_leaves': 31,
#         'learning_rate': 0.05,
#         'feature_fraction': 0.9,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 5,
#         'verbose': 0
#     }

    params = {
        'learning_rate': 0.05,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 31,  # 2^max_depth - 1
        'max_depth': 4,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        #'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight':99, # because training data is extremely unbalanced 
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        
        'metric': {'l2', 'auc'}
    }

    print('Start training...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5)

    print('Start predicting...')
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    roc = roc_auc_score(y_val, y_pred)
    
    print("ROC AUC = %f"%roc)
    rocs.append(roc)

print("Average ROC AUC = %f"%np.mean(rocs))

Start training...




[1]	valid_0's auc: 0.826955	valid_0's l2: 0.00166723
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.847834	valid_0's l2: 0.00163383
[3]	valid_0's auc: 0.847728	valid_0's l2: 0.00161284
[4]	valid_0's auc: 0.847649	valid_0's l2: 0.00159361
[5]	valid_0's auc: 0.847627	valid_0's l2: 0.00157815
[6]	valid_0's auc: 0.850269	valid_0's l2: 0.00155296
[7]	valid_0's auc: 0.853458	valid_0's l2: 0.00152945
[8]	valid_0's auc: 0.854005	valid_0's l2: 0.00151721
[9]	valid_0's auc: 0.871072	valid_0's l2: 0.00149669
[10]	valid_0's auc: 0.871076	valid_0's l2: 0.00148004
[11]	valid_0's auc: 0.870735	valid_0's l2: 0.00146957
[12]	valid_0's auc: 0.870737	valid_0's l2: 0.00145383
[13]	valid_0's auc: 0.870731	valid_0's l2: 0.00144507
[14]	valid_0's auc: 0.870684	valid_0's l2: 0.00143736
[15]	valid_0's auc: 0.870686	valid_0's l2: 0.00142409
Early stopping, best iteration is:
[10]	valid_0's auc: 0.871076	valid_0's l2: 0.00148004
Start predicting...
ROC AUC = 0.871076
Start trai