In [1]:
import os, time
# `start` is a scratch timer: it is re-assigned later in the notebook whenever an
# individual step (training, predicting) is timed.
start = time.time()
# `very_start` is never re-assigned; the final cell uses it to report the total
# notebook runtime.
very_start = time.time()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
# Raise pandas' display limits so wide/long frames are rendered without
# truncation (up to 500 columns and 500 rows).
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [3]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [4]:

# Dask configuration has to be in place *before* the cluster is started:
# worker processes are spawned by Client(...) and do not see config values that
# are set in this (client) process afterwards. Setting the spill/shuffle
# directory after the fact therefore leaves the worker on the default temp dir.
dask.config.set({
    'shuffle': 'disk',                           # disk-backed dataframe shuffles
    'temporary_directory': '/path/to/dask_tmp',  # scratch space for spill/shuffle files
})

# Single worker with many threads, started through the local-cluster shortcut
# of Client. NOTE(review): the `ip` value is a redacted placeholder and must be
# replaced with a real interface address before running.
client = Client(
    n_workers=1,
    threads_per_worker=96,
    memory_limit='480GB',
    ip='10.1.0.xxx',
)

<dask.config.set at 0x7f5e76409190>

# Load Train and Validation Data

In [5]:
%%time
# NOTE: `path` is not used by the reads below; it is kept because other cells
# may rely on it.
path = '/path/to/bin/tmp/'

# Lazily open the pre-processed train / validation parquet shards as Dask
# dataframes (nothing is read into memory yet).
train, valid = (
    dd.read_parquet(parquet_glob)
    for parquet_glob in (
        '/path/to/dask_tmp/train_10_raw/*.parquet',
        '/path/to/dask_tmp/valid_10_raw/*.parquet',
    )
)

CPU times: user 106 ms, sys: 14.5 ms, total: 121 ms
Wall time: 112 ms


In [6]:
# Columns that are removed from both splits before modelling.
cols_drop = ['tweet', 'tweet_nortsign']
train, valid = (frame.drop(cols_drop, axis=1) for frame in (train, valid))

In [7]:
# Count-style columns are cast to int32 on both splits.
dtype_names = ['count_words', 'count_char', 'count_ats']
int32_casts = dict.fromkeys(dtype_names, 'int32')
train = train.astype(int32_casts)
valid = valid.astype(int32_casts)

In [8]:
# The four engagement targets; one model is trained per entry, in this order.
label_names = [
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

# Columns that must never be fed to the model, followed by the targets
# themselves. Two account-creation columns are listed twice; the list is only
# used for membership tests here, so the repetition has no effect.
DONT_USE = [
    'tweet_id',
    'tweet_timestamp',
    'engaged_with_user_account_creation',
    'enaging_user_account_creation',
    'engage_time',
    'fold',
    'enaging_user_id',
    'engaged_with_user_id',
    'dt_dow',
    'engaged_with_user_account_creation',
    'enaging_user_account_creation',
    'elapsed_time',
    'present_links',
    'present_domains',
] + label_names

# Every remaining column of the training frame is a candidate feature.
features = [name for name in train.columns if name not in DONT_USE]

# Train and Validate Model
We train on a random 10% sample of the first 5 days and validate on the last 2 days.

In [9]:
%%time

# Fraction of the lazy training frame to keep; values >= 1.0 skip subsampling.
SAMPLE_RATIO = 0.5
# NOTE: SEED is defined here but the sampling below uses a fixed random_state=42.
SEED = 1

needs_subsample = SAMPLE_RATIO < 1.0
if needs_subsample:
    # Row count before sampling (forces a pass over the Dask frame).
    print(len(train))
    train = train.sample(frac=SAMPLE_RATIO, random_state=42)
    # Keep the sampled partitions on the cluster so later steps reuse them.
    train = dask.persist(train)[0]
    train.head()
    # Row count after sampling.
    print(len(train))

# Pull the (sampled) frame into a single in-memory pandas DataFrame.
train = train.compute()

# Separate the targets from the feature matrix.
Y_train = train[label_names]
train = train.drop(columns=label_names)

features = [name for name in train.columns if name not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

21749425
10874713
Using 121 features: 121
CPU times: user 11.2 s, sys: 12.6 s, total: 23.8 s
Wall time: 2min 41s


array(['hashtags', 'present_media', 'tweet_type', 'language',
       'engaged_with_user_follower_count',
       'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'enaging_user_follower_count',
       'enaging_user_following_count', 'enaging_user_is_verified',
       'engagee_follows_engager', 'len_hashtags', 'len_domains',
       'len_links', 'dt_hour', 'dt_minute', 'dt_second', 'count_words',
       'count_char', 'tw_uhash', 'tw_hash', 'count_ats', 'hash0', 'hash1',
       'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word',
       'tw_llast_word', 'tw_len', 'TE_present_media_reply_timestamp',
       'TE_tweet_type_reply_timestamp', 'TE_language_reply_timestamp',
       'TE_engaged_with_user_id_reply_timestamp',
       'TE_enaging_user_id_reply_timestamp',
       'TE_present_media_retweet_timestamp',
       'TE_tweet_type_retweet_timestamp', 'TE_language_retweet_timestamp',
       'TE_engaged_with_user_id_retweet_timestamp',
       'TE_enaging_

In [10]:
# Validation subsample fraction (original note: VAL SET NOW SIZE OF TEST SET).
SAMPLE_RATIO = 0.052
# NOTE: SEED is defined here but the sampling below uses a fixed random_state=42.
SEED = 1

needs_subsample = SAMPLE_RATIO < 1.0
if needs_subsample:
    # Row count before sampling.
    print(len(valid))
    valid = valid.sample(frac=SAMPLE_RATIO, random_state=42)
    # Keep the sampled partitions on the cluster.
    valid = valid.persist()
    valid.head()
    # Row count after sampling.
    print(len(valid))

# Materialise as pandas, then split the targets from the feature matrix.
valid = valid.compute()
Y_valid = valid[label_names]
valid = valid.drop(columns=label_names)

8585006
446420


In [31]:
import sys
sys.path.append("..")
from Models.GBM.LightGBM import LightGBM
# Tuned hyper-parameters shared by all four models.
# NOTE(review): num_leaves, max_bin and min_data_in_leaf are floats here;
# presumably the project's LightGBM wrapper converts them to integers -
# confirm against Models/GBM/LightGBM.
params = {
    'num_leaves': 61.35765882069168,
    'learning_rate': 0.07266779287696,
    'max_depth': 28,
    'lambda_l1': 50.0,
    'lambda_l2': 50.0,
    'colsample_bynode': 0.8,
    'colsample_bytree': 0.4,
    'bagging_fraction': 0.8,
    'bagging_freq': 7,
    'max_bin': 163.51837199855655,
    'min_data_in_leaf': 1282.3912530172006
}

# Settings that are identical for every model instance.
shared_kwargs = dict(
    objective='binary',
    num_threads=94,
    num_iterations=1500,
    early_stopping_rounds=20,
)

# Four independent, identically configured models; the training loop indexes
# this list by target position.
model2 = [LightGBM(**shared_kwargs, **params) for _ in range(4)]
LGBM0, LGBM1, LGBM2, LGBM3 = model2

In [32]:
# Sanity check: abort if any column name occurs more than once in the training
# frame, then report the shapes of both feature matrices.
has_duplicate_columns = train.columns.duplicated().any()
if has_duplicate_columns:
    raise Exception(f'duplicated!: { train.columns[train.columns.duplicated()] }')

print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {valid.shape}')

no dup :) 
X_train.shape (10874713, 121)
X_valid.shape (446420, 121)


In [33]:
%%time

# Boolean columns are converted to int8 in both splits. Detection is based on
# the training frame's dtypes only; the same columns are then cast in `valid`.
bool_columns = [name for name in train.columns if train[name].dtype == 'bool']
for name in bool_columns:
    train[name] = train[name].astype('int8')
    valid[name] = valid[name].astype('int8')

CPU times: user 1.25 ms, sys: 2 µs, total: 1.25 ms
Wall time: 1.27 ms


In [34]:
# Display the first rows of the training feature matrix as a visual check.
train.head()

Unnamed: 0,hashtags,present_media,tweet_type,language,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,engagee_follows_engager,len_hashtags,len_domains,len_links,dt_hour,dt_minute,dt_second,count_words,count_char,tw_uhash,tw_hash,count_ats,hash0,hash1,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,TE_present_media_reply_timestamp,TE_tweet_type_reply_timestamp,TE_language_reply_timestamp,TE_engaged_with_user_id_reply_timestamp,TE_enaging_user_id_reply_timestamp,TE_present_media_retweet_timestamp,TE_tweet_type_retweet_timestamp,TE_language_retweet_timestamp,TE_engaged_with_user_id_retweet_timestamp,TE_enaging_user_id_retweet_timestamp,TE_present_media_retweet_with_comment_timestamp,TE_tweet_type_retweet_with_comment_timestamp,TE_language_retweet_with_comment_timestamp,TE_engaged_with_user_id_retweet_with_comment_timestamp,TE_enaging_user_id_retweet_with_comment_timestamp,TE_present_media_like_timestamp,TE_tweet_type_like_timestamp,TE_language_like_timestamp,TE_engaged_with_user_id_like_timestamp,TE_enaging_user_id_like_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp,TE_engaged_with_user_id_tweet_type_language_reply_timestamp,TE_tw_first_word_tweet_type_language_reply_timestamp,TE_tw_last_word_tweet_type_language_reply_timestamp,TE_hash0_tweet_type_language_reply_timestamp,TE_hash1_tweet_type_language_reply_timestamp,TE_tw_uhash_tweet_type_language_reply_timestamp,TE_tw_hash_reply_timestamp,TE_tw_freq_hash_reply_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_reply_timestamp,TE_present_domains_present_media_tweet_type_language_reply_timestamp,TE_present_links_present_media_tweet_type_language_reply_timestamp,TE_hashtags_present_media_tweet_type_language
_reply_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp,TE_engaged_with_user_id_tweet_type_language_retweet_timestamp,TE_tw_first_word_tweet_type_language_retweet_timestamp,TE_tw_last_word_tweet_type_language_retweet_timestamp,TE_hash0_tweet_type_language_retweet_timestamp,TE_hash1_tweet_type_language_retweet_timestamp,TE_tw_uhash_tweet_type_language_retweet_timestamp,TE_tw_hash_retweet_timestamp,TE_tw_freq_hash_retweet_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_retweet_timestamp,TE_present_domains_present_media_tweet_type_language_retweet_timestamp,TE_present_links_present_media_tweet_type_language_retweet_timestamp,TE_hashtags_present_media_tweet_type_language_retweet_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp,TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp,TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp,TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp,TE_hash0_tweet_type_language_retweet_with_comment_timestamp,TE_hash1_tweet_type_language_retweet_with_comment_timestamp,TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp,TE_tw_hash_retweet_with_comment_timestamp,TE_tw_freq_hash_retweet_with_comment_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp,TE_present_domains_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_present_links_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp,TE_engaged
_with_user_id_tweet_type_language_like_timestamp,TE_tw_first_word_tweet_type_language_like_timestamp,TE_tw_last_word_tweet_type_language_like_timestamp,TE_hash0_tweet_type_language_like_timestamp,TE_hash1_tweet_type_language_like_timestamp,TE_tw_uhash_tweet_type_language_like_timestamp,TE_tw_hash_like_timestamp,TE_tw_freq_hash_like_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_like_timestamp,TE_present_domains_present_media_tweet_type_language_like_timestamp,TE_present_links_present_media_tweet_type_language_like_timestamp,TE_hashtags_present_media_tweet_type_language_like_timestamp,TE_present_media_elapsed_time,TE_tweet_type_elapsed_time,TE_language_elapsed_time,CE_present_media,CE_tweet_type,CE_language,CE_engaged_with_user_id,CE_enaging_user_id,CE_present_media_norm,CE_tweet_type_norm,CE_language_norm,CE_engaged_with_user_id_norm,CE_enaging_user_id_norm,DE_enaging_user_id_enaging_user_following_count_1,DE_enaging_user_id_enaging_user_following_count_-1,DE_enaging_user_id_language_1,DE_enaging_user_id_language_-1,a_ff_rate,b_ff_rate
1971849,0,0.0,1,0,334126,18414,0,33,39,0,0,0,0,0,16,31,11,33,170,1799,569692180,1,1799,0,569692180,924731,791917,1090,1090,14,0.032645,0.005752,0.028732,0.027621,0.0295,0.07928,0.112284,0.079864,0.10848,0.086867,0.006447,0.006206,0.007803,0.000457,0.00711,0.364952,0.246367,0.379281,0.343649,0.393745,0.006945,0.014197,0.0295,0.007291,0.010157,0.00657,0.010157,0.0295,0.0295,0.006809,0.007119,0.007119,0.007318,0.082877,0.015512,0.086867,0.091097,0.191127,0.094193,0.191127,0.086867,0.086867,0.080529,0.086885,0.086885,0.084484,0.004806,0.00127,0.00711,0.006127,0.004479,0.006772,0.004479,0.00711,0.00711,0.004662,0.005856,0.005856,0.005944,0.244193,0.168526,0.393745,0.226908,0.234804,0.279081,0.234804,0.393745,0.393745,0.236954,0.236088,0.236088,0.242079,1.2e-05,7e-06,9e-06,18765501.0,9965234,10957387,434.0,1.0,0.620148,0.329896,0.365925,1.526477e-05,4.597823e-08,-0.0,-0.0,-0.0,-0.0,0.055111,0.846154
1571286,0,5.0,1,4,490,529,0,75,378,0,1,0,0,0,8,51,18,7,56,6791,-279645608,1,6791,0,-279645608,1007816,155146,94416,94416,3,0.028661,0.005752,0.019377,0.026819,0.028096,0.087643,0.112284,0.082539,0.07897,0.082731,0.007457,0.006206,0.006672,0.006464,0.006771,0.449325,0.246367,0.415885,0.35795,0.374995,0.004904,0.0295,0.0295,0.004876,0.004917,0.004257,0.004917,0.0295,0.0295,0.004934,0.004565,0.004565,0.004755,0.109182,0.086867,0.086867,0.080474,0.081145,0.09917,0.081145,0.086867,0.086867,0.109122,0.09602,0.09602,0.095936,0.006926,0.00711,0.00711,0.00944,0.009518,0.006076,0.009518,0.00711,0.00711,0.006878,0.005263,0.005263,0.005506,0.335224,0.393745,0.393745,0.362603,0.365624,0.311002,0.365624,0.393745,0.393745,0.334552,0.324611,0.324611,0.330727,1.2e-05,7e-06,9e-06,6037524.0,9965234,1949730,4.0,3.0,0.19891,0.329896,0.063731,1.839129e-07,1.379347e-07,0.0,0.0,0.0,0.0,1.079592,0.198413
2062484,0,7.0,2,4,243,253,0,794,1002,0,1,0,0,0,18,8,50,2,35,0,-506757606,0,0,0,-506757606,-1,-1,-1,-1,0,0.020536,0.04233,0.019377,0.0295,0.028096,0.115903,0.077514,0.082539,0.086867,0.082731,0.008462,0.007917,0.006672,0.00711,0.006771,0.492777,0.482271,0.415885,0.393745,0.374995,0.05174,0.0295,0.03232,0.03232,0.025486,0.02722,0.025486,0.0295,0.0295,0.051742,0.016732,0.016732,0.017758,0.096642,0.086867,0.073602,0.073602,0.081918,0.081671,0.081918,0.086867,0.086867,0.097437,0.108256,0.108256,0.099966,0.013168,0.00711,0.007603,0.007603,0.007703,0.007572,0.007703,0.00711,0.00711,0.01321,0.008756,0.008756,0.009055,0.656567,0.393745,0.50578,0.50578,0.517565,0.523603,0.517565,0.393745,0.393745,0.655696,0.598832,0.598832,0.608145,1.1e-05,1.3e-05,9e-06,1705608.0,17904537,1949730,2.0,2.0,0.055562,0.587231,0.063731,9.195645e-08,9.195645e-08,0.0,0.0,0.0,-0.0,1.041152,0.792415
1419726,0,0.0,1,0,2207,572,0,2171,1363,0,1,0,0,0,6,19,5,17,107,2507,-214581439,1,2507,0,-214581439,796311,543910,93917,93917,5,0.032645,0.005752,0.028732,0.019033,0.026819,0.07928,0.112284,0.079864,0.185076,0.124425,0.006447,0.006206,0.007803,0.004587,0.006464,0.364952,0.246367,0.379281,0.479836,0.35795,0.008158,0.020345,0.0295,0.026819,0.004758,0.00657,0.004758,0.0295,0.0295,0.008161,0.007119,0.007119,0.007318,0.117667,0.163357,0.086867,0.07897,0.159172,0.094193,0.159172,0.086867,0.086867,0.116344,0.086885,0.086885,0.084484,0.008168,0.004903,0.00711,0.006464,0.017276,0.006772,0.017276,0.00711,0.00711,0.008109,0.005856,0.005856,0.005944,0.285034,0.478445,0.393745,0.35795,0.329636,0.279081,0.329636,0.393745,0.393745,0.280565,0.236088,0.236088,0.242079,1.2e-05,7e-06,9e-06,18765501.0,9965234,10957387,20.0,4.0,0.620148,0.329896,0.365925,7.356516e-07,1.379347e-07,0.0,0.0,-0.0,0.0,0.259175,1.59281
1416932,0,5.0,1,1,201,392,0,14,33,0,1,0,0,0,6,16,27,13,75,51238,448048590,1,51238,0,448048590,640219,633158,1080,1080,7,0.028661,0.005752,0.026728,0.026819,0.028096,0.087643,0.112284,0.055023,0.07897,0.082731,0.007457,0.006206,0.003553,0.006464,0.006771,0.449325,0.246367,0.419006,0.448859,0.422614,0.004478,0.028096,0.0295,0.005209,0.0295,0.003646,0.0295,0.0295,0.0295,0.004331,0.003759,0.003759,0.00406,0.089306,0.082731,0.086867,0.07331,0.086867,0.080555,0.086867,0.086867,0.086867,0.08892,0.082284,0.082284,0.087803,0.003817,0.006771,0.00711,0.001693,0.00711,0.003964,0.00711,0.00711,0.00711,0.003954,0.003115,0.003115,0.003566,0.328778,0.422614,0.393745,0.272628,0.393745,0.257394,0.393745,0.393745,0.393745,0.321286,0.333343,0.333343,0.339689,1.2e-05,7e-06,1.9e-05,6037524.0,9965234,5205178,4.0,2.0,0.19891,0.329896,0.169873,1.379347e-07,9.195645e-08,-0.0,-0.0,-0.0,-0.0,1.950249,0.424242


In [35]:
#label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [36]:
%%time
# TRAIN AND VALIDATE
# For each of the four targets: fit the matching model from `model2` on the
# training split (with the validation split passed for evaluation), then store
# its validation predictions in the corresponding column of `oof`.

# NOTE: NROUND, VERBOSE_EVAL, LR and preds are defined here but are not
# referenced by the loop below.
NROUND = 500
VERBOSE_EVAL = 50
LR = [0.1, 0.03, 0.07, 0.01]

# One row per validation sample, one column per target.
oof = np.zeros((len(valid), len(label_names)))
preds = []

banner = '#' * 25
for i in range(4):
    name = label_names[i]
    print(banner)
    print('###', name)
    print(banner)

    target_model = model2[i]

    start = time.time()
    print('Training...')
    target_model.fit(
        X=train,
        Y=Y_train.iloc[:, i],
        X_val=valid,
        Y_val=Y_valid.iloc[:, i],
        categorical_feature=set(),
    )
    print('Training took %.1f seconds'%(time.time()-start))

    start = time.time()
    print('Predicting...')
    oof[:, i] += target_model.get_prediction(valid)
    print('Took %.1f seconds'%(time.time()-start))

#########################
### reply_timestamp
#########################
Training...




[1]	valid_0's binary_logloss: 0.130751
Training until validation scores don't improve for 20 rounds
[2]	valid_0's binary_logloss: 0.127487
[3]	valid_0's binary_logloss: 0.125697
[4]	valid_0's binary_logloss: 0.123971
[5]	valid_0's binary_logloss: 0.12255
[6]	valid_0's binary_logloss: 0.121293
[7]	valid_0's binary_logloss: 0.119757
[8]	valid_0's binary_logloss: 0.118834
[9]	valid_0's binary_logloss: 0.118142
[10]	valid_0's binary_logloss: 0.117543
[11]	valid_0's binary_logloss: 0.116897
[12]	valid_0's binary_logloss: 0.116358
[13]	valid_0's binary_logloss: 0.11588
[14]	valid_0's binary_logloss: 0.115477
[15]	valid_0's binary_logloss: 0.114737
[16]	valid_0's binary_logloss: 0.114123
[17]	valid_0's binary_logloss: 0.113837
[18]	valid_0's binary_logloss: 0.113549
[19]	valid_0's binary_logloss: 0.113044
[20]	valid_0's binary_logloss: 0.112627
[21]	valid_0's binary_logloss: 0.112236
[22]	valid_0's binary_logloss: 0.112046
[23]	valid_0's binary_logloss: 0.111872
[24]	valid_0's binary_logloss:

[203]	valid_0's binary_logloss: 0.107188
[204]	valid_0's binary_logloss: 0.107183
[205]	valid_0's binary_logloss: 0.10718
[206]	valid_0's binary_logloss: 0.107177
[207]	valid_0's binary_logloss: 0.107175
[208]	valid_0's binary_logloss: 0.107172
[209]	valid_0's binary_logloss: 0.107167
[210]	valid_0's binary_logloss: 0.107164
[211]	valid_0's binary_logloss: 0.107162
[212]	valid_0's binary_logloss: 0.107159
[213]	valid_0's binary_logloss: 0.107157
[214]	valid_0's binary_logloss: 0.107155
[215]	valid_0's binary_logloss: 0.107151
[216]	valid_0's binary_logloss: 0.107148
[217]	valid_0's binary_logloss: 0.107146
[218]	valid_0's binary_logloss: 0.107143
[219]	valid_0's binary_logloss: 0.107141
[220]	valid_0's binary_logloss: 0.107139
[221]	valid_0's binary_logloss: 0.107137
[222]	valid_0's binary_logloss: 0.107134
[223]	valid_0's binary_logloss: 0.107131
[224]	valid_0's binary_logloss: 0.107127
[225]	valid_0's binary_logloss: 0.107126
[226]	valid_0's binary_logloss: 0.107122
[227]	valid_0's b

[404]	valid_0's binary_logloss: 0.10679
[405]	valid_0's binary_logloss: 0.10679
[406]	valid_0's binary_logloss: 0.106789
[407]	valid_0's binary_logloss: 0.106787
[408]	valid_0's binary_logloss: 0.106786
[409]	valid_0's binary_logloss: 0.106785
[410]	valid_0's binary_logloss: 0.106784
[411]	valid_0's binary_logloss: 0.106785
[412]	valid_0's binary_logloss: 0.106783
[413]	valid_0's binary_logloss: 0.106781
[414]	valid_0's binary_logloss: 0.10678
[415]	valid_0's binary_logloss: 0.106779
[416]	valid_0's binary_logloss: 0.106778
[417]	valid_0's binary_logloss: 0.106778
[418]	valid_0's binary_logloss: 0.106777
[419]	valid_0's binary_logloss: 0.106777
[420]	valid_0's binary_logloss: 0.106776
[421]	valid_0's binary_logloss: 0.106773
[422]	valid_0's binary_logloss: 0.106773
[423]	valid_0's binary_logloss: 0.106773
[424]	valid_0's binary_logloss: 0.106771
[425]	valid_0's binary_logloss: 0.10677
[426]	valid_0's binary_logloss: 0.10677
[427]	valid_0's binary_logloss: 0.10677
[428]	valid_0's binary

[605]	valid_0's binary_logloss: 0.106639
[606]	valid_0's binary_logloss: 0.106639
[607]	valid_0's binary_logloss: 0.106639
[608]	valid_0's binary_logloss: 0.106639
[609]	valid_0's binary_logloss: 0.106638
[610]	valid_0's binary_logloss: 0.106638
[611]	valid_0's binary_logloss: 0.106636
[612]	valid_0's binary_logloss: 0.106634
[613]	valid_0's binary_logloss: 0.106635
[614]	valid_0's binary_logloss: 0.106634
[615]	valid_0's binary_logloss: 0.106633
[616]	valid_0's binary_logloss: 0.106633
[617]	valid_0's binary_logloss: 0.106632
[618]	valid_0's binary_logloss: 0.10663
[619]	valid_0's binary_logloss: 0.10663
[620]	valid_0's binary_logloss: 0.106629
[621]	valid_0's binary_logloss: 0.10663
[622]	valid_0's binary_logloss: 0.106629
[623]	valid_0's binary_logloss: 0.106628
[624]	valid_0's binary_logloss: 0.106627
[625]	valid_0's binary_logloss: 0.106626
[626]	valid_0's binary_logloss: 0.106626
[627]	valid_0's binary_logloss: 0.106624
[628]	valid_0's binary_logloss: 0.106625
[629]	valid_0's bin

[806]	valid_0's binary_logloss: 0.106545
[807]	valid_0's binary_logloss: 0.106545
[808]	valid_0's binary_logloss: 0.106545
[809]	valid_0's binary_logloss: 0.106545
[810]	valid_0's binary_logloss: 0.106545
[811]	valid_0's binary_logloss: 0.106545
[812]	valid_0's binary_logloss: 0.106544
[813]	valid_0's binary_logloss: 0.106543
[814]	valid_0's binary_logloss: 0.106543
[815]	valid_0's binary_logloss: 0.106543
[816]	valid_0's binary_logloss: 0.106542
[817]	valid_0's binary_logloss: 0.106542
[818]	valid_0's binary_logloss: 0.106542
[819]	valid_0's binary_logloss: 0.106541
[820]	valid_0's binary_logloss: 0.106541
[821]	valid_0's binary_logloss: 0.106542
[822]	valid_0's binary_logloss: 0.10654
[823]	valid_0's binary_logloss: 0.106541
[824]	valid_0's binary_logloss: 0.106541
[825]	valid_0's binary_logloss: 0.106541
[826]	valid_0's binary_logloss: 0.10654
[827]	valid_0's binary_logloss: 0.10654
[828]	valid_0's binary_logloss: 0.106539
[829]	valid_0's binary_logloss: 0.106539
[830]	valid_0's bin

[1007]	valid_0's binary_logloss: 0.106492
[1008]	valid_0's binary_logloss: 0.106492
[1009]	valid_0's binary_logloss: 0.106492
[1010]	valid_0's binary_logloss: 0.106491
[1011]	valid_0's binary_logloss: 0.106491
[1012]	valid_0's binary_logloss: 0.106491
[1013]	valid_0's binary_logloss: 0.10649
[1014]	valid_0's binary_logloss: 0.106489
[1015]	valid_0's binary_logloss: 0.10649
[1016]	valid_0's binary_logloss: 0.106489
[1017]	valid_0's binary_logloss: 0.106489
[1018]	valid_0's binary_logloss: 0.106488
[1019]	valid_0's binary_logloss: 0.106487
[1020]	valid_0's binary_logloss: 0.106487
[1021]	valid_0's binary_logloss: 0.106486
[1022]	valid_0's binary_logloss: 0.106487
[1023]	valid_0's binary_logloss: 0.106486
[1024]	valid_0's binary_logloss: 0.106487
[1025]	valid_0's binary_logloss: 0.106486
[1026]	valid_0's binary_logloss: 0.106485
[1027]	valid_0's binary_logloss: 0.106485
[1028]	valid_0's binary_logloss: 0.106486
[1029]	valid_0's binary_logloss: 0.106486
[1030]	valid_0's binary_logloss: 0.1

[81]	valid_0's binary_logloss: 0.234837
[82]	valid_0's binary_logloss: 0.234789
[83]	valid_0's binary_logloss: 0.234736
[84]	valid_0's binary_logloss: 0.234704
[85]	valid_0's binary_logloss: 0.234646
[86]	valid_0's binary_logloss: 0.234596
[87]	valid_0's binary_logloss: 0.234556
[88]	valid_0's binary_logloss: 0.234514
[89]	valid_0's binary_logloss: 0.234471
[90]	valid_0's binary_logloss: 0.234422
[91]	valid_0's binary_logloss: 0.234379
[92]	valid_0's binary_logloss: 0.23434
[93]	valid_0's binary_logloss: 0.234328
[94]	valid_0's binary_logloss: 0.234288
[95]	valid_0's binary_logloss: 0.234254
[96]	valid_0's binary_logloss: 0.234213
[97]	valid_0's binary_logloss: 0.234188
[98]	valid_0's binary_logloss: 0.234149
[99]	valid_0's binary_logloss: 0.234119
[100]	valid_0's binary_logloss: 0.234088
[101]	valid_0's binary_logloss: 0.234055
[102]	valid_0's binary_logloss: 0.234024
[103]	valid_0's binary_logloss: 0.233996
[104]	valid_0's binary_logloss: 0.233963
[105]	valid_0's binary_logloss: 0.23

[282]	valid_0's binary_logloss: 0.231507
[283]	valid_0's binary_logloss: 0.231501
[284]	valid_0's binary_logloss: 0.231494
[285]	valid_0's binary_logloss: 0.231488
[286]	valid_0's binary_logloss: 0.23148
[287]	valid_0's binary_logloss: 0.231476
[288]	valid_0's binary_logloss: 0.231469
[289]	valid_0's binary_logloss: 0.231461
[290]	valid_0's binary_logloss: 0.231455
[291]	valid_0's binary_logloss: 0.231449
[292]	valid_0's binary_logloss: 0.231442
[293]	valid_0's binary_logloss: 0.231442
[294]	valid_0's binary_logloss: 0.231439
[295]	valid_0's binary_logloss: 0.231434
[296]	valid_0's binary_logloss: 0.231428
[297]	valid_0's binary_logloss: 0.231426
[298]	valid_0's binary_logloss: 0.231438
[299]	valid_0's binary_logloss: 0.231432
[300]	valid_0's binary_logloss: 0.231423
[301]	valid_0's binary_logloss: 0.231416
[302]	valid_0's binary_logloss: 0.23141
[303]	valid_0's binary_logloss: 0.231405
[304]	valid_0's binary_logloss: 0.231397
[305]	valid_0's binary_logloss: 0.231395
[306]	valid_0's bi

[483]	valid_0's binary_logloss: 0.23087
[484]	valid_0's binary_logloss: 0.230869
[485]	valid_0's binary_logloss: 0.230867
[486]	valid_0's binary_logloss: 0.230864
[487]	valid_0's binary_logloss: 0.23086
[488]	valid_0's binary_logloss: 0.230857
[489]	valid_0's binary_logloss: 0.230855
[490]	valid_0's binary_logloss: 0.230861
[491]	valid_0's binary_logloss: 0.230858
[492]	valid_0's binary_logloss: 0.230855
[493]	valid_0's binary_logloss: 0.230854
[494]	valid_0's binary_logloss: 0.230852
[495]	valid_0's binary_logloss: 0.230848
[496]	valid_0's binary_logloss: 0.230847
[497]	valid_0's binary_logloss: 0.230845
[498]	valid_0's binary_logloss: 0.230844
[499]	valid_0's binary_logloss: 0.23084
[500]	valid_0's binary_logloss: 0.230834
[501]	valid_0's binary_logloss: 0.230834
[502]	valid_0's binary_logloss: 0.230831
[503]	valid_0's binary_logloss: 0.230831
[504]	valid_0's binary_logloss: 0.230828
[505]	valid_0's binary_logloss: 0.230825
[506]	valid_0's binary_logloss: 0.230824
[507]	valid_0's bin

[684]	valid_0's binary_logloss: 0.230598
[685]	valid_0's binary_logloss: 0.230597
[686]	valid_0's binary_logloss: 0.230595
[687]	valid_0's binary_logloss: 0.230594
[688]	valid_0's binary_logloss: 0.230591
[689]	valid_0's binary_logloss: 0.23059
[690]	valid_0's binary_logloss: 0.230588
[691]	valid_0's binary_logloss: 0.230586
[692]	valid_0's binary_logloss: 0.230583
[693]	valid_0's binary_logloss: 0.230581
[694]	valid_0's binary_logloss: 0.230579
[695]	valid_0's binary_logloss: 0.230578
[696]	valid_0's binary_logloss: 0.230576
[697]	valid_0's binary_logloss: 0.230575
[698]	valid_0's binary_logloss: 0.230574
[699]	valid_0's binary_logloss: 0.230573
[700]	valid_0's binary_logloss: 0.230571
[701]	valid_0's binary_logloss: 0.23057
[702]	valid_0's binary_logloss: 0.230568
[703]	valid_0's binary_logloss: 0.230567
[704]	valid_0's binary_logloss: 0.230571
[705]	valid_0's binary_logloss: 0.230569
[706]	valid_0's binary_logloss: 0.230568
[707]	valid_0's binary_logloss: 0.230565
[708]	valid_0's bi

[885]	valid_0's binary_logloss: 0.230418
[886]	valid_0's binary_logloss: 0.230419
[887]	valid_0's binary_logloss: 0.230419
[888]	valid_0's binary_logloss: 0.230418
[889]	valid_0's binary_logloss: 0.230417
[890]	valid_0's binary_logloss: 0.230417
[891]	valid_0's binary_logloss: 0.230418
[892]	valid_0's binary_logloss: 0.230416
[893]	valid_0's binary_logloss: 0.230415
[894]	valid_0's binary_logloss: 0.230416
[895]	valid_0's binary_logloss: 0.230415
[896]	valid_0's binary_logloss: 0.230415
[897]	valid_0's binary_logloss: 0.230416
[898]	valid_0's binary_logloss: 0.230415
[899]	valid_0's binary_logloss: 0.230418
[900]	valid_0's binary_logloss: 0.230416
[901]	valid_0's binary_logloss: 0.230416
[902]	valid_0's binary_logloss: 0.230415
[903]	valid_0's binary_logloss: 0.230413
[904]	valid_0's binary_logloss: 0.230413
[905]	valid_0's binary_logloss: 0.230412
[906]	valid_0's binary_logloss: 0.230411
[907]	valid_0's binary_logloss: 0.230411
[908]	valid_0's binary_logloss: 0.23041
[909]	valid_0's b

[37]	valid_0's binary_logloss: 0.037729
[38]	valid_0's binary_logloss: 0.0377148
[39]	valid_0's binary_logloss: 0.0376992
[40]	valid_0's binary_logloss: 0.0376835
[41]	valid_0's binary_logloss: 0.0376613
[42]	valid_0's binary_logloss: 0.0376489
[43]	valid_0's binary_logloss: 0.0376398
[44]	valid_0's binary_logloss: 0.0376225
[45]	valid_0's binary_logloss: 0.0376099
[46]	valid_0's binary_logloss: 0.0375963
[47]	valid_0's binary_logloss: 0.0375799
[48]	valid_0's binary_logloss: 0.0375671
[49]	valid_0's binary_logloss: 0.0375564
[50]	valid_0's binary_logloss: 0.0375454
[51]	valid_0's binary_logloss: 0.0375284
[52]	valid_0's binary_logloss: 0.037518
[53]	valid_0's binary_logloss: 0.0375046
[54]	valid_0's binary_logloss: 0.0374911
[55]	valid_0's binary_logloss: 0.0374802
[56]	valid_0's binary_logloss: 0.0374728
[57]	valid_0's binary_logloss: 0.0374664
[58]	valid_0's binary_logloss: 0.0374564
[59]	valid_0's binary_logloss: 0.0374471
[60]	valid_0's binary_logloss: 0.0374383
[61]	valid_0's bin

[235]	valid_0's binary_logloss: 0.0370119
[236]	valid_0's binary_logloss: 0.0370107
[237]	valid_0's binary_logloss: 0.0370107
[238]	valid_0's binary_logloss: 0.0370102
[239]	valid_0's binary_logloss: 0.0370109
[240]	valid_0's binary_logloss: 0.0370096
[241]	valid_0's binary_logloss: 0.0370099
[242]	valid_0's binary_logloss: 0.03701
[243]	valid_0's binary_logloss: 0.037009
[244]	valid_0's binary_logloss: 0.0370079
[245]	valid_0's binary_logloss: 0.0370068
[246]	valid_0's binary_logloss: 0.0370073
[247]	valid_0's binary_logloss: 0.0370069
[248]	valid_0's binary_logloss: 0.0370064
[249]	valid_0's binary_logloss: 0.0370066
[250]	valid_0's binary_logloss: 0.037006
[251]	valid_0's binary_logloss: 0.037005
[252]	valid_0's binary_logloss: 0.0370046
[253]	valid_0's binary_logloss: 0.0370048
[254]	valid_0's binary_logloss: 0.0370042
[255]	valid_0's binary_logloss: 0.0370037
[256]	valid_0's binary_logloss: 0.0370037
[257]	valid_0's binary_logloss: 0.037003
[258]	valid_0's binary_logloss: 0.037002

[431]	valid_0's binary_logloss: 0.0369253
[432]	valid_0's binary_logloss: 0.0369249
[433]	valid_0's binary_logloss: 0.0369247
[434]	valid_0's binary_logloss: 0.0369255
[435]	valid_0's binary_logloss: 0.0369256
[436]	valid_0's binary_logloss: 0.0369256
[437]	valid_0's binary_logloss: 0.0369254
[438]	valid_0's binary_logloss: 0.036924
[439]	valid_0's binary_logloss: 0.0369241
[440]	valid_0's binary_logloss: 0.0369242
[441]	valid_0's binary_logloss: 0.0369234
[442]	valid_0's binary_logloss: 0.0369231
[443]	valid_0's binary_logloss: 0.0369223
[444]	valid_0's binary_logloss: 0.0369227
[445]	valid_0's binary_logloss: 0.0369229
[446]	valid_0's binary_logloss: 0.0369225
[447]	valid_0's binary_logloss: 0.0369212
[448]	valid_0's binary_logloss: 0.0369206
[449]	valid_0's binary_logloss: 0.0369206
[450]	valid_0's binary_logloss: 0.0369204
[451]	valid_0's binary_logloss: 0.0369201
[452]	valid_0's binary_logloss: 0.0369208
[453]	valid_0's binary_logloss: 0.0369204
[454]	valid_0's binary_logloss: 0.0

[150]	valid_0's binary_logloss: 0.538202
[151]	valid_0's binary_logloss: 0.538147
[152]	valid_0's binary_logloss: 0.538102
[153]	valid_0's binary_logloss: 0.538031
[154]	valid_0's binary_logloss: 0.537989
[155]	valid_0's binary_logloss: 0.537954
[156]	valid_0's binary_logloss: 0.537903
[157]	valid_0's binary_logloss: 0.537855
[158]	valid_0's binary_logloss: 0.537841
[159]	valid_0's binary_logloss: 0.537809
[160]	valid_0's binary_logloss: 0.537771
[161]	valid_0's binary_logloss: 0.537728
[162]	valid_0's binary_logloss: 0.537697
[163]	valid_0's binary_logloss: 0.537714
[164]	valid_0's binary_logloss: 0.537672
[165]	valid_0's binary_logloss: 0.537627
[166]	valid_0's binary_logloss: 0.537595
[167]	valid_0's binary_logloss: 0.537539
[168]	valid_0's binary_logloss: 0.537503
[169]	valid_0's binary_logloss: 0.537463
[170]	valid_0's binary_logloss: 0.537409
[171]	valid_0's binary_logloss: 0.53738
[172]	valid_0's binary_logloss: 0.537343
[173]	valid_0's binary_logloss: 0.537299
[174]	valid_0's b

[351]	valid_0's binary_logloss: 0.534411
[352]	valid_0's binary_logloss: 0.534398
[353]	valid_0's binary_logloss: 0.534413
[354]	valid_0's binary_logloss: 0.534399
[355]	valid_0's binary_logloss: 0.534387
[356]	valid_0's binary_logloss: 0.534377
[357]	valid_0's binary_logloss: 0.53437
[358]	valid_0's binary_logloss: 0.534355
[359]	valid_0's binary_logloss: 0.534348
[360]	valid_0's binary_logloss: 0.534343
[361]	valid_0's binary_logloss: 0.534331
[362]	valid_0's binary_logloss: 0.534285
[363]	valid_0's binary_logloss: 0.534276
[364]	valid_0's binary_logloss: 0.53427
[365]	valid_0's binary_logloss: 0.534258
[366]	valid_0's binary_logloss: 0.534251
[367]	valid_0's binary_logloss: 0.534267
[368]	valid_0's binary_logloss: 0.534279
[369]	valid_0's binary_logloss: 0.53428
[370]	valid_0's binary_logloss: 0.534272
[371]	valid_0's binary_logloss: 0.534266
[372]	valid_0's binary_logloss: 0.534251
[373]	valid_0's binary_logloss: 0.534241
[374]	valid_0's binary_logloss: 0.534233
[375]	valid_0's bin

[552]	valid_0's binary_logloss: 0.533046
[553]	valid_0's binary_logloss: 0.533043
[554]	valid_0's binary_logloss: 0.533037
[555]	valid_0's binary_logloss: 0.533031
[556]	valid_0's binary_logloss: 0.533033
[557]	valid_0's binary_logloss: 0.533027
[558]	valid_0's binary_logloss: 0.53302
[559]	valid_0's binary_logloss: 0.533007
[560]	valid_0's binary_logloss: 0.533005
[561]	valid_0's binary_logloss: 0.533002
[562]	valid_0's binary_logloss: 0.532996
[563]	valid_0's binary_logloss: 0.532989
[564]	valid_0's binary_logloss: 0.532987
[565]	valid_0's binary_logloss: 0.532996
[566]	valid_0's binary_logloss: 0.532991
[567]	valid_0's binary_logloss: 0.532988
[568]	valid_0's binary_logloss: 0.532984
[569]	valid_0's binary_logloss: 0.532979
[570]	valid_0's binary_logloss: 0.532976
[571]	valid_0's binary_logloss: 0.532977
[572]	valid_0's binary_logloss: 0.532971
[573]	valid_0's binary_logloss: 0.532963
[574]	valid_0's binary_logloss: 0.532957
[575]	valid_0's binary_logloss: 0.532949
[576]	valid_0's b

In [37]:
# Ground-truth targets as a plain array, columns ordered like `label_names`
# (and therefore like the columns of `oof`).
yvalid = Y_valid[label_names].to_numpy()

# Compute Validation Metrics

In [38]:
from sklearn.metrics import precision_recall_curve, auc, log_loss
from sklearn.metrics import average_precision_score

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve.

    pred: predicted scores; gt: ground-truth labels. Note the argument order:
    predictions first, ground truth second.
    """
    precision, recall, _ = precision_recall_curve(gt, pred)
    # Recall is the x-axis, precision the y-axis.
    return auc(recall, precision)

def compute_AP(pred, gt):
    """Return the average precision of `pred` (scores) against `gt` (labels).

    Note the argument order: predictions first, ground truth second.
    """
    average_precision = average_precision_score(gt, pred)
    return average_precision

def calculate_ctr(gt):
    """Return the fraction of entries in `gt` that are equal to 1.

    Raises ZeroDivisionError when `gt` is empty.
    """
    positives = sum(1 for label in gt if label == 1)
    return positives / float(len(gt))

def compute_rce(pred, gt):
    """Return the relative cross-entropy (in percent) of `pred` against `gt`.

    The model's log loss is compared with the log loss of a constant predictor
    that always outputs the positive rate of `gt`; the result is
    (1 - model_loss / baseline_loss) * 100.
    """
    model_loss = log_loss(gt, pred)
    positive_rate = calculate_ctr(gt)
    # Constant baseline: the same probability for every sample.
    baseline_pred = [positive_rate] * len(gt)
    baseline_loss = log_loss(gt, baseline_pred)
    return (1.0 - model_loss/baseline_loss)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    """Return the relative cross-entropy (in percent) of `pred` against `gt`.

    Same formula as `compute_rce`, but the baseline (constant-predictor) entropy
    is computed in closed form from the mean of `gt` instead of via a second
    `log_loss` call over a constant prediction list.
    """
    model_loss = log_loss(gt, pred)
    positive_rate = np.mean(gt)
    # Binary entropy of the positive rate = log loss of the constant predictor.
    baseline_loss = -(positive_rate*np.log(positive_rate) + (1 - positive_rate)*np.log(1 - positive_rate))
    return (1.0 - model_loss/baseline_loss)*100.0

In [39]:
%%time
# Per-target validation metrics: average precision and relative cross-entropy.
# Each formatted line is printed and also collected into `txt`
# (newline-terminated).
report_lines = []
for i in range(4):
    scores = oof[:, i]
    truth = yvalid[:, i]
    ap = compute_AP(scores, truth)
    rce = compute_rce_fast(scores, truth)
    txt_ = f"{label_names[i]:20} AP:{ap:.5f} RCE:{rce:.5f}"
    print(txt_)
    report_lines.append(txt_ + '\n')
txt = ''.join(report_lines)

reply_timestamp      AP:0.21591 RCE:20.68662
retweet_timestamp    AP:0.42902 RCE:23.62521
retweet_with_comment_timestamp AP:0.04663 RCE:10.69715
like_timestamp       AP:0.72792 RCE:21.26861
CPU times: user 825 ms, sys: 6.97 ms, total: 832 ms
Wall time: 808 ms


In [40]:
# Total wall-clock time since the first cell set `very_start`, in minutes.
elapsed_minutes = (time.time() - very_start) / 60.
print('This notebook took %.1f minutes' % elapsed_minutes)

This notebook took 42.6 minutes


In [41]:
# Shape of the validation prediction matrix (rows = validation samples,
# columns = targets). The label previously read "X_valid.shape", although the
# value printed is the shape of `oof`, not of the validation feature matrix.
print(f'oof.shape {oof.shape}')

X_valid.shape (446420, 4)
