In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score
import gc
import glob
from tqdm.notebook import tqdm
import time
import pickle
import xgboost as xgb
print( 'xgb', xgb.__version__ )


def compute_rce_fast(pred, gt):
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    
    cross_entropy = -(gt*np.log(pred+1e-6) + (1 - gt)*np.log(1 - pred + 1e-6))
    cross_entropy = np.mean(cross_entropy)
    #print( cross_entropy, strawman_cross_entropy )
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

xgb 1.3.3


In [None]:
gc.collect()

dt = pd.read_csv('stacking_models/part-00000_targets', sep='\x01', header=None)
print( dt.shape )
all_features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id_org',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'a_user_id',              ###########################
    'a_follower_count',       #Engaged With User Features
    'a_following_count',      #
    'a_is_verified',          #
    'a_account_creation',     ###########################
    'b_user_id_org',              #######################
    'b_follower_count',       #Engaging User Features
    'b_following_count',      #
    'b_is_verified',          #
    'b_account_creation',     #######################
    'b_follows_a',    #################### Engagement Features
    
    'reply',
    'retweet',
    'retweet_comment',
    'like',
]
dt.columns = all_features

# dt['group'] = 0
# dt['group'] = dt['group'] + 1*(dt['a_follower_count']>=240)
# dt['group'] = dt['group'] + 1*(dt['a_follower_count']>=588)
# dt['group'] = dt['group'] + 1*(dt['a_follower_count']>=1331)
# dt['group'] = dt['group'] + 1*(dt['a_follower_count']>=3996)
# dt['group'] = dt['group'].astype(np.int8)

for tgt in ['reply', 'retweet', 'retweet_comment', 'like']:
    dt[tgt] = dt[tgt].fillna(0.)
    dt[tgt] = 1*(dt[tgt]>0)

for tgt in [['reply','retweet','retweet_comment','like']]:
    dt[tgt] = dt[tgt].astype(np.float32)
    
dt = dt[['tweet_id_org', 'b_user_id_org', 'reply', 'retweet', 'retweet_comment', 'like' ]]
# gc.collect()
#dt = dt.sort_values(['tweet_id', 'b_user_id']).reset_index(drop=True)
#print( dt[['reply', 'retweet', 'retweet_comment', 'like']].mean() )
#print( dt.shape )
#gc.collect()
dt.head()

In [3]:
features = ['tweet_id_org', 'b_user_id_org',
    #'reply', 'retweet', 'retweet_comment', 'like', 
    'text',
    'media', 'tweet_type',
    'language', 'timestamp', 
    'a_user_id', 'a_user_id32', 'a_follower_count', 'a_following_count', 'a_is_verified', 'a_account_creation', 
    'b_user_id', 'b_user_id32', 'b_follower_count', 'b_following_count', 'b_is_verified', 'b_account_creation', 'b_follows_a',
    
    'tw_len_token', 'tw_len_media', 'tw_len_photo', 'tw_len_video', 'tw_len_gif',
    'tw_len_quest', 'tw_count_capital_words', 'tw_count_excl_quest_marks',
    'tw_count_special1', 'tw_count_hash', 'tw_last_quest', 'tw_len_retweet',
    'tw_len_rt', 'tw_count_at', 'tw_count_words', 'tw_count_char',
    'tw_rt_count_words', 'tw_rt_count_char', 'len_hashtags', 'len_links', 'len_domains', 'decline',
    
    'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
    'tw_rt_user0',
    'tw_word0', 'tw_word1',
    'tw_tweet',
    
    'group', 'dt_day', 'dt_dow', 'dt_minute',
]
gc.collect()

TRAIN = []
for files in glob.glob('stacking_models/part*.parquet'):
    TRAIN.append(pd.read_parquet(files, columns=features))
TRAIN = pd.concat(TRAIN)
print(TRAIN.shape)
print(TRAIN.columns)

(14461760, 53)
Index(['tweet_id_org', 'b_user_id_org', 'text', 'media', 'tweet_type',
       'language', 'timestamp', 'a_user_id', 'a_user_id32', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'a_account_creation', 'b_user_id',
       'b_user_id32', 'b_follower_count', 'b_following_count', 'b_is_verified',
       'b_account_creation', 'b_follows_a', 'tw_len_token', 'tw_len_media',
       'tw_len_photo', 'tw_len_video', 'tw_len_gif', 'tw_len_quest',
       'tw_count_capital_words', 'tw_count_excl_quest_marks',
       'tw_count_special1', 'tw_count_hash', 'tw_last_quest', 'tw_len_retweet',
       'tw_len_rt', 'tw_count_at', 'tw_count_words', 'tw_count_char',
       'tw_rt_count_words', 'tw_rt_count_char', 'len_hashtags', 'len_links',
       'len_domains', 'decline', 'tw_original_user0', 'tw_original_user1',
       'tw_original_user2', 'tw_rt_user0', 'tw_word0', 'tw_word1', 'tw_tweet',
       'group', 'dt_day', 'dt_dow', 'dt_minute'],
      dtype='object')


In [4]:
TRAIN = TRAIN.merge(dt, on=['tweet_id_org', 'b_user_id_org'], how='left')
TRAIN.head()

Unnamed: 0,tweet_id_org,b_user_id_org,text,media,tweet_type,language,timestamp,a_user_id,a_user_id32,a_follower_count,...,tw_word1,tw_tweet,group,dt_day,dt_dow,dt_minute,reply,retweet,retweet_comment,like
0,485F9AEEEAC7EBBA551559F8CC54F6BD,52F5145EACA1BFFB3C27A15B5BBF1B80,[unk] @officialbhafc have gone unbeaten in six...,1,0,0,1614463799,-6764329288916620177,-967136145,25133420,...,-1566630340,-855221262,4,27,5,1329,0.0,0.0,0.0,0.0
1,812D8339F3A17029895574D676B3C318,8314F96484A42129EDF2B3FAA47AE1D6,น่ากิงงงงงงง httpauux1z6zmt,1,0,6,1614758151,-4743724345997035831,880375497,13720,...,953202288,680269372,4,31,2,475,0.0,0.0,0.0,0.0
2,0885AF9FBF48594DDE83C72ED36EE626,335546E0A79EB2C3161CC9671E8D7039,bonne st. valentin à tous les amoureux. #livei...,1,1,9,1614588841,8006651453309039477,1658434421,2231,...,1804504191,-808892678,3,29,0,534,0.0,0.0,0.0,0.0
3,CDEC74B963F22F251CB077B26D5A1F3D,5956FADAD7AEE62F02E09DD062097ACA,proof that fear is a learned behavior. httplst...,1,1,0,1614293543,-3093655680362614616,-1298272088,1678,...,-340833703,1330856576,3,25,3,1372,0.0,0.0,0.0,0.0
4,18C2A7C51CA75D7858A781BA3982DF8E,3D1E5AAA0BE2D98307D95A448C57F004,2021 / 3 / 1 ( 月 ) から 、 『 逆 転 オセロニア 』 をお 楽 しみい...,1,0,1,1614394801,7257328485927712599,-1777633449,272256,...,-108567334,1712091271,4,27,5,180,0.0,0.0,0.0,0.0


In [5]:
targets = ['reply', 'retweet', 'retweet_comment', 'like']
TRAIN.groupby('group')[targets].agg('mean')

Unnamed: 0_level_0,reply,retweet,retweet_comment,like
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.036779,0.09614,0.006504,0.393955
1,0.031935,0.087023,0.005803,0.383293
2,0.030443,0.08282,0.005667,0.372757
3,0.027556,0.086349,0.00587,0.37345
4,0.015568,0.076758,0.005328,0.412088


In [6]:
TRAIN.head()

Unnamed: 0,tweet_id_org,b_user_id_org,text,media,tweet_type,language,timestamp,a_user_id,a_user_id32,a_follower_count,...,tw_word1,tw_tweet,group,dt_day,dt_dow,dt_minute,reply,retweet,retweet_comment,like
0,485F9AEEEAC7EBBA551559F8CC54F6BD,52F5145EACA1BFFB3C27A15B5BBF1B80,[unk] @officialbhafc have gone unbeaten in six...,1,0,0,1614463799,-6764329288916620177,-967136145,25133420,...,-1566630340,-855221262,4,27,5,1329,0.0,0.0,0.0,0.0
1,812D8339F3A17029895574D676B3C318,8314F96484A42129EDF2B3FAA47AE1D6,น่ากิงงงงงงง httpauux1z6zmt,1,0,6,1614758151,-4743724345997035831,880375497,13720,...,953202288,680269372,4,31,2,475,0.0,0.0,0.0,0.0
2,0885AF9FBF48594DDE83C72ED36EE626,335546E0A79EB2C3161CC9671E8D7039,bonne st. valentin à tous les amoureux. #livei...,1,1,9,1614588841,8006651453309039477,1658434421,2231,...,1804504191,-808892678,3,29,0,534,0.0,0.0,0.0,0.0
3,CDEC74B963F22F251CB077B26D5A1F3D,5956FADAD7AEE62F02E09DD062097ACA,proof that fear is a learned behavior. httplst...,1,1,0,1614293543,-3093655680362614616,-1298272088,1678,...,-340833703,1330856576,3,25,3,1372,0.0,0.0,0.0,0.0
4,18C2A7C51CA75D7858A781BA3982DF8E,3D1E5AAA0BE2D98307D95A448C57F004,2021 / 3 / 1 ( 月 ) から 、 『 逆 転 オセロニア 』 をお 楽 しみい...,1,0,1,1614394801,7257328485927712599,-1777633449,272256,...,-108567334,1712091271,4,27,5,180,0.0,0.0,0.0,0.0


In [7]:
files = [
    'results_benny_nn.csv',
    'results_benny_xgb.csv',
    'results-chris-nn.csv',
    'results-chris-xgb.csv',
    'results-giba-xgb.csv',
    'results_bo.csv',    
]

for n, fn in tqdm(enumerate(files)):
    dt = pd.read_csv('stacking_models/'+fn, header=None)
    dt.columns = ['tweet_id_org', 'b_user_id_org', 'reply_'+str(n), 'retweet_'+str(n), 'retweet_comment_'+str(n), 'like_'+str(n)]
    for tgt in ['reply_'+str(n),'retweet_'+str(n),'retweet_comment_'+str(n),'like_'+str(n)]:
        dt[tgt] = dt[tgt].astype(np.float32)
    TRAIN = TRAIN.merge(dt, on=['tweet_id_org', 'b_user_id_org'], how='left')
    print(dt.shape, TRAIN.shape)
    del dt
    _ = gc.collect()

0it [00:00, ?it/s]

(14461760, 6) (14461760, 61)
(14461760, 6) (14461760, 65)
(14461760, 6) (14461760, 69)
(14461760, 6) (14461760, 73)
(14461760, 6) (14461760, 77)
(14461760, 6) (14461760, 81)


In [8]:
TRAIN.head()

Unnamed: 0,tweet_id_org,b_user_id_org,text,media,tweet_type,language,timestamp,a_user_id,a_user_id32,a_follower_count,...,retweet_comment_3,like_3,reply_4,retweet_4,retweet_comment_4,like_4,reply_5,retweet_5,retweet_comment_5,like_5
0,485F9AEEEAC7EBBA551559F8CC54F6BD,52F5145EACA1BFFB3C27A15B5BBF1B80,[unk] @officialbhafc have gone unbeaten in six...,1,0,0,1614463799,-6764329288916620177,-967136145,25133420,...,0.02026,0.314481,0.011782,0.059517,0.03205,0.289015,0.033257,0.100067,0.02299,0.805515
1,812D8339F3A17029895574D676B3C318,8314F96484A42129EDF2B3FAA47AE1D6,น่ากิงงงงงงง httpauux1z6zmt,1,0,6,1614758151,-4743724345997035831,880375497,13720,...,0.001571,0.052043,0.026442,0.609299,0.002144,0.118176,0.004492,0.141073,0.002525,0.052876
2,0885AF9FBF48594DDE83C72ED36EE626,335546E0A79EB2C3161CC9671E8D7039,bonne st. valentin à tous les amoureux. #livei...,1,1,9,1614588841,8006651453309039477,1658434421,2231,...,0.002261,0.063515,0.001034,0.103616,0.004875,0.025641,0.000876,0.048602,0.00146,0.026797
3,CDEC74B963F22F251CB077B26D5A1F3D,5956FADAD7AEE62F02E09DD062097ACA,proof that fear is a learned behavior. httplst...,1,1,0,1614293543,-3093655680362614616,-1298272088,1678,...,0.001246,0.119232,0.003325,0.030734,0.003857,0.047097,0.004021,0.097551,0.006045,0.176319
4,18C2A7C51CA75D7858A781BA3982DF8E,3D1E5AAA0BE2D98307D95A448C57F004,2021 / 3 / 1 ( 月 ) から 、 『 逆 転 オセロニア 』 をお 楽 しみい...,1,0,1,1614394801,7257328485927712599,-1777633449,272256,...,0.002294,0.332503,0.003609,0.00806,0.001763,0.217295,0.006163,0.011044,0.001584,0.266289


In [9]:
TRAIN.to_parquet('stacking_models/stack.parquet')

In [2]:
TRAIN = pd.read_parquet('stacking_models/stack.parquet')
TRAIN['fold'] = np.arange(TRAIN.shape[0])%5
TRAIN.shape

(14461760, 82)

In [3]:
TRAIN.columns

Index(['tweet_id_org', 'b_user_id_org', 'text', 'media', 'tweet_type',
       'language', 'timestamp', 'a_user_id', 'a_user_id32', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'a_account_creation', 'b_user_id',
       'b_user_id32', 'b_follower_count', 'b_following_count', 'b_is_verified',
       'b_account_creation', 'b_follows_a', 'tw_len_token', 'tw_len_media',
       'tw_len_photo', 'tw_len_video', 'tw_len_gif', 'tw_len_quest',
       'tw_count_capital_words', 'tw_count_excl_quest_marks',
       'tw_count_special1', 'tw_count_hash', 'tw_last_quest', 'tw_len_retweet',
       'tw_len_rt', 'tw_count_at', 'tw_count_words', 'tw_count_char',
       'tw_rt_count_words', 'tw_rt_count_char', 'len_hashtags', 'len_links',
       'len_domains', 'decline', 'tw_original_user0', 'tw_original_user1',
       'tw_original_user2', 'tw_rt_user0', 'tw_word0', 'tw_word1', 'tw_tweet',
       'group', 'dt_day', 'dt_dow', 'dt_minute', 'reply', 'retweet',
       'retweet_comment', 'like'

In [14]:
%%time
dt = pd.read_parquet('TEMAPS_LOCAL/te-a_user_id32-b_user_id32_count.parquet')
TRAIN = TRAIN.merge(dt, left_on=['a_user_id32','b_user_id32'], right_index=True, how='left')
gc.collect()

CPU times: user 2min, sys: 16.4 s, total: 2min 16s
Wall time: 2min 9s


110

In [16]:
dt = pd.read_parquet('TEMAPS_LOCAL/a_user_id-tweet_type.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/b_user_id-tweet_type.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/media-language-tweet_type-a_is_verified-b_is_verified-b_follows_a-tw_last_quest-decline-group.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/tw_original_user0-tweet_type.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/tw_original_user1-tweet_type.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/tw_word0-tweet_type.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/tw_rt_user0.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/a_follower_count-a_following_count-b_follower_count-b_following_count-tweet_type-language-b_follows_a.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

dt = pd.read_parquet('TEMAPS_LOCAL/b_user_id.parquet')
features = list(dt.index.names)
TRAIN = TRAIN.merge(dt, left_on=features, right_index=True, how='left')
del dt; gc.collect()
print(TRAIN.shape)

(14461760, 88)
(14461760, 93)
(14461760, 98)
(14461760, 103)
(14461760, 108)
(14461760, 113)
(14461760, 118)
(14461760, 123)
(14461760, 128)


In [17]:
dt = pd.read_parquet('TEMAPS_LOCAL/b_user_id.parquet').reset_index()
dt.columns = ['a_user_id','t0','t1','t2','t3','t4']
TRAIN = TRAIN.merge(dt, on=['a_user_id'], how='left')

TRAIN.head()

Unnamed: 0,tweet_id_org,b_user_id_org,text,media,tweet_type,language,timestamp,a_user_id,a_user_id32,a_follower_count,...,sums_b_user_reply,sums_b_user_retweet,sums_b_user_retweet_comment,sums_b_user_like,counts_b_user_id,t0,t1,t2,t3,t4
0,485F9AEEEAC7EBBA551559F8CC54F6BD,52F5145EACA1BFFB3C27A15B5BBF1B80,[unk] @officialbhafc have gone unbeaten in six...,1,0,0,1614463799,-6764329288916620177,-967136145,25133420,...,0.0,1.0,1.0,3.0,8.0,0.0,1.0,0.0,0.0,1.0
1,812D8339F3A17029895574D676B3C318,8314F96484A42129EDF2B3FAA47AE1D6,น่ากิงงงงงงง httpauux1z6zmt,1,0,6,1614758151,-4743724345997035831,880375497,13720,...,0.0,12.0,0.0,1.0,26.0,0.0,1.0,0.0,0.0,4.0
2,0885AF9FBF48594DDE83C72ED36EE626,335546E0A79EB2C3161CC9671E8D7039,bonne st. valentin à tous les amoureux. #livei...,1,1,9,1614588841,8006651453309039477,1658434421,2231,...,0.0,43.0,1.0,18.0,141.0,3.0,0.0,0.0,2.0,11.0
3,CDEC74B963F22F251CB077B26D5A1F3D,5956FADAD7AEE62F02E09DD062097ACA,proof that fear is a learned behavior. httplst...,1,1,0,1614293543,-3093655680362614616,-1298272088,1678,...,0.0,3.0,0.0,7.0,14.0,0.0,0.0,0.0,2.0,4.0
4,18C2A7C51CA75D7858A781BA3982DF8E,3D1E5AAA0BE2D98307D95A448C57F004,2021 / 3 / 1 ( 月 ) から 、 『 逆 転 オセロニア 』 をお 楽 しみい...,1,0,1,1614394801,7257328485927712599,-1777633449,272256,...,0.0,0.0,0.0,7.0,14.0,,,,,


In [19]:
targets = ['reply', 'retweet', 'retweet_comment', 'like']

features_base = [
    'media', 'tweet_type', 'language', 
    'a_follower_count', 'a_following_count', 'a_is_verified', 'a_account_creation',
    'b_follower_count', 'b_following_count', 'b_is_verified', 'b_account_creation', 'b_follows_a',
    'tw_len_token', 'tw_len_media', 'tw_len_photo', 'tw_len_video', 'tw_len_gif',
    'tw_len_quest', 'tw_count_capital_words', 'tw_count_excl_quest_marks', 'tw_count_special1',
    'tw_count_hash', 'tw_last_quest', 'tw_len_retweet', 'tw_len_rt', 'tw_count_at',
    'tw_count_words', 'tw_count_char',  'tw_rt_count_words', 'tw_rt_count_char',
    'dt_dow',
    'dt_minute',
    'len_hashtags', 'len_links', 'len_domains', 'decline'
]

In [20]:
TRAIN['count1'] = TRAIN.groupby(['a_user_id'])['dt_dow'].transform('count')
TRAIN['count2'] = TRAIN.groupby(['b_user_id'])['dt_dow'].transform('count')
TRAIN['count3'] = TRAIN.groupby(['tw_tweet'])['dt_dow'].transform('count')
TRAIN['count4'] = TRAIN.groupby(['a_user_id','b_user_id'])['dt_dow'].transform('count')

In [32]:
TRAIN.head()

Unnamed: 0,tweet_id_org,b_user_id_org,text,media,tweet_type,language,timestamp,a_user_id,a_user_id32,a_follower_count,...,counts_b_user_id,t0,t1,t2,t3,t4,count1,count2,count3,count4
0,485F9AEEEAC7EBBA551559F8CC54F6BD,52F5145EACA1BFFB3C27A15B5BBF1B80,[unk] @officialbhafc have gone unbeaten in six...,1,0,0,1614463799,-6764329288916620177,-967136145,25133420,...,8.0,0.0,1.0,0.0,0.0,1.0,5218,5,29,1
1,812D8339F3A17029895574D676B3C318,8314F96484A42129EDF2B3FAA47AE1D6,น่ากิงงงงงงง httpauux1z6zmt,1,0,6,1614758151,-4743724345997035831,880375497,13720,...,26.0,0.0,1.0,0.0,0.0,4.0,32,4,1,1
2,0885AF9FBF48594DDE83C72ED36EE626,335546E0A79EB2C3161CC9671E8D7039,bonne st. valentin à tous les amoureux. #livei...,1,1,9,1614588841,8006651453309039477,1658434421,2231,...,141.0,3.0,0.0,0.0,2.0,11.0,1,12,1,1
3,CDEC74B963F22F251CB077B26D5A1F3D,5956FADAD7AEE62F02E09DD062097ACA,proof that fear is a learned behavior. httplst...,1,1,0,1614293543,-3093655680362614616,-1298272088,1678,...,14.0,0.0,0.0,0.0,2.0,4.0,2,3,9,1
4,18C2A7C51CA75D7858A781BA3982DF8E,3D1E5AAA0BE2D98307D95A448C57F004,2021 / 3 / 1 ( 月 ) から 、 『 逆 転 オセロニア 』 をお 楽 しみい...,1,0,1,1614394801,7257328485927712599,-1777633449,272256,...,14.0,,,,,,14,2,3,1


In [33]:
train0 = TRAIN.loc[TRAIN.fold!=0].copy()
train1 = TRAIN.loc[TRAIN.fold==0].copy()

train0.shape, train1.shape

((11569408, 137), (2892352, 137))

In [35]:
feature_s1 = [
    
    'reply_0',
    'retweet_0',
    'retweet_comment_0',
    'like_0',

    'reply_1',
    'retweet_1',
    'retweet_comment_1',
    'like_1',

    'reply_2',
    'retweet_2',
    'retweet_comment_2',
    'like_2',

    'reply_3',
    'retweet_3',
    'retweet_comment_3',
    'like_3',

    'reply_4',
    'retweet_4',
    'retweet_comment_4',
    'like_4',

    'reply_5',
    'retweet_5',
    'retweet_comment_5',
    'like_5',
    
    'count1', 'count2', 'count3', 'count4',
    
   'sums_te_reply-a_user_id-tweet_type',
   'counts_te_reply-a_user_id-tweet_type',
   'sums_te_retweet-a_user_id-tweet_type',
   'sums_te_retweet_comment-a_user_id-tweet_type',
   'sums_te_like-a_user_id-tweet_type',
   'sums_te_reply-b_user_id-tweet_type',
   'counts_te_reply-b_user_id-tweet_type',
   'sums_te_retweet-b_user_id-tweet_type',
   'sums_te_retweet_comment-b_user_id-tweet_type',
   'sums_te_like-b_user_id-tweet_type', 'multi_reply', 'multi_retweet',
   'multi_retweet_comment', 'multi_like', 'multi_counts', 'ouser0_reply',
   'ouser0_retweet', 'ouser0_retweet_comment', 'ouser0_like',
   'ouser0_counts', 'ouser1_reply', 'ouser1_retweet',
   'ouser1_retweet_comment', 'ouser1_like', 'ouser1_counts', 'word_reply',
   'word_retweet', 'word_retweet_comment', 'word_like', 'word_counts',
   'rtuser0_reply', 'rtuser0_retweet', 'rtuser0_retweet_comment',
   'rtuser0_like', 'rtuser0_counts', 'follow_reply', 'follow_retweet',
   'follow_retweet_comment', 'follow_like', 'follow_counts',
   'sums_b_user_reply', 'sums_b_user_retweet',
   'sums_b_user_retweet_comment', 'sums_b_user_like', 'counts_b_user_id',
   't0', 't1', 't2', 't3', 't4',    
    
    'ab_count',
    
]
features = features_base + feature_s1
features

['media',
 'tweet_type',
 'language',
 'a_follower_count',
 'a_following_count',
 'a_is_verified',
 'a_account_creation',
 'b_follower_count',
 'b_following_count',
 'b_is_verified',
 'b_account_creation',
 'b_follows_a',
 'tw_len_token',
 'tw_len_media',
 'tw_len_photo',
 'tw_len_video',
 'tw_len_gif',
 'tw_len_quest',
 'tw_count_capital_words',
 'tw_count_excl_quest_marks',
 'tw_count_special1',
 'tw_count_hash',
 'tw_last_quest',
 'tw_len_retweet',
 'tw_len_rt',
 'tw_count_at',
 'tw_count_words',
 'tw_count_char',
 'tw_rt_count_words',
 'tw_rt_count_char',
 'dt_dow',
 'dt_minute',
 'len_hashtags',
 'len_links',
 'len_domains',
 'decline',
 'reply_0',
 'retweet_0',
 'retweet_comment_0',
 'like_0',
 'reply_1',
 'retweet_1',
 'retweet_comment_1',
 'like_1',
 'reply_2',
 'retweet_2',
 'retweet_comment_2',
 'like_2',
 'reply_3',
 'retweet_3',
 'retweet_comment_3',
 'like_3',
 'reply_4',
 'retweet_4',
 'retweet_comment_4',
 'like_4',
 'reply_5',
 'retweet_5',
 'retweet_comment_5',
 'like_

In [69]:
def calc_rce(df, target='reply'):
    SC = []
    for g in range(5):
        SC.append( compute_rce_fast(df.loc[df.group==g,'y_'+target].values,  df.loc[df.group==g,target].values ) )    
    print(SC)
    return np.mean(SC)

def calc_ap(df, target='reply'):
    SC = []
    for g in range(5):
        SC.append( average_precision_score(df.loc[df.group==g,target].values,  df.loc[df.group==g,'y_'+target].values ) )    
    print(SC)
    return np.mean(SC)

LR = {
    'reply': 0.0375,
    'retweet': 0.063,
    'retweet_comment': 0.021,
    'like': 0.113625,
}

fold = 0

SC1 = []
SC2 = []
for target in ['reply', 'retweet', 'retweet_comment', 'like']:
    print('TARGET:', target )
    xgb_parms = { 
        'max_depth': 9, 
        'learning_rate': LR[target]/1, 
        'subsample':0.50,
        #'sampling_method': 'gradient_based',
        'max_bin': 256,
        'colsample_bytree':0.80,
        'grow_policy ': 'depthwise', #'depthwise', 'lossguide'
        'eval_metric':['logloss','auc'],
        'objective':'binary:logistic',
        'nthread': 1,
        'tree_method':'gpu_hist',
        'predictor' : 'gpu_predictor',
    }

    ROUND = []
    #for group in range(5):
    start = time.time();

    #W = train0['group'].values.copy()
    #grp, cnt = np.unique(W,return_counts=True)
    #cnt = cnt[0] / cnt
    #cnt = np.log1p(cnt)
    #W = cnt[W]
    #ind = train0['positive'].values == 0
    #W[ind] = W[ind]/5.
    #print( np.unique(W, return_counts=True) )

    #FW = np.ones(len(features)) 
    #FW[:30] = 2
    #FW[-5:] = 2
    #FW = FW / 2.
    #print( np.unique(FW, return_counts=True) )

    dtrain = xgb.DMatrix(data=TRAIN[features], label=TRAIN[target] )#, weight=W  )
    #dtrain.set_info(feature_weights=FW)
    dvalid = xgb.DMatrix(data=train1[features], label=train1[target] )#, weight=W  )
    gc.collect();gc.collect();gc.collect();
    print( time.time() - start, 's' )

    start = time.time(); print('Training...')
    model = xgb.train(
        xgb_parms, 
        dtrain=dtrain,
        evals=[(dvalid,'valid')],
        num_boost_round=1000,
        #early_stopping_rounds=100,
        verbose_eval=10,
    )
    train1['y_'+target] = model.predict(dvalid)

    sc1 = calc_rce(train1, target)
    sc2 = calc_ap(train1, target)
    print( 'RCE:', sc1, sc2 )
    SC1.append(sc1)
    SC2.append(sc2)

    #sc2 = compute_rce_fast(ypred1, train1[target].values )
    #ap2 = average_precision_score(train1[target].values, ypred1)
    #SC1.append(sc2)
    #SC2.append(ap2)
    # ROUND.append(model.best_iteration)
    # print( model.best_iteration, 'RCE:', sc1, sc2, ap1, ap2, int(time.time() - start), 's' )
    # print()
    pickle.dump(model, open('giba_xgbmodels/model_' + str(target) + '_' + str(fold) + '.pickle', 'wb'))

    # del dtrain, dvalid2, model
    gc.collect()
    #break

    #print('RCE:', np.mean(SC1), np.mean(SC2) )
    print()
    #print()
print(np.sum(SC1), np.sum(SC2))

TARGET: reply
41.01070284843445 s
Training...
Parameters: { grow_policy  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	valid-logloss:0.65903	valid-auc:0.87341
[10]	valid-logloss:0.42102	valid-auc:0.87938
[20]	valid-logloss:0.28948	valid-auc:0.88024
[30]	valid-logloss:0.21065	valid-auc:0.88096
[40]	valid-logloss:0.16152	valid-auc:0.88141
[50]	valid-logloss:0.13028	valid-auc:0.88190
[60]	valid-logloss:0.11023	valid-auc:0.88261
[70]	valid-logloss:0.09731	valid-auc:0.88315
[80]	valid-logloss:0.08900	valid-auc:0.88369
[90]	valid-logloss:0.08367	valid-auc:0.88431
[100]	valid-logloss:0.08026	valid-auc:0.88505
[110]	valid-logloss:0.07808	valid-auc:0.88572
[120]	valid-logloss:0.07669	valid-auc:0.88646
[130]	valid-logloss:0.07580	valid-auc:0.88716
[140]	valid-logloss:0.07521	valid

[540]	valid-logloss:0.19141	valid-auc:0.88571
[550]	valid-logloss:0.19126	valid-auc:0.88595
[560]	valid-logloss:0.19112	valid-auc:0.88617
[570]	valid-logloss:0.19099	valid-auc:0.88640
[580]	valid-logloss:0.19085	valid-auc:0.88661
[590]	valid-logloss:0.19073	valid-auc:0.88680
[600]	valid-logloss:0.19062	valid-auc:0.88701
[610]	valid-logloss:0.19049	valid-auc:0.88722
[620]	valid-logloss:0.19037	valid-auc:0.88740
[630]	valid-logloss:0.19025	valid-auc:0.88760
[640]	valid-logloss:0.19012	valid-auc:0.88781
[650]	valid-logloss:0.18999	valid-auc:0.88802
[660]	valid-logloss:0.18986	valid-auc:0.88822
[670]	valid-logloss:0.18974	valid-auc:0.88841
[680]	valid-logloss:0.18963	valid-auc:0.88860
[690]	valid-logloss:0.18952	valid-auc:0.88879
[700]	valid-logloss:0.18939	valid-auc:0.88900
[710]	valid-logloss:0.18927	valid-auc:0.88918
[720]	valid-logloss:0.18916	valid-auc:0.88937
[730]	valid-logloss:0.18905	valid-auc:0.88954
[740]	valid-logloss:0.18894	valid-auc:0.88972
[750]	valid-logloss:0.18881	valid-

[10]	valid-logloss:0.55400	valid-auc:0.79548
[20]	valid-logloss:0.53174	valid-auc:0.80002
[30]	valid-logloss:0.52466	valid-auc:0.80326
[40]	valid-logloss:0.52080	valid-auc:0.80593
[50]	valid-logloss:0.51840	valid-auc:0.80779
[60]	valid-logloss:0.51688	valid-auc:0.80904
[70]	valid-logloss:0.51558	valid-auc:0.81012
[80]	valid-logloss:0.51452	valid-auc:0.81100
[90]	valid-logloss:0.51353	valid-auc:0.81183
[100]	valid-logloss:0.51274	valid-auc:0.81251
[110]	valid-logloss:0.51199	valid-auc:0.81316
[120]	valid-logloss:0.51130	valid-auc:0.81374
[130]	valid-logloss:0.51075	valid-auc:0.81420
[140]	valid-logloss:0.51010	valid-auc:0.81474
[150]	valid-logloss:0.50963	valid-auc:0.81513
[160]	valid-logloss:0.50915	valid-auc:0.81553
[170]	valid-logloss:0.50865	valid-auc:0.81597
[180]	valid-logloss:0.50813	valid-auc:0.81640
[190]	valid-logloss:0.50766	valid-auc:0.81679
[200]	valid-logloss:0.50723	valid-auc:0.81717
[210]	valid-logloss:0.50681	valid-auc:0.81751
[220]	valid-logloss:0.50645	valid-auc:0.817

In [70]:
imp = pd.DataFrame().from_dict(model.get_score(importance_type='gain'), orient='index').sort_values(0, ascending=False).reset_index()
imp.head(40)

Unnamed: 0,index,0
0,like_3,684.461742
1,like_0,171.301767
2,count4,99.360281
3,like_2,79.623487
4,count2,73.22849
5,like_5,73.197987
6,like_1,57.194317
7,like_4,48.860465
8,count3,48.495351
9,tw_rt_count_char,39.580495


In [71]:
imp.tail(20)

Unnamed: 0,index,0
95,retweet_comment_3,9.336818
96,reply_3,9.334119
97,ouser1_counts,9.281841
98,retweet_comment_0,9.251026
99,sums_te_reply-b_user_id-tweet_type,9.197635
100,tw_count_words,9.136538
101,tw_count_excl_quest_marks,9.120641
102,tw_count_special1,9.080822
103,sums_b_user_retweet_comment,9.016255
104,retweet_comment_2,8.981002
