In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

from datetime import datetime

In [2]:
# Collect every parquet file in the input directory; glob returns them in
# arbitrary order.
files = glob.glob('/raid/recsys2021_pre_validXGB_TE/*.parquet')

In [3]:
# Show the discovered file paths.
files

['/raid/recsys2021_pre_validXGB_TE/2_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/3_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/4_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/1_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/5_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/6_part-00000_1.parquet',
 '/raid/recsys2021_pre_validXGB_TE/0_part-00000_1.parquet']

In [4]:
import os, time
# Expose GPUs 0 through 7 to CUDA for this process and the workers it spawns.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in range(8))

In [5]:
import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

In [6]:
# Start a local multi-GPU Dask cluster and connect a client to it.
# Worker scratch/spill files go to local_directory.
# NOTE(review): device_memory_limit=0.4 is presumably the fraction of GPU
# memory at which workers start spilling to host -- confirm against dask-cuda docs.
cluster = LocalCUDACluster(
    local_directory='/raid/dask9432236/',
    device_memory_limit=0.4,
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43825  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [7]:
# Open all files as a single dask_cudf DataFrame; the paths are sorted so the
# partition order does not depend on glob's arbitrary ordering.
df = dask_cudf.read_parquet(sorted(files))

In [8]:
# Follower-count thresholds that split rows into five buckets (0..4).
quantiles = [240,  588, 1331, 3996]

# 'group' ends up as the number of thresholds that a_follower_count exceeds.
df['group'] = 0
for quant in quantiles:
    exceeds = (df['a_follower_count'] > quant).astype('int8')
    df['group'] = (df['group'] + exceeds).astype('int8')

In [9]:
# Hold out one fold for validation; every other fold is used for training.
fold = 0
in_fold = df['folds'] == fold
train = df[~in_fold]
valid = df[in_fold]
del df

In [10]:
# Persist both splits on the cluster so later cells reuse the materialised
# partitions instead of re-running the read/filter graph.
train, valid = dask.persist(train, valid)

In [11]:
# Row counts of the training and validation splits (each .compute() runs on the cluster).
print(train.shape[0].compute(), valid.shape[0].compute())

11569644 2892116


In [12]:
# Target columns, in the order used for every label matrix in this notebook.
label_names = ['reply', 'retweet', 'retweet_comment', 'like']

# Columns that are never fed to the model, followed by the labels themselves.
DONT_USE = [
    'timestamp', 'a_account_creation', 'b_account_creation', 'engage_time', 'folds',
    'fold', 'b_user_id', 'a_user_id', 'a_account_creation',
    'b_account_creation', 'elapsed_time', 'links', 'domains', 'hashtags', 'id', 'date', 'is_train',
    'tw_original_http0', 'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
    'tw_rt_count_char', 'tw_rt_count_words', 'tw_rt_user0', 'tw_tweet', 'tw_word0',
    'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4', 'tw_count_hash', 'dt_second',
] + label_names

features = [c for c in train.columns if c not in DONT_USE]

# Excluded columns that are actually present and are not labels; the set
# removes the names that appear twice in DONT_USE.
RMV = list({
    c for c in DONT_USE
    if c in train.columns and c not in label_names
})

In [13]:
%%time

# Remove the excluded non-label columns from the training split.
train = train.drop(RMV,axis=1)
# NOTE(review): after drop() `train` is a lazy collection again; wait() appears
# to see only the already-finished persisted partitions (see the 'getitem-*'
# keys in the output), so the timing does not cover the drop -- confirm whether
# a persist() was intended here.
wait(train)

CPU times: user 23.4 ms, sys: 1.67 ms, total: 25.1 ms
Wall time: 23.7 ms


DoneAndNotDoneFutures(done={<Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 6)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 4)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 0)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 2)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 1)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 5)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-a35b30f41b85e0475175c6ac82bdbfea', 3)>}, not_done=set())

In [14]:
%%time

# Remove the excluded non-label columns from the validation split.
valid = valid.drop(RMV,axis=1)
# NOTE(review): after drop() `valid` is a lazy collection again; wait() appears
# to see only the already-finished persisted partitions (see the 'getitem-*'
# keys in the output), so the timing does not cover the drop -- confirm whether
# a persist() was intended here.
wait(valid)

CPU times: user 20.1 ms, sys: 2.9 ms, total: 23 ms
Wall time: 21.5 ms


DoneAndNotDoneFutures(done={<Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 2)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 5)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 0)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 6)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 1)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 3)>, <Future: finished, type: cudf.DataFrame, key: ('getitem-1d6ffe1f3b9cadf7c8849f1c0d1282d8', 4)>}, not_done=set())

In [15]:
# Split the label columns into their own persisted frame.
Y_train = train[label_names]
Y_train, = dask.persist(Y_train)
# Not the last expression of the cell, so this head() is not displayed.
Y_train.head()    
    
# Remove the tweet id and the labels from the feature frame and persist it.
train = train.drop(['tweet_id']+label_names,axis=1)
train, = dask.persist(train)
train.head()


# Recompute the usable feature list from the remaining columns and report
# its length next to the total column count.
features = [c for c in train.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

Using 115 features: 115


array(['BNN_like', 'BNN_reply', 'BNN_retweet', 'BNN_retweet_comment',
       'BONN_like', 'BONN_reply', 'BONN_retweet', 'BONN_retweet_comment',
       'CE_valid_b_user_id', 'CNN_like', 'CNN_reply', 'CNN_retweet',
       'CNN_retweet_comment', 'CXGB_like', 'CXGB_reply', 'CXGB_retweet',
       'CXGB_retweet_comment', 'GXGB_like', 'GXGB_reply', 'GXGB_retweet',
       'GXGB_retweet_comment', 'TE_a_user_id_like', 'TE_a_user_id_reply',
       'TE_a_user_id_retweet', 'TE_a_user_id_retweet_comment',
       'TE_b_is_verified_tweet_type_like',
       'TE_b_is_verified_tweet_type_reply',
       'TE_b_is_verified_tweet_type_retweet',
       'TE_b_is_verified_tweet_type_retweet_comment',
       'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply',
       'TE_b_user_id_a_user_id_retweet',
       'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like',
       'TE_b_user_id_reply', 'TE_b_user_id_retweet',
       'TE_b_user_id_retweet_comment',
       'TE_b_user_id_tweet_type_language_li

In [16]:
# Split the label columns into their own persisted frame.
Y_valid = valid[label_names]
Y_valid, = dask.persist(Y_valid)
# Not the last expression of the cell, so this head() is not displayed.
Y_valid.head()    
    
# Remove the tweet id and the labels from the feature frame and persist it.
valid = valid.drop(['tweet_id']+label_names,axis=1)
valid, = dask.persist(valid)
valid.head()


# Recompute the usable feature list from the validation columns and report
# its length next to the total column count.
features = [c for c in valid.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),valid.shape[1])
np.asarray(features)

Using 115 features: 115


array(['BNN_like', 'BNN_reply', 'BNN_retweet', 'BNN_retweet_comment',
       'BONN_like', 'BONN_reply', 'BONN_retweet', 'BONN_retweet_comment',
       'CE_valid_b_user_id', 'CNN_like', 'CNN_reply', 'CNN_retweet',
       'CNN_retweet_comment', 'CXGB_like', 'CXGB_reply', 'CXGB_retweet',
       'CXGB_retweet_comment', 'GXGB_like', 'GXGB_reply', 'GXGB_retweet',
       'GXGB_retweet_comment', 'TE_a_user_id_like', 'TE_a_user_id_reply',
       'TE_a_user_id_retweet', 'TE_a_user_id_retweet_comment',
       'TE_b_is_verified_tweet_type_like',
       'TE_b_is_verified_tweet_type_reply',
       'TE_b_is_verified_tweet_type_retweet',
       'TE_b_is_verified_tweet_type_retweet_comment',
       'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply',
       'TE_b_user_id_a_user_id_retweet',
       'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like',
       'TE_b_user_id_reply', 'TE_b_user_id_retweet',
       'TE_b_user_id_retweet_comment',
       'TE_b_user_id_tweet_type_language_li

In [17]:
import xgboost as xgb
print('XGB Version',xgb.__version__)

# Booster parameters shared by all four per-target models.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

XGB Version 1.1.0


In [18]:
# Cast boolean columns to int8 in both splits. Column names are taken from
# `train`, so `valid` is expected to carry the same columns.
for col in train.columns:
    for frame in (train, valid):
        if frame[col].dtype=='bool':
            frame[col] = frame[col].astype('int8')

# Persist the converted frames and preview the training features.
train, = dask.persist(train)
valid, = dask.persist(valid)
train.head()

Unnamed: 0,BNN_like,BNN_reply,BNN_retweet,BNN_retweet_comment,BONN_like,BONN_reply,BONN_retweet,BONN_retweet_comment,CE_valid_b_user_id,CNN_like,...,tw_last_quest,tw_len_gif,tw_len_media,tw_len_photo,tw_len_quest,tw_len_retweet,tw_len_rt,tw_len_token,tw_len_video,tweet_type
0,0.19756,0.000592,0.077181,0.001297,0.191536,0.001118,0.056649,0.002616,2,0.162896,...,0,0,1,1,0,0,0,72,0,1
1,0.346887,0.001559,0.152914,0.009203,0.18148,0.001696,0.36799,0.01589,1,0.417949,...,0,0,0,0,0,0,0,21,0,1
2,0.513477,0.009458,0.011105,0.002888,0.555044,0.015418,0.006097,0.000746,2,0.583672,...,0,0,0,0,0,0,0,65,0,0
3,0.089164,0.002034,0.332557,0.004156,0.111137,0.002138,0.428232,0.007026,1,0.120333,...,0,0,2,2,0,0,0,69,0,1
4,0.115686,0.00223,0.030737,0.003861,0.794731,0.010645,0.080189,0.012284,13,0.114635,...,0,0,0,0,0,0,0,35,0,1


In [19]:
# Positions in label_names of the targets to train and score.
models_index = [0,1,2,3]

In [20]:
def get_colnames(train, name, targets=None):
    """Return the ordered list of feature columns used to train a stacked model.

    The list is assembled in four steps:

    1. take every column of ``train`` except ``group`` / ``quantile`` and
       anything whose name contains ``NN_``, ``XGB``, ``a_follows_b``,
       ``TE_valid`` or ``_switch_``;
    2. append the first-stage prediction columns (``CNN_``, ``CXGB_``,
       ``BONN_``, ``XGB_``, ``BNN_``, ``GXGB_`` prefixes) for every target;
    3. remove a fixed set of count-encoding / timestamp columns;
    4. append a fixed set of ``TE_valid_*`` / ``TE_switch_*`` target-encoding
       columns plus ``CE_valid_b_user_id``.

    Parameters
    ----------
    train : object with a ``columns`` attribute
        Frame whose column names seed the list.
    name : str
        Target name. Currently unused (every target gets the same list); kept
        so existing callers keep working.
    targets : list of str, optional
        Target names used to build the first-stage prediction column names.
        Defaults to the notebook-level ``label_names``.

    Returns
    -------
    list of str
        Column names in the order described above.
    """
    if targets is None:
        targets = label_names

    cols = list(train.columns)
    # The original compared the whole list (`cols != 'quantile'`), which is
    # always true; the per-column comparison is what actually excludes it.
    cols = [col for col in cols if col != 'group' and col != 'quantile']
    cols = [col for col in cols if 'NN_' not in col and 'XGB' not in col]
    cols = [col for col in cols if 'a_follows_b' not in col]
    cols = [col for col in cols if 'TE_valid' not in col]
    cols = [col for col in cols if '_switch_' not in col]

    # First-stage model predictions, one column per (model family, target).
    for prefix in ('CNN_', 'CXGB_', 'BONN_', 'XGB_', 'BNN_', 'GXGB_'):
        cols = cols + [prefix + target for target in targets]

    excluded = [
        'TE_valid_tweet_id_like', 'TE_valid_tweet_id_reply',
        'TE_valid_tweet_id_retweet', 'TE_valid_tweet_id_retweet_comment',
        'CE_a_user_id', 'CE_b_user_id', 'CE_switch_a_user_id',
        'CE_switch_b_user_id', 'CE_switch_valid_a_user_id',
        'CE_switch_valid_b_user_id', 'CE_valid_a_user_id',
        'CE_valid_b_user_id', 'CE_valid_tweet_id', 'b_timestamp_-1', 'b_timestamp_1',
    ]
    cols = [col for col in cols if col not in excluded]

    # Columns that were filtered out above by substring but are wanted again,
    # appended at the end in a fixed order.
    cols = cols + [
        'TE_valid_a_user_id_like', 'TE_valid_a_user_id_reply',
        'TE_valid_a_user_id_retweet', 'TE_valid_a_user_id_retweet_comment',
        'TE_valid_b_user_id_like', 'TE_valid_b_user_id_reply',
        'TE_valid_b_user_id_retweet', 'TE_valid_b_user_id_retweet_comment',
        'TE_switch_a_user_id_like', 'TE_switch_a_user_id_reply',
        'TE_switch_a_user_id_retweet', 'TE_switch_a_user_id_retweet_comment',
        'TE_switch_b_user_id_like', 'TE_switch_b_user_id_reply',
        'TE_switch_b_user_id_retweet', 'TE_switch_b_user_id_retweet_comment',
        'TE_valid_tweet_id_like', 'TE_valid_tweet_id_reply',
        'TE_valid_tweet_id_retweet', 'TE_valid_tweet_id_retweet_comment',
        'CE_valid_b_user_id',
    ]
    return cols

In [21]:
# Preview the feature list; get_colnames does not use its name argument, so
# the result is the same for every target.
cols = get_colnames(train, 'retweet')

In [22]:
# Show the selected feature columns.
cols

['TE_a_user_id_like',
 'TE_a_user_id_reply',
 'TE_a_user_id_retweet',
 'TE_a_user_id_retweet_comment',
 'TE_b_is_verified_tweet_type_like',
 'TE_b_is_verified_tweet_type_reply',
 'TE_b_is_verified_tweet_type_retweet',
 'TE_b_is_verified_tweet_type_retweet_comment',
 'TE_b_user_id_a_user_id_like',
 'TE_b_user_id_a_user_id_reply',
 'TE_b_user_id_a_user_id_retweet',
 'TE_b_user_id_a_user_id_retweet_comment',
 'TE_b_user_id_like',
 'TE_b_user_id_reply',
 'TE_b_user_id_retweet',
 'TE_b_user_id_retweet_comment',
 'TE_b_user_id_tweet_type_language_like',
 'TE_b_user_id_tweet_type_language_reply',
 'TE_b_user_id_tweet_type_language_retweet',
 'TE_b_user_id_tweet_type_language_retweet_comment',
 'TE_tw_original_user0_tweet_type_language_like',
 'TE_tw_original_user0_tweet_type_language_reply',
 'TE_tw_original_user0_tweet_type_language_retweet',
 'TE_tw_original_user0_tweet_type_language_retweet_comment',
 'TE_tw_original_user1_tweet_type_language_like',
 'TE_tw_original_user1_tweet_type_langua

In [23]:
# Show the selected feature columns (same display as the previous cell).
cols

['TE_a_user_id_like',
 'TE_a_user_id_reply',
 'TE_a_user_id_retweet',
 'TE_a_user_id_retweet_comment',
 'TE_b_is_verified_tweet_type_like',
 'TE_b_is_verified_tweet_type_reply',
 'TE_b_is_verified_tweet_type_retweet',
 'TE_b_is_verified_tweet_type_retweet_comment',
 'TE_b_user_id_a_user_id_like',
 'TE_b_user_id_a_user_id_reply',
 'TE_b_user_id_a_user_id_retweet',
 'TE_b_user_id_a_user_id_retweet_comment',
 'TE_b_user_id_like',
 'TE_b_user_id_reply',
 'TE_b_user_id_retweet',
 'TE_b_user_id_retweet_comment',
 'TE_b_user_id_tweet_type_language_like',
 'TE_b_user_id_tweet_type_language_reply',
 'TE_b_user_id_tweet_type_language_retweet',
 'TE_b_user_id_tweet_type_language_retweet_comment',
 'TE_tw_original_user0_tweet_type_language_like',
 'TE_tw_original_user0_tweet_type_language_reply',
 'TE_tw_original_user0_tweet_type_language_retweet',
 'TE_tw_original_user0_tweet_type_language_retweet_comment',
 'TE_tw_original_user1_tweet_type_language_like',
 'TE_tw_original_user1_tweet_type_langua

In [24]:
# Start from an empty output directory. -f keeps rm quiet when the directory
# does not exist yet (first run); -p keeps mkdir from failing if it does.
!rm -rf models_stacked_r0
!mkdir -p models_stacked_r0

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
rm: cannot remove 'models_stacked_r0': No such file or directory
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [25]:
import pickle

In [26]:
%%time

import time

# Training configuration.
NROUND = 601          # maximum number of boosting rounds
VERBOSE_EVAL = 50     # logging interval, also used to subsample the history below
ESR = 50              # early-stopping patience in rounds
preds_out = []        # one list of per-group predictions per trained target
best_trees_out = []
best_trees = []       # best_ntree_limit of each trained booster

for i in models_index:
    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
    cols = get_colnames(train, name)
    print(cols)

    print('Creating DMatrix...')
    preds = []

    dtrain = xgb.dask.DaskDMatrix(client,data=train[cols].values,label=Y_train.iloc[:, i])
    dvalid = xgb.dask.DaskDMatrix(client,data=valid[cols].values,label=Y_valid.iloc[:, i])

    start = time.time(); print('Training...')
    model = xgb.dask.train(client, xgb_parms,
                           dtrain=dtrain,
                           evals=[(dvalid,'valid')],
                           num_boost_round=NROUND,
                           early_stopping_rounds=ESR,
                           verbose_eval=VERBOSE_EVAL
                          )
    print('Took %.1f seconds'%(time.time()-start))

    # Re-print the validation log-loss every VERBOSE_EVAL rounds from the history.
    logloss_valid = model["history"]['valid']['logloss'][::VERBOSE_EVAL]
    for ik in range(len(logloss_valid)):
        print(f"{str(VERBOSE_EVAL*ik).zfill(4)} valid-logloss:{logloss_valid[ik]:.5f}")

    # Start the timer before predicting; it used to start after the loop and
    # therefore always reported 0.0 seconds.
    # NOTE(review): the predictions are only materialised later via .compute(),
    # so this presumably still measures graph construction only -- confirm.
    start = time.time(); print('Predicting...')
    for groupid in [0,1,2,3,4]:
        preds.append(xgb.dask.predict(client,model,valid[valid['group']==groupid][cols]))
    print('Took %.1f seconds'%(time.time()-start))

    best_trees.append(model['booster'].best_ntree_limit)
    # Context manager so the pickle file is flushed and closed deterministically.
    with open('./models_stacked_r0/model_' + str(name) + '.pickle', 'wb') as model_file:
        pickle.dump(model, model_file)
    preds_out.append(preds)

#########################
### reply
#########################
['TE_a_user_id_like', 'TE_a_user_id_reply', 'TE_a_user_id_retweet', 'TE_a_user_id_retweet_comment', 'TE_b_is_verified_tweet_type_like', 'TE_b_is_verified_tweet_type_reply', 'TE_b_is_verified_tweet_type_retweet', 'TE_b_is_verified_tweet_type_retweet_comment', 'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply', 'TE_b_user_id_a_user_id_retweet', 'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like', 'TE_b_user_id_reply', 'TE_b_user_id_retweet', 'TE_b_user_id_retweet_comment', 'TE_b_user_id_tweet_type_language_like', 'TE_b_user_id_tweet_type_language_reply', 'TE_b_user_id_tweet_type_language_retweet', 'TE_b_user_id_tweet_type_language_retweet_comment', 'TE_tw_original_user0_tweet_type_language_like', 'TE_tw_original_user0_tweet_type_language_reply', 'TE_tw_original_user0_tweet_type_language_retweet', 'TE_tw_original_user0_tweet_type_language_retweet_comment', 'TE_tw_original_user1_tweet_type_language_like', 'T

  [<function predict.<locals>.mapped_predict at 0x7f ... titions>, True]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


Predicting...
Took 0.0 seconds
#########################
### retweet
#########################
['TE_a_user_id_like', 'TE_a_user_id_reply', 'TE_a_user_id_retweet', 'TE_a_user_id_retweet_comment', 'TE_b_is_verified_tweet_type_like', 'TE_b_is_verified_tweet_type_reply', 'TE_b_is_verified_tweet_type_retweet', 'TE_b_is_verified_tweet_type_retweet_comment', 'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply', 'TE_b_user_id_a_user_id_retweet', 'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like', 'TE_b_user_id_reply', 'TE_b_user_id_retweet', 'TE_b_user_id_retweet_comment', 'TE_b_user_id_tweet_type_language_like', 'TE_b_user_id_tweet_type_language_reply', 'TE_b_user_id_tweet_type_language_retweet', 'TE_b_user_id_tweet_type_language_retweet_comment', 'TE_tw_original_user0_tweet_type_language_like', 'TE_tw_original_user0_tweet_type_language_reply', 'TE_tw_original_user0_tweet_type_language_retweet', 'TE_tw_original_user0_tweet_type_language_retweet_comment', 'TE_tw_original_us

Training...
Took 59.2 seconds
0000 valid-logloss:0.66989
0050 valid-logloss:0.51557
0100 valid-logloss:0.50993
0150 valid-logloss:0.50755
0200 valid-logloss:0.50628
0250 valid-logloss:0.50572
0300 valid-logloss:0.50509
0350 valid-logloss:0.50463
0400 valid-logloss:0.50432
0450 valid-logloss:0.50408
0500 valid-logloss:0.50389
0550 valid-logloss:0.50371
0600 valid-logloss:0.50361
Predicting...
Took 0.0 seconds
CPU times: user 16.7 s, sys: 4.67 s, total: 21.3 s
Wall time: 3min 14s


In [27]:
# Show the best_ntree_limit recorded for each trained target.
best_trees

[273, 436, 184, 601]

In [28]:
%%time

import cupy

# Group id of every validation row, concatenated group by group (0..4).
yquantile = cupy.concatenate([
    valid[valid['group']==i]['group'].values.compute()
    for i in range(5)
])

# Predictions: one row per trained target, each concatenated in the same
# group order, then transposed to (rows, targets).
oof = [
    cupy.concatenate([cupy.array(part.values.compute()) for part in preds])
    for preds in preds_out
]
oof = cupy.asarray(oof).T

# Ground-truth labels in the same group-by-group row order.
yvalid = cupy.concatenate([
    Y_valid[valid['group']==i][label_names].values.compute()
    for i in range(5)
])

CPU times: user 4.02 s, sys: 3.39 s, total: 7.4 s
Wall time: 18.4 s


In [29]:
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score

def display_score(rce,ap):
    """Print a per-group table of AP and RCE for the four engagement types.

    Parameters
    ----------
    rce, ap : dict
        Map each of 'retweet', 'reply', 'like' and 'retweet_comment' to a
        sequence of scores indexed by group id 0..4.

    Prints one row per group, a row of per-type means, and the sums of the
    mean AP and mean RCE over the four types. Returns None.
    """
    engage_types = ('retweet', 'reply', 'like', 'retweet_comment')

    print('Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment')
    for group_id in range(5):
        cells = [f"{ap[kind][group_id]:10.4f}  {rce[kind][group_id]:10.4f}" for kind in engage_types]
        print(f'{group_id:9}      ' + ' '.join(cells))

    # Per-type means are computed once and reused for the average row and the sums.
    ap_means = [np.mean(list(ap[kind])) for kind in engage_types]
    rce_means = [np.mean(list(rce[kind])) for kind in engage_types]

    average_cells = [f"{a:10.4f}  {r:10.4f}" for a, r in zip(ap_means, rce_means)]
    print('     Average   ' + ' '.join(average_cells))
    print(' Sum AP: ' + str(np.sum(ap_means)))
    print(' Sum RCE: ' + str(np.sum(rce_means)))

def precision_recall_curve(y_true,y_pred):
    """Compute precision/recall pairs with cupy from scores and labels.

    Samples are ranked by descending ``y_pred``; precision and recall are the
    cumulative values along that ranking, returned in reversed order with a
    final (precision=1, recall=0) point appended.

    Parameters
    ----------
    y_true : array
        Labels; cast to float32. Presumably binary 0/1 cupy values -- TODO confirm.
    y_pred : array
        Scores, one per label.

    Returns
    -------
    (precision, recall, thresholds)
        ``thresholds`` are the scores in ascending order. Leading entries are
        trimmed so that only one point with recall == 1 is kept.

    NOTE(review): if no entry of the shifted recall equals 1 (n == 0), ``n-1``
    is -1 and the precision/recall slices keep only their last element --
    confirm this edge case is acceptable. All-zero labels make ``sum_one``
    zero, so recall would be a division by zero.
    """
    y_true = y_true.astype('float32')
    # Rank samples from highest to lowest score.
    ids = cupy.argsort(-y_pred) 
    y_true = y_true[ids]
    y_pred = y_pred[ids]
    # Thresholds are reported in ascending order.
    y_pred = cupy.flip(y_pred,axis=0)

    # Running count of positives within the top-k ranked samples.
    acc_one = cupy.cumsum(y_true)
    sum_one = cupy.sum(y_true)
    
    # Precision at k = positives in top-k / k, reversed, then shifted left by
    # one so the final slot can hold the conventional precision=1 end point.
    precision = cupy.flip(acc_one/cupy.cumsum(cupy.ones(len(y_true))),axis=0)
    precision[:-1] = precision[1:]
    precision[-1] = 1.

    # Recall at k, reversed and shifted the same way; end point is recall=0.
    recall = cupy.flip(acc_one/sum_one,axis=0)
    recall[:-1] = recall[1:]
    recall[-1] = 0
    # Number of points that already reach full recall.
    n = (recall==1).sum()
    
    return precision[n-1:],recall[n-1:],y_pred[n:]

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve of ``pred`` against ``gt``.

    The curve is built by ``precision_recall_curve`` with cupy, copied to host
    memory, and integrated with ``auc`` using recall as x and precision as y.
    """
    precision_gpu, recall_gpu, _thresholds = precision_recall_curve(gt, pred)
    recall_host = cupy.asnumpy(recall_gpu)
    precision_host = cupy.asnumpy(precision_gpu)
    return auc(recall_host, precision_host)

def log_loss(y_true,y_pred,eps=1e-15, normalize=True, sample_weight=None):
    """Cross-entropy of predicted probabilities against integer labels, on cupy.

    Parameters
    ----------
    y_true : array
        Class labels; cast to int32 and used as column indices.
    y_pred : array
        Predicted probabilities. A 1-D array or a single column is treated as
        the probability of class 1 and expanded to two columns.
    eps : float
        Probabilities are clipped to [eps, 1 - eps] before taking the log.
    normalize : bool
        If True return the (weighted) mean loss, otherwise the (weighted) sum.
    sample_weight : array, optional
        Per-sample weights.

    Returns
    -------
    float
        The aggregated loss as a Python scalar.
    """
    labels = y_true.astype('int32')
    probs = cupy.clip(y_pred, eps, 1 - eps)

    # Bring the probabilities into (n_samples, n_classes) layout.
    if probs.ndim == 1:
        probs = cupy.expand_dims(probs, axis=1)
    if probs.shape[1] == 1:
        probs = cupy.hstack([1 - probs, probs])

    # Renormalise each row so clipped probabilities sum to one again.
    probs /= cupy.sum(probs, axis=1, keepdims=True)

    # Negative log-probability assigned to the true class of every sample.
    row_index = cupy.arange(probs.shape[0])
    per_sample = -cupy.log(probs)[row_index, labels]
    return _weighted_sum(per_sample, sample_weight, normalize).item()

def _weighted_sum(sample_score, sample_weight, normalize):
    """Aggregate per-sample scores.

    Returns the weighted average when ``normalize`` is true; otherwise the dot
    product with ``sample_weight`` if weights are given, else the plain sum.
    """
    if normalize:
        return cupy.average(sample_score, weights=sample_weight)
    if sample_weight is None:
        return sample_score.sum()
    return cupy.dot(sample_score, sample_weight)

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    """Relative cross-entropy of ``pred`` against labels ``gt``, in percent.

    The model's log-loss is compared with the cross-entropy of a constant
    predictor that always outputs the mean of ``gt``; 0 means no better than
    that baseline, higher is better.
    """
    model_ce = log_loss(gt, pred)
    positive_rate = np.mean(gt)
    baseline_ce = -(positive_rate*np.log(positive_rate) + (1 - positive_rate)*np.log(1 - positive_rate))
    relative_gain = 1.0 - model_ce/baseline_ce
    return relative_gain*100.0

In [30]:
%%time

# Per-target lists of RCE / AP, one entry per follower-count group (0..4).
rce_output = {}
ap_output = {}
for i, ind in enumerate(models_index):
    rce_out = []
    ap_out = []
    for j in range(5):
        in_group = yquantile==j
        # yvalid has one column per entry of label_names, so it is indexed by
        # the label index `ind`; oof has one column per trained model, so it is
        # indexed by the position `i`. (Both coincide only when models_index
        # is [0, 1, 2, 3].)
        yvalid_tmp = yvalid[in_group][:, ind]
        oof_tmp = oof[in_group][:, i]
        rce   = compute_rce_fast(oof_tmp, yvalid_tmp).item()
        ap    = average_precision_score(cupy.asnumpy(yvalid_tmp), cupy.asnumpy(oof_tmp))
        rce_out.append(rce)
        ap_out.append(ap)
    rce_output[label_names[ind]] = rce_out
    ap_output[label_names[ind]] = ap_out

CPU times: user 4.68 s, sys: 419 ms, total: 5.1 s
Wall time: 4.93 s


In [31]:
%%time
# Print the per-group AP / RCE table for all four targets.
display_score(rce_output, ap_output)

Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.4716     28.8982     0.2439     24.5978     0.7032     21.5241     0.0800     16.9683
        1          0.4520     28.4857     0.2523     25.4400     0.6875     20.5299     0.0756     17.1390
        2          0.4430     28.2129     0.2857     27.1620     0.6809     20.5274     0.0595     15.4267
        3          0.4435     28.2402     0.3041     28.7867     0.6902     21.3869     0.0632     17.5072
        4          0.4369     30.1136     0.2239     26.2124     0.7688     27.3607     0.0667     19.0992
     Average       0.4494     28.7901     0.2620     26.4398     0.7061     22.2658     0.0690     17.2281
 Sum AP: 1.4865149147338435
 Sum RCE: 94.7237908094618
CPU times: user 2.31 ms, sys: 0 ns, total: 2.31 ms
Wall time: 1.78 ms


In [32]:
# Short aliases for the score dictionaries.
ap = ap_output
rce = rce_output

In [33]:
# One CSV-style summary line (sum AP, sum RCE, then mean AP / mean RCE per
# target), followed by the two sums on their own lines.
engage_types = ['retweet','reply','like','retweet_comment']
mean_ap = [np.mean(list(ap[engage_type])) for engage_type in engage_types]
mean_rce = [np.mean(list(rce[engage_type])) for engage_type in engage_types]
sum_ap = np.sum(mean_ap)
sum_rce = np.sum(mean_rce)
per_target = ','.join([f"{a:10.4f},  {r:10.4f}" for a, r in zip(mean_ap, mean_rce)])

print(str(sum_ap) + ',' + str(sum_rce) + ',' + per_target)
print(' Sum AP: ' + str(sum_ap))
print(' Sum RCE: ' + str(sum_rce))



1.4865149147338435,94.7237908094618,    0.4494,     28.7901,    0.2620,     26.4398,    0.7061,     22.2658,    0.0690,     17.2281
 Sum AP: 1.4865149147338435
 Sum RCE: 94.7237908094618


In [34]:
# Disconnect the Dask client now that all results have been computed.
client.close()